In [1]:
import sys
import os
import importlib
import pandas as pd
from statsmodels.tsa.stattools import adfuller

In [2]:
# Make the parent directory importable so that `import config` resolves.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import config
from config import RAW_DATA_DIR,PROCESSED_DATA_DIR   # Import the global path setup

# Read the raw macro CSV from the directory configured in config.py,
# using the "date" column as a parsed datetime index.
file_path = os.path.join(RAW_DATA_DIR, "macro_data_raw.csv")
df = pd.read_csv(
    file_path,
    parse_dates=True,
    index_col="date",
)



In [4]:
# Preview the loaded frame (the notebook renders a truncated head/tail view).
df

Unnamed: 0_level_0,CPIAUCSL,UNRATE,FEDFUNDS,INDPRO,M2SL,GDPC1,GDPPOT
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01,186.300,5.7,1.00,92.3268,6082.2,15248.680,15349.36860
2004-02-01,186.700,5.6,1.01,92.8995,6121.9,,
2004-03-01,187.100,5.8,1.00,92.5368,6158.0,,
2004-04-01,187.400,5.6,1.00,92.8957,6199.1,15366.850,15445.75949
2004-05-01,188.200,5.6,1.00,93.5845,6275.9,,
...,...,...,...,...,...,...,...
2024-09-01,314.851,4.1,5.13,102.5954,21252.4,,
2024-10-01,315.564,4.1,4.83,102.2669,21332.7,23536.293,23113.34000
2024-11-01,316.449,4.2,4.64,102.0231,21465.8,,
2024-12-01,317.603,4.1,4.48,103.1654,21549.3,,


In [5]:
# Re-index the frame onto a month-end ("ME") calendar, forward-filling from the source rows
df = df.resample("ME").ffill()

# GDPC1 and GDPPOT are treated as quarterly series: fill their gaps by linear interpolation
quarterly_vars = ["GDPC1", "GDPPOT"]
quarterly_filled = df[quarterly_vars].interpolate(method="linear")
df[quarterly_vars] = quarterly_filled

# Print the final twelve rows as a sanity check
print("Interpolation Applied to Quarterly Data:", df.tail(12), sep="\n")


Interpolation Applied to Quarterly Data:
            CPIAUCSL  UNRATE  FEDFUNDS    INDPRO     M2SL      GDPC1  \
date                                                                   
2024-02-29   311.022     3.9      5.33  102.7267  20800.7  23110.332   
2024-03-31   312.107     3.9      5.33  102.5186  20905.6  23167.119   
2024-04-30   313.016     3.9      5.33  102.3568  20931.6  23223.906   
2024-05-31   313.140     4.0      5.33  102.9797  21013.0  23282.702   
2024-06-30   313.131     4.1      5.33  103.2534  21079.2  23341.498   
2024-07-31   313.566     4.2      5.33  102.5192  21093.6  23400.294   
2024-08-31   314.131     4.2      5.33  103.0196  21182.5  23445.627   
2024-09-30   314.851     4.1      5.13  102.5954  21252.4  23490.960   
2024-10-31   315.564     4.1      4.83  102.2669  21332.7  23536.293   
2024-11-30   316.449     4.2      4.64  102.0231  21465.8  23536.293   
2024-12-31   317.603     4.1      4.48  103.1654  21549.3  23536.293   
2025-01-31   319.086   

In [6]:
# Augmented Dickey-Fuller (ADF) stationarity check for a single series
def check_stationarity(series):
    """Run the ADF test on `series` (NaNs dropped) and report the outcome.

    Prints the p-value labelled with `series.name` and returns True when the
    p-value is below 0.05, False otherwise.
    """
    p_value = adfuller(series.dropna())[1]  # element 1 of the ADF result is the p-value
    print(f"ADF Test for {series.name}: p-value = {p_value:.5f}")
    return p_value < 0.05

# Iterative differencing helper
def make_stationary(df, max_diffs=5):
    """Difference each column until it passes the ADF check or `max_diffs` is reached.

    Parameters
    ----------
    df : DataFrame
        Input series, one per column. The input frame is not modified; the
        function works on a copy.
    max_diffs : int, default 5
        Maximum number of differences applied to any single column.

    Returns
    -------
    (df_diff, diff_counts)
        df_diff     : the differenced copy of `df`.
        diff_counts : dict mapping each column name to the number of
                      differences that were applied to it.

    Notes
    -----
    After every differencing step all rows containing a NaN in ANY column are
    dropped, so the returned frame is shorter than the input and every column
    shares the same (trimmed) index.
    """
    df_diff = df.copy()
    diff_counts = {col: 0 for col in df.columns}
    still_non_stationary = []  # columns that exhausted max_diffs without passing

    for col in df.columns:
        while not check_stationarity(df_diff[col]):
            if diff_counts[col] >= max_diffs:
                # Give up on this column instead of differencing further.
                still_non_stationary.append(col)
                break
            df_diff[col] = df_diff[col].diff()
            diff_counts[col] += 1  # Track number of differences applied
            df_diff = df_diff.dropna()  # Drop the NaN row introduced by diff()

    # Only claim success when every column actually passed the test.
    if still_non_stationary:
        print(
            f"\nWarning: still non-stationary after {max_diffs} differences: "
            f"{still_non_stationary}"
        )
    else:
        print("\nAll series are now stationary!")
    print("Number of differences applied per series:", diff_counts)

    return df_diff, diff_counts


In [7]:
# Apply differencing to ensure stationarity
df_diff, diff_counts = make_stationary(df)

# Store the undifferenced levels at the last date present in df_diff.
# df_diff has fewer rows than df (rows are dropped while differencing), so the
# row must be looked up by date label; using a df_diff-based position on df
# would select an earlier date.
last_real_values = df.loc[df_diff.index[-1]]

# Make sure the output directory exists before writing into it
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Save last real values for reversing differencing later
last_real_values_path = os.path.join(PROCESSED_DATA_DIR, "last_real_values.csv")
last_real_values.to_csv(last_real_values_path)


ADF Test for CPIAUCSL: p-value = 0.99874
ADF Test for CPIAUCSL: p-value = 0.15430
ADF Test for CPIAUCSL: p-value = 0.00000
ADF Test for UNRATE: p-value = 0.08106
ADF Test for UNRATE: p-value = 0.00000
ADF Test for FEDFUNDS: p-value = 0.07488
ADF Test for FEDFUNDS: p-value = 0.01105
ADF Test for INDPRO: p-value = 0.20939
ADF Test for INDPRO: p-value = 0.00000
ADF Test for M2SL: p-value = 0.96974
ADF Test for M2SL: p-value = 0.00145
ADF Test for GDPC1: p-value = 0.99798
ADF Test for GDPC1: p-value = 0.00102
ADF Test for GDPPOT: p-value = 1.00000
ADF Test for GDPPOT: p-value = 0.93725
ADF Test for GDPPOT: p-value = 0.00148

All series are now stationary!
Number of differences applied per series: {'CPIAUCSL': 2, 'UNRATE': 1, 'FEDFUNDS': 1, 'INDPRO': 1, 'M2SL': 1, 'GDPC1': 1, 'GDPPOT': 2}


In [8]:
# Import necessary paths from config
from config import PROCESSED_DATA_DIR
import os

# Chronological split fractions; the remainder becomes the RL test set
VARMAX_FRACTION = 0.7
RL_TRAIN_FRACTION = 0.2

# Compute split sizes
varmax_size = int(len(df_diff) * VARMAX_FRACTION)
rl_train_size = int(len(df_diff) * RL_TRAIN_FRACTION)
rl_train_end = varmax_size + rl_train_size

# Split the data
varmax_train_data = df_diff.iloc[:varmax_size]  # VARMAX model training (70%)
rl_train_data = df_diff.iloc[varmax_size:rl_train_end]  # RL training (20%)
rl_test_data = df_diff.iloc[rl_train_end:]  # RL evaluation (remaining ~10%)

# The three pieces must partition df_diff exactly
assert len(varmax_train_data) + len(rl_train_data) + len(rl_test_data) == len(df_diff)

# Undifferenced levels at the last date of each split.
# df_diff is shorter than df (rows are dropped while differencing), so the rows
# are looked up by date label rather than by df_diff-based integer position.
last_real_varmax = df.loc[varmax_train_data.index[-1]]
last_real_rl_train = df.loc[rl_train_data.index[-1]]
last_real_rl_test = df.loc[rl_test_data.index[-1]]

# Define file paths
varmax_path = os.path.join(PROCESSED_DATA_DIR, "varmax_train_data.csv")
rl_train_path = os.path.join(PROCESSED_DATA_DIR, "rl_train_data.csv")
rl_test_path = os.path.join(PROCESSED_DATA_DIR, "rl_test_data.csv")

last_real_varmax_path = os.path.join(PROCESSED_DATA_DIR, "last_real_varmax.csv")
last_real_rl_train_path = os.path.join(PROCESSED_DATA_DIR, "last_real_rl_train.csv")
last_real_rl_test_path = os.path.join(PROCESSED_DATA_DIR, "last_real_rl_test.csv")

# Ensure the processed data directory exists
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Save datasets
varmax_train_data.to_csv(varmax_path)
rl_train_data.to_csv(rl_train_path)
rl_test_data.to_csv(rl_test_path)

# Save last real values for differencing reversal
last_real_varmax.to_csv(last_real_varmax_path)
last_real_rl_train.to_csv(last_real_rl_train_path)
last_real_rl_test.to_csv(last_real_rl_test_path)

print("Train-Test Split & Last Real Values Saved")
print(f"VARMAX Training Data: {varmax_path} (Shape: {varmax_train_data.shape})")
print(f"RL Training Data: {rl_train_path} (Shape: {rl_train_data.shape})")
print(f"RL Test Data: {rl_test_path} (Shape: {rl_test_data.shape})")
print(f"Last Real Values (VARMAX): {last_real_varmax_path}")
print(f"Last Real Values (RL Train): {last_real_rl_train_path}")
print(f"Last Real Values (RL Test): {last_real_rl_test_path}")


Train-Test Split & Last Real Values Saved
VARMAX Training Data: /Users/nim/Documents/Research/central_bank_rl/data/processed/varmax_train_data.csv (Shape: (170, 7))
RL Training Data: /Users/nim/Documents/Research/central_bank_rl/data/processed/rl_train_data.csv (Shape: (48, 7))
RL Test Data: /Users/nim/Documents/Research/central_bank_rl/data/processed/rl_test_data.csv (Shape: (26, 7))
Last Real Values (VARMAX): /Users/nim/Documents/Research/central_bank_rl/data/processed/last_real_varmax.csv
Last Real Values (RL Train): /Users/nim/Documents/Research/central_bank_rl/data/processed/last_real_rl_train.csv
Last Real Values (RL Test): /Users/nim/Documents/Research/central_bank_rl/data/processed/last_real_rl_test.csv


In [9]:
import pandas as pd
import os
## put in config later?
def reverse_differencing(predictions, last_real_values_path, diff_counts, history=None):
    """
    Reconstructs level values from differenced data by cumulative summation.

    predictions: DataFrame of differenced values, one column per series. It is
        NOT modified; a reconstructed copy is returned.
    last_real_values_path: Path to a CSV holding one value per series (series
        name in the first column) - the last known level to integrate from.
    diff_counts: Dictionary storing the number of differences applied per series.
        Columns with a count of 0 are returned unchanged.
    history: Optional DataFrame of undifferenced levels whose last row is the
        same observation as the saved last real values. It is only needed for
        series differenced more than once, where the last lower-order
        differences are required in addition to the last level. If omitted,
        those missing starting values are assumed to be 0 and a warning is
        issued, so the result for such series is only an approximation.

    Returns: DataFrame with the same shape, index and columns as `predictions`.

    Raises: ValueError if `history` is too short to compute the required
        lower-order differences for a column.
    """
    import warnings  # local import so the cell does not depend on other cells for it

    # read_csv's `squeeze=True` argument was removed in pandas 2.0;
    # squeeze the single value column into a Series afterwards instead.
    last_real_values = pd.read_csv(last_real_values_path, index_col=0).squeeze("columns")

    # Ensure last_real_values is aligned with the predictions columns
    last_real_values = last_real_values.loc[predictions.columns]

    # Work on a copy so the caller's frame is left untouched
    restored = predictions.copy()

    for col in restored.columns:
        order = diff_counts[col]
        if order <= 0:
            continue

        # anchors[k] is the last observed value of the k-times-differenced series
        anchors = [last_real_values[col]]
        if order > 1:
            if history is not None:
                past = history[col].dropna()
                for _ in range(order - 1):
                    past = past.diff().dropna()
                    if past.empty:
                        raise ValueError(
                            f"history for '{col}' is too short to reverse "
                            f"{order} differences"
                        )
                    anchors.append(past.iloc[-1])
            else:
                warnings.warn(
                    f"'{col}' was differenced {order} times but no history was "
                    "given; assuming the last lower-order differences are 0, "
                    "so the reconstruction is approximate."
                )
                anchors.extend([0.0] * (order - 1))

        # Integrate from the highest differencing order back down to the levels
        series = restored[col]
        for anchor in reversed(anchors):
            series = anchor + series.cumsum()
        restored[col] = series

    return restored
