Economic Regime Forecasting Pipeline

Preprocessing

In [5]:
import yfinance as yf
import pandas as pd
import numpy as np
from itertools import combinations

# Pipeline label -> Yahoo Finance symbol (equity benchmark, commodities,
# rates, FX and volatility). The labels become column names and are embedded
# in every feature name built from them, so they must stay stable.
tickers = {
    'GOLD': 'GC=F',         # Gold futures
    'OIL': 'CL=F',          # Crude oil futures
    'SPGSCI': '^SPGSCI',    # S&P GSCI commodity index
    'SPX': '^GSPC',         # S&P 500 index
    'US02': '^IRX',         # 13-week T-bill yield, used here as a stand-in for the US 2Y
    'US10': '^TNX',         # 10-year Treasury note yield
    'USDCAD': 'CAD=X',      # USD/CAD
    # NOTE(review): Yahoo's EURUSD=X quotes EUR/USD (dollars per euro), i.e. the
    # inverse of what the 'USDEUR' label suggests. The label is left unchanged
    # because feature names such as 'SPX_USDEUR_ret_30' are derived from it.
    'USDEUR': 'EURUSD=X',
    'USDJPY': 'JPY=X',      # USD/JPY
    'VIX': '^VIX',          # CBOE volatility index
}

# Fetch the full history for all symbols in one request; group_by='ticker'
# makes the first column level the Yahoo symbol, so raw[symbol] is one
# instrument's price frame.
yahoo_symbols = list(tickers.values())
raw = yf.download(yahoo_symbols, start="1980-01-01", group_by='ticker', auto_adjust=True)

# Keep only each instrument's 'Close' series, keyed by the pipeline label.
close_by_label = {}
for label, yahoo_symbol in tickers.items():
    close_by_label[label] = raw[yahoo_symbol]['Close']
data = pd.DataFrame(close_by_label)

# Daily log returns: r_t = ln(P_t / P_{t-1}).
#
# A log return is only defined when both consecutive quotes are strictly
# positive. Some of these series can be quoted at or below zero (yields, and
# futures in extreme cases); taking the log there yields NaN or +/-inf and a
# NumPy RuntimeWarning. The +/-inf values are NOT removed by dropna() and would
# contaminate every rolling window that contains them. Such observations are
# therefore set to NaN *before* the log, so they are dropped like any other
# missing value.
previous_close = data.shift(1)
both_positive = (data > 0) & (previous_close > 0)
daily_returns = np.log((data / previous_close).where(both_positive)).dropna()

# Exact-zero returns are blanked out and replaced by the most recent non-zero
# return of the same series (forward fill). A zero in the very first row has
# nothing to fill from and stays NaN.
# NOTE(review): presumably zero returns are treated as stale quotes — confirm
# this is the intended handling.
daily_returns = daily_returns.mask(daily_returns == 0).ffill()

# Rolling correlations between every unordered pair of return series, for a
# 30-row and a 60-row window, plus two derived features per correlation column.


def _rolling_pair_corr(returns, window):
    """Return one '<a>_<b>_corr_<window>' column per unordered column pair.

    Each column is the rolling correlation of ``returns[a]`` with
    ``returns[b]`` over ``window`` rows, indexed like ``returns``.
    """
    pair_columns = {
        f"{a}_{b}_corr_{window}": returns[a].rolling(window).corr(returns[b])
        for a, b in combinations(returns.columns, 2)
    }
    return pd.DataFrame(pair_columns, index=returns.index)


def _with_corr_dynamics(corr):
    """Return a copy of ``corr`` with two extra columns per original column.

    * '<col>_30d_avg_diff': 30-row rolling mean of the column minus its
      current value (the smoothing window is 30 rows for both the 30- and
      60-row correlation frames).
    * '<col>_roc_5': 5-row percentage change of the column.
    """
    enriched = corr.copy()
    for name in corr.columns:
        series = corr[name]
        enriched[f"{name}_30d_avg_diff"] = series.rolling(30).mean() - series
        enriched[f"{name}_roc_5"] = series.pct_change(periods=5)
    return enriched


rolling_corr_30 = _rolling_pair_corr(daily_returns, 30)
rolling_corr_60 = _rolling_pair_corr(daily_returns, 60)

# Feature engineering on the correlation frames
fe_corr_30 = _with_corr_dynamics(rolling_corr_30)
fe_corr_60 = _with_corr_dynamics(rolling_corr_60)

# Relative log returns: for every unordered pair (a, b), the rolling mean return
# of a minus the rolling mean return of b, over 30-row and 60-row windows.


def _relative_rolling_returns(returns, window):
    """Return one '<a>_<b>_ret_<window>' column per unordered column pair.

    The rolling mean of every series is computed once for the whole frame and
    reused for all pairs, instead of being recomputed twice inside the pair
    loop; the resulting values are the same.
    """
    rolling_mean = returns.rolling(window).mean()
    pair_columns = {
        f"{a}_{b}_ret_{window}": rolling_mean[a] - rolling_mean[b]
        for a, b in combinations(returns.columns, 2)
    }
    return pd.DataFrame(pair_columns, index=returns.index)


def _with_roc_5(frame):
    """Return a copy of ``frame`` plus a '<col>_roc_5' column per original column.

    '<col>_roc_5' is the 5-row percentage change of the column.
    """
    enriched = frame.copy()
    for name in frame.columns:
        enriched[f"{name}_roc_5"] = frame[name].pct_change(periods=5)
    return enriched


relative_return_30 = _relative_rolling_returns(daily_returns, 30)
relative_return_60 = _relative_rolling_returns(daily_returns, 60)

# Feature engineering on the relative-return frames
fe_ret_30 = _with_roc_5(relative_return_30)
fe_ret_60 = _with_roc_5(relative_return_60)

# Remove incomplete rows from each feature frame (rolling windows and
# percentage changes leave NaN in the rows where they cannot be computed).
fe_corr_30, fe_corr_60, fe_ret_30, fe_ret_60 = [
    frame.dropna() for frame in (fe_corr_30, fe_corr_60, fe_ret_30, fe_ret_60)
]

# Align the four frames side by side on their index and keep only the rows
# that are complete in all of them.
all_features = pd.concat([fe_corr_30, fe_corr_60, fe_ret_30, fe_ret_60], axis=1).dropna()


def zscore(x):
    """Rolling z-score of ``x``: deviation from the rolling mean in rolling-std units.

    Uses an 875-row window (3.5 years at 250 rows per year), so the first 874
    rows of every column come out as NaN.
    """
    window = x.rolling(875)
    return (x - window.mean()) / window.std()


# Standardise every numeric column independently with the rolling z-score.
numeric_cols = all_features.select_dtypes(include=[np.number]).columns
all_features[numeric_cols] = all_features[numeric_cols].apply(zscore)

# Only this hand-picked subset of the engineered features is carried forward;
# every other column of all_features is discarded at this point.
selected_features = [
    'USDCAD_USDJPY_corr_30',
    'USDCAD_USDJPY_ret_60',
    'USDJPY_VIX_corr_30',
    'GOLD_OIL_corr_60',
    'SPX_USDEUR_ret_30',
    'US02_USDCAD_corr_60',
    'US02_VIX_corr_60'
]

# dropna() also removes the rows left as NaN by the rolling z-score above.
df_features = all_features[selected_features].dropna()

# Lagged copies of every selected feature. shift() moves values by ROWS, so
# a "lag" of 1 / 5 / 21 means that many rows of df_features back, and each
# copy gets a "_lag<k>" suffix on its column names.
lags = [1, 5, 21]
lagged_features = [
    df_features.shift(lag).add_suffix(f"_lag{lag}")
    for lag in lags
]

# Current values first, then the lag-1, lag-5 and lag-21 blocks; rows that
# do not have all lags available are dropped.
df_final_features = pd.concat([df_features, *lagged_features], axis=1).dropna()


[*********************100%***********************]  10 of 10 completed
  result = func(self.values, **kwargs)


HMM + LSTM

In [6]:
# Predict HMM states using saved model
import joblib
from tensorflow.keras.models import load_model

# NOTE(security): joblib.load unpickles the file, and unpickling can execute
# arbitrary code. Only load model artifacts that come from a trusted source.
hmm_model = joblib.load("Models/Macro_hmm_model.pkl")

# Decode one HMM state per row and attach it as an extra column.
# A previously attached "HMM_State" column is excluded from the model input so
# that re-running this cell gives the HMM the same feature matrix as the first
# run, instead of a matrix with one extra column.
hmm_feature_cols = [name for name in df_final_features.columns if name != "HMM_State"]
hmm_states = hmm_model.predict(df_final_features[hmm_feature_cols])
df_final_features["HMM_State"] = hmm_states

# Load LSTM model and run inference
lstm_model = load_model("Models/Macro_lstm_regime_model.keras")

lookback = 30
if len(df_final_features) <= lookback:
    # Without this check x_seq would be an empty 1-D array and the failure
    # would surface later as a shape error inside the model.
    raise ValueError(
        f"Need more than {lookback} feature rows to build an LSTM window, "
        f"got {len(df_final_features)}"
    )

# Extract the values once instead of once per window. Each sample holds the
# `lookback` rows strictly BEFORE row i (features plus HMM_State) and is
# labelled with the index value of row i.
feature_matrix = df_final_features.values
x_seq = np.array([
    feature_matrix[i - lookback:i]
    for i in range(lookback, len(feature_matrix))
])
dates = list(df_final_features.index[lookback:])

# Take the class with the highest predicted score for each sample.
predicted_probs = lstm_model.predict(x_seq)
predicted_classes = np.argmax(predicted_probs, axis=1)

results = pd.DataFrame({
    "Date": dates,
    "Macro Regime": predicted_classes
}).set_index("Date")

# NOTE(review): the file name says "2021_onward", but nothing in this cell
# restricts the rows to 2021 or later — every available window is exported.
# Confirm whether a date filter or a different file name is intended.
results.to_csv("predicted_regimes_2021_onward.csv")
print("✅ Exported LSTM regime predictions to 'predicted_regimes_2021_onward.csv'")

[1m128/128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
✅ Exported LSTM regime predictions to 'predicted_regimes_2021_onward.csv'
