In [1]:
import pandas as pd
import numpy as np
import pandas_datareader.data as pdr
import datetime
import yfinance as yf
import plotly.express as px
from statsmodels.tsa.stattools import adfuller
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
# =========================
# Step 1: Data Preparation
# =========================
start = datetime.datetime(1990, 1, 1)
end   = datetime.datetime(2026, 1, 31)

fred_codes = [
    'DGS10', 'T10Y2Y', 'DGS5', 'DGS2', 'TB3MS',
    'BAA', 'AAA',
    'CPIAUCSL', 'UNRATE', 'INDPRO',
    'USREC', 'VIXCLS'
]

In [3]:
# 1) Pull FRED (daily/mixed) and convert to monthly (month-start index, end-of-month value)
df_fred = pdr.DataReader(fred_codes, 'fred', start, end)

df_fred_m = df_fred.resample('MS').last() # 统一成月频

In [4]:
# 2) Pull MOVE and convert to monthly
df_move = yf.download("^MOVE", start="1990-01-01", end="2026-01-31", progress=False)

df_move_m = df_move['Close'].resample('MS').last() # 同样月频化

In [5]:
# 3) Merge (monthly)
df = df_fred_m.merge(df_move_m, left_on="DATE", right_on="Date", how="outer").set_index(df_fred_m.index)

# 'CPIAUCSL', 'UNRATE', 'INDPRO' 都缺一个值
# Fill key monthly macro gaps (safe forward-fill for monthly series)
df[['CPIAUCSL', 'UNRATE', 'INDPRO']] = df[['CPIAUCSL', 'UNRATE', 'INDPRO']].ffill()
# CPI/失业率/工业生产有时是月度发布但会缺值，用前值填充

In [6]:
# =========================
# Step 1b: Construct target y (10Y bond monthly excess return)
# =========================
duration = 9  # approx duration for 10Y Treasury

# price return from yield change
df['price_ret'] = -duration * (df['DGS10'].diff() / 100)

# carry return (use yield at start of month => shift(1))
df['carry_ret'] = (df['DGS10'].shift(1) / 100) / 12

df['total_bond_ret'] = df['price_ret'] + df['carry_ret']
df['excess_return'] = df['total_bond_ret'] - ((df['TB3MS'] / 100) / 12)

In [7]:
# =========================
# Step 1c: Construct predictors X
# =========================
df['slope'] = df['T10Y2Y']
df['curvature'] = (2 * df['DGS5']) - df['DGS2'] - df['DGS10']
df['short_rate'] = df['TB3MS']
df['default_spread'] = df['BAA'] - df['AAA']
df['inflation_yoy'] = df['CPIAUCSL'].pct_change(12)
df['unemployment'] = df['UNRATE']
df['recession_indicator'] = df['USREC']
df['industrial_prod_yoy'] = df['INDPRO'].pct_change(12)
df['vix_index'] = df['VIXCLS']

In [8]:
# Bond volatility: use MOVE when available; otherwise use realized vol from daily DGS10
# (This part uses daily DGS10, so compute from df_fred daily then monthly last)
df_fred['DGS10_filled'] = df_fred['DGS10'].ffill()
df_fred['yield_diff'] = df_fred['DGS10_filled'].diff()
df_fred['realized_daily_vol_30d'] = df_fred['yield_diff'].rolling(window=30).std() * np.sqrt(252)  # annualized
realized_m = df_fred['realized_daily_vol_30d'].resample('MS').last() * 100  # to % scale
df['bond_volatility'] = df['^MOVE'].fillna(realized_m)

# Bond momentum: past 12m excess return, lagged by 1 month
df['excess_return_12m'] = df['excess_return'].rolling(window=12).sum()
df['bond_momentum'] = df['excess_return_12m'].shift(1) # shift(1)：用“到上个月为止”的动量来预测本月，避免偷看本月回报

In [9]:
# =========================
# Step 1d: Lag predictors by 1 month (avoid look-ahead bias)
# =========================
base_predictors = [
    'slope', 'curvature', 'short_rate', 'default_spread', 'inflation_yoy',
    'unemployment', 'recession_indicator', 'industrial_prod_yoy',
    'vix_index', 'bond_volatility', 'bond_momentum'
]

X = df[base_predictors].shift(1)
y = df['excess_return']

In [10]:
df_model = pd.concat([y, X], axis=1).dropna()
df_model.rename(columns={'excess_return': 'y'}, inplace=True)

# =========================
# (Optional but common) Make short_rate stationary + add AR(1) term
# =========================
df_model['short_rate_diff'] = df_model['short_rate'].diff()
df_model['y_lag1'] = df_model['y'].shift(1)

feature_cols = [
    'slope', 'curvature', 'short_rate_diff', 'default_spread', 'inflation_yoy',
    'unemployment', 'recession_indicator', 'industrial_prod_yoy',
    'vix_index', 'bond_volatility', 'bond_momentum', 'y_lag1'
]

df_model = df_model[['y'] + feature_cols].dropna()

### My code starts from here:

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# =========================
# Step 3: Regularized Regression (Expanding Window + TS-CV)
# =========================

# ---- helper metrics ----
def oos_r2_vs_benchmark(y_true, y_pred, y_bench):
    """
    OOS R^2 = 1 - SSE_model / SSE_benchmark
    Benchmark here is expanding mean forecast.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_bench = np.asarray(y_bench, dtype=float)
    sse_model = np.sum((y_true - y_pred) ** 2)
    sse_bench = np.sum((y_true - y_bench) ** 2)
    return np.nan if sse_bench == 0 else 1 - sse_model / sse_bench


def expanding_oos_cv(
    df_model,
    feature_cols,
    y_col="y",
    model_name="ridge",
    initial_train=120,
    step=1,                     # one-step-ahead
    n_splits=5,                 # inner CV 的切分数（TimeSeriesSplit）
    alphas=None,                # lambda grid
    l1_ratios=None              # only for elastic net
):
    """
    Expanding window one-step-ahead forecasting:
    Outer loop: expanding train, predict next month.
    Inner loop: GridSearchCV + TimeSeriesSplit on training set to select best regularization (alpha / l1_ratio).
    """
    if alphas is None:
        # 常用的 log 网格，覆盖范围大一点更稳
        alphas = np.logspace(-4, 2, 30)

    X_all = df_model[feature_cols].copy()
    y_all = df_model[y_col].copy()

    n = len(df_model)
    if n <= initial_train + 1:
        raise ValueError("样本量太小：需要 n > initial_train + 1。请调小 initial_train 或增加样本期。")

    # store results
    oos_dates = []
    y_true_list, y_pred_list, y_bench_list = [], [], []
    best_params_list = []

    # expanding window outer loop
    for t in range(initial_train, n - 1, step):
        train_idx = np.arange(0, t)         # expanding
        test_idx  = np.arange(t, t + 1)     # next month (one-step)

        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_test,  y_test  = X_all.iloc[test_idx],  y_all.iloc[test_idx]

        # benchmark: expanding mean forecast based on training y
        y_bench = float(np.mean(y_train))

        # pipeline: scaler fitted ONLY on training folds (no leakage)
        if model_name.lower() == "ridge":
            base_model = Ridge(random_state=42)
            param_grid = {"model__alpha": alphas}

        elif model_name.lower() == "lasso":
            # Lasso 对收敛更敏感，给更高 max_iter：since Lasso is sensitive to shrinkage, we use higher max_iter
            base_model = Lasso(max_iter=100000, random_state=42)
            param_grid = {"model__alpha": alphas}

        elif model_name.lower() in ["elasticnet", "elastic_net", "enet"]:
            base_model = ElasticNet(max_iter=100000, random_state=42)
            if l1_ratios is None:
                l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
            param_grid = {
                "model__alpha": alphas,
                "model__l1_ratio": l1_ratios
            }
        else:
            raise ValueError("model_name 只能是 ridge / lasso / elasticnet")

        pipe = Pipeline([
            ("scaler", StandardScaler()),
            ("model", base_model)
        ])

        # inner TS CV: only within training set
        tscv = TimeSeriesSplit(n_splits=n_splits)

        # 用 MSE 做目标（越小越好）；GridSearchCV 默认是越大越好，所以用 neg MSE：MSE smaller the better, but GridSearchCV is bigger MSE the better, so we use neg MSE
        gscv = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            scoring="neg_mean_squared_error",
            cv=tscv,
            n_jobs=-1
        )
        gscv.fit(X_train, y_train)

        # one-step prediction
        y_hat = float(gscv.predict(X_test)[0])

        # store
        oos_dates.append(df_model.index[test_idx][0])
        y_true_list.append(float(y_test.iloc[0]))
        y_pred_list.append(y_hat)
        y_bench_list.append(y_bench)
        best_params_list.append(gscv.best_params_)

    # summary
    y_true_arr = np.array(y_true_list)
    y_pred_arr = np.array(y_pred_list)
    y_bench_arr = np.array(y_bench_list)

    oos_r2 = oos_r2_vs_benchmark(y_true_arr, y_pred_arr, y_bench_arr)
    rmse = float(np.sqrt(mean_squared_error(y_true_arr, y_pred_arr)))

    pred_df = pd.DataFrame({
        "date": oos_dates,
        "y_true": y_true_arr,
        "y_pred": y_pred_arr,
        "y_bench_expanding_mean": y_bench_arr
    }).set_index("date")

    return {
        "model": model_name,
        "oos_r2": float(oos_r2),
        "rmse": rmse,
        "predictions": pred_df,
        "best_params_over_time": pd.DataFrame(best_params_list, index=pred_df.index)
    }

# ---- run 3 models ----
# 你可以根据样本长度调 initial_train（例如 120 或 180）：we can adjust initial_train according to sample size, e.g. 120/100
alphas_grid = np.logspace(-4, 2, 30)

ridge_res = expanding_oos_cv(
    df_model=df_model,
    feature_cols=feature_cols,
    model_name="ridge",
    initial_train=120,
    n_splits=5,
    alphas=alphas_grid
)

lasso_res = expanding_oos_cv(
    df_model=df_model,
    feature_cols=feature_cols,
    model_name="lasso",
    initial_train=120,
    n_splits=5,
    alphas=alphas_grid
)

enet_res = expanding_oos_cv(
    df_model=df_model,
    feature_cols=feature_cols,
    model_name="elasticnet",
    initial_train=120,
    n_splits=5,
    alphas=alphas_grid,
    l1_ratios=[0.1, 0.3, 0.5, 0.7, 0.9]
)

# ---- compare summary table ----
summary = pd.DataFrame([
    {"Model": "Ridge",      "OOS_R2": ridge_res["oos_r2"], "RMSE": ridge_res["rmse"]},
    {"Model": "Lasso",      "OOS_R2": lasso_res["oos_r2"], "RMSE": lasso_res["rmse"]},
    {"Model": "ElasticNet", "OOS_R2": enet_res["oos_r2"],  "RMSE": enet_res["rmse"]}
]).set_index("Model")

print(summary)

              OOS_R2      RMSE
Model                         
Ridge      -0.045911  0.024149
Lasso      -0.002006  0.023637
ElasticNet -0.003219  0.023651


In [None]:
# 看看每期选到的 lambda / l1_ratio 随时间怎么变：show how parameters change over different period
print(ridge_res["best_params_over_time"].tail())
print(lasso_res["best_params_over_time"].tail())
print(enet_res["best_params_over_time"].tail())

            model__alpha
date                    
2025-08-01         100.0
2025-09-01         100.0
2025-10-01         100.0
2025-11-01         100.0
2025-12-01         100.0
            model__alpha
date                    
2025-08-01      0.004520
2025-09-01      0.007279
2025-10-01      0.007279
2025-11-01      0.007279
2025-12-01      0.007279
            model__alpha  model__l1_ratio
date                                     
2025-08-01      0.018874              0.3
2025-09-01      0.007279              0.9
2025-10-01      0.007279              0.9
2025-11-01      0.011721              0.5
2025-12-01      0.018874              0.3
