In [8]:
# Pin the versions this notebook was developed against.
# %pip (rather than !pip) installs into the environment of the running kernel,
# not into whichever `pip` happens to be first on the shell PATH.
%pip install numpy==1.26.4
%pip install protobuf==4.25.3


Collecting protobuf==4.25.3
  Downloading protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Downloading protobuf-4.25.3-cp37-abi3-macosx_10_9_universal2.whl (394 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 6.31.0
    Uninstalling protobuf-6.31.0:
      Successfully uninstalled protobuf-6.31.0
Successfully installed protobuf-4.25.3


In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install numpy==1.26.4


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl (20.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.32.0 requires protobuf<5,>=3.20, but you have protobuf 6.31.0 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [1]:
import numpy as np
import yfinance as yf
# Report the versions of the libraries imported above, one line per library.
for label, module in (("NumPy", np), ("yfinance", yf)):
    print(f"{label} version:", module.__version__)


NumPy version: 1.26.4
yfinance version: 0.2.61


In [14]:
# Remove and reinstall the pinned NumPy build.
# %pip (rather than !pip) operates on the environment of the running kernel.
%pip uninstall -y numpy
%pip install numpy==1.26.4


Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl (20.3 MB)
Installing collected packages: numpy
Successfully installed numpy-1.26.4


In [None]:
import pandas as pd
import numpy as np
import fredapi as fa
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/sklearn.
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so repeated runs draw the same numbers
np.random.seed(seed=42)

# 1. Enhanced Data Sourcing from FRED API (Monthly Resampling)
def fetch_fred_data(start_date='2000-01-01', end_date=None,
                    api_key=None, resample_freq='M'):
    """Download a fixed set of FRED series and resample them to one common frequency.

    Parameters
    ----------
    start_date : str
        First observation date requested from FRED ('YYYY-MM-DD').
    end_date : str or None
        Last observation date requested from FRED. ``None`` (the default) means
        "today", resolved each time the function is called rather than once when
        the function is defined, so a long-lived kernel does not keep a stale date.
    api_key : str or None
        FRED API key. When ``None`` the ``FRED_API_KEY`` environment variable is used.
    resample_freq : str
        pandas resample alias; the last observation within each period is kept.
        NOTE(review): pandas 2.2+ deprecates the 'M' alias in favour of 'ME' --
        confirm against the installed pandas version.

    Returns
    -------
    pandas.DataFrame
        One column per fetched series (named by the mapping below), plus
        ``spread_2y_10y`` and ``yield_curve_inverted`` when both the 2-year and
        10-year yields are present. Gaps are forward filled.

    Raises
    ------
    ValueError
        If no API key is passed and ``FRED_API_KEY`` is not set.
    """
    if end_date is None:
        end_date = datetime.today().strftime('%Y-%m-%d')

    if api_key is None:
        api_key = os.environ.get('FRED_API_KEY')
        if api_key is None:
            raise ValueError(
                "FRED API key must be provided either directly or via FRED_API_KEY environment variable")

    fred = fa.Fred(api_key=api_key)

    # FRED series id -> column name used throughout the notebook
    series_dict = {
        'DGS3': '3_year_yield',
        'DGS5': '5_year_yield',
        'DGS7': '7_year_yield',
        'DGS10': '10_year_yield',
        'DGS20': '20_year_yield',
        'CPIAUCSL': 'cpi',
        'FEDFUNDS': 'fed_funds_rate',
        'VIXCLS': 'vix',
        'DGS2': '2_year_yield',
        'CPILFESL': 'core_cpi',
        'WALCL': 'fed_assets',
        'DTWEXBGS': 'usd_index',
        'GDP': 'gdp'
    }

    all_data = {}

    for series_id, series_name in series_dict.items():
        print(f"Fetching {series_name} data...")
        all_data[series_name] = fred.get_series(series_id, start_date, end_date)

    # Building the frame from a dict of Series aligns them on the union of their dates
    df = pd.DataFrame(all_data)

    # Keep the last observation of each period
    df = df.resample(resample_freq).last()

    # Forward fill BEFORE deriving the spread so the derived columns are always
    # computed from the same (filled) yield values that end up in the frame.
    df = df.ffill()

    if '2_year_yield' in df.columns and '10_year_yield' in df.columns:
        df['spread_2y_10y'] = df['10_year_yield'] - df['2_year_yield']
        df['yield_curve_inverted'] = (df['spread_2y_10y'] < 0).astype(int)

    print(f"Data fetched and resampled successfully to monthly. Shape: {df.shape}")
    return df


# 2. Enhanced Feature Engineering (Monthly)
def create_features(df, lags=[1, 3, 6]):  # Monthly lags
    """Derive lag, change, rolling-mean and interaction features from the input frame.

    Only the 2-year yield, 10-year yield, fed funds rate, CPI and 2y/10y spread
    columns receive derived features, and only when they exist in ``df``.
    The input is copied, and rows containing any NaN are dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    lags : list of int
        Row offsets used for the ``<col>_lag<k>M`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra feature columns and NaN rows removed.
    """
    feats = df.copy()

    # Base series eligible for derived features, in the order features are emitted
    candidates = ['2_year_yield', '10_year_yield', 'fed_funds_rate', 'cpi', 'spread_2y_10y']
    base_cols = [name for name in candidates if name in feats.columns]

    # Lagged copies: <col>_lag<k>M
    for name in base_cols:
        series = feats[name]
        for k in lags:
            feats[f'{name}_lag{k}M'] = series.shift(k)

    # Differences over 1, 3 and 12 rows: <col>_<h>M_chg
    for name in base_cols:
        for horizon in (1, 3, 12):
            feats[f'{name}_{horizon}M_chg'] = feats[name].diff(horizon)

    # Six-row rolling mean: <col>_ma6M
    for name in base_cols:
        feats[f'{name}_ma6M'] = feats[name].rolling(window=6).mean()

    # Single interaction term, only when both inputs are available
    if {'fed_funds_rate', 'cpi'}.issubset(feats.columns):
        feats['fed_funds_rate_x_cpi'] = feats['fed_funds_rate'] * feats['cpi']

    # The shift/diff/rolling steps leave NaNs in the leading rows; drop them
    return feats.dropna()


# 3. Prepare Data for Modeling with Train/Test Split
def prepare_data(df_features, target_col='10_year_yield', test_size=0.2):
    """Build the next-period target and split chronologically into train and test sets.

    The target for each row is the value of ``target_col`` one row later, so the
    final row (which has no following value) is dropped. The split is positional:
    the first ``1 - test_size`` share of rows is training data, the rest is test data.

    The caller's ``df_features`` is left untouched; the ``target`` column is added
    to an internal copy only.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature frame ordered in time.
    target_col : str
        Column whose next-row value is predicted. Its current value stays in the
        feature set.
    test_size : float
        Fraction of rows placed in the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_cols)``.
    """
    # assign() returns a new frame, so the input DataFrame never gains a 'target' column
    modeling_df = df_features.assign(target=df_features[target_col].shift(-1))  # Predict next month's yield
    modeling_df = modeling_df.dropna()

    exclude_cols = ['target']  # Keep all engineered features
    feature_cols = [col for col in modeling_df.columns if col not in exclude_cols]

    split_idx = int(len(modeling_df) * (1 - test_size))

    X_train = modeling_df[feature_cols].iloc[:split_idx]
    y_train = modeling_df['target'].iloc[:split_idx]

    X_test = modeling_df[feature_cols].iloc[split_idx:]
    y_test = modeling_df['target'].iloc[split_idx:]

    print(f"Training data shape: {X_train.shape}, Test data shape: {X_test.shape}")

    return X_train, X_test, y_train, y_test, feature_cols


# 4. Train Voting Ensemble Model (XGBoost + RandomForest)
def train_ensemble_model(X_train, y_train, param_grid=None):
    """Grid-search a VotingRegressor made of an XGBoost and a RandomForest regressor.

    Candidates are scored with negative mean squared error over a 5-split
    ``TimeSeriesSplit``.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed straight to ``GridSearchCV.fit``.
    param_grid : dict or None
        Grid keyed by ``xgb__*`` / ``rf__*`` parameter names. A built-in grid is
        used when ``None``.

    Returns
    -------
    tuple
        ``(best_estimator, fitted_grid_search)``.
    """
    if param_grid is None:
        param_grid = {
            'xgb__n_estimators': [50, 100],
            'xgb__max_depth': [2, 3],
            'xgb__learning_rate': [0.01, 0.05],
            'xgb__subsample': [0.7, 0.8],
            'xgb__reg_alpha': [0.1, 1],
            'xgb__reg_lambda': [0.1, 1],
            'rf__n_estimators': [50, 100],
            'rf__max_depth': [3, 5]
        }

    # The names 'xgb' and 'rf' are the prefixes the grid keys refer to
    voter = VotingRegressor(estimators=[
        ('xgb', xgb.XGBRegressor(objective='reg:squarederror', random_state=42)),
        ('rf', RandomForestRegressor(random_state=42)),
    ])

    search = GridSearchCV(
        estimator=voter,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=TimeSeriesSplit(n_splits=5),
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    print(f"Best Ensemble Parameters: {search.best_params_}")

    return search.best_estimator_, search


# 5. Model Evaluation on Test Set
def _directional_accuracy(y_true, preds):
    """Return the percentage of rows where predicted and realized moves share a sign.

    A "move" is measured against the previous realized value of ``y_true``. The
    first row has no previous value and is excluded instead of being compared
    against a fictitious baseline of 0. Returns NaN when no row can be scored.
    """
    previous = y_true.shift(1)
    scorable = previous.notna()
    if not scorable.any():
        return float('nan')

    pred_series = pd.Series(preds, index=y_true.index)
    actual_move = np.sign(y_true[scorable] - previous[scorable])
    predicted_move = np.sign(pred_series[scorable] - previous[scorable])
    return np.mean(actual_move == predicted_move) * 100


def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Report RMSE, MAE and directional accuracy on the train and test sets, and plot results.

    Side effects: prints both metric blocks, shows an actual-vs-predicted line
    chart for the test set, and, when the model exposes a fitted ``'xgb'``
    sub-estimator through ``named_estimators_``, shows a bar chart of its 15
    highest gain-based feature importances.

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method.
    X_train, X_test
        Feature sets passed to ``model.predict``.
    y_train, y_test : pandas.Series
        Realized targets, ordered in time.

    Returns
    -------
    dict
        Keys ``train_rmse``, ``train_mae``, ``train_dir_acc``, ``test_rmse``,
        ``test_mae``, ``test_dir_acc``.
    """
    train_preds = model.predict(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
    train_mae = mean_absolute_error(y_train, train_preds)
    train_dir_acc = _directional_accuracy(y_train, train_preds)

    test_preds = model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))
    test_mae = mean_absolute_error(y_test, test_preds)
    test_dir_acc = _directional_accuracy(y_test, test_preds)

    print("\n--- Training Set Performance ---")
    print(f"RMSE: {train_rmse:.4f}")
    print(f"MAE: {train_mae:.4f}")
    print(f"Directional Accuracy: {train_dir_acc:.2f}%")

    print("\n--- Test Set Performance ---")
    print(f"RMSE: {test_rmse:.4f}")
    print(f"MAE: {test_mae:.4f}")
    print(f"Directional Accuracy: {test_dir_acc:.2f}%")

    # Plot predictions vs actual for test set
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.index, y_test, label='Actual 10Y Yield', color='blue')
    plt.plot(y_test.index, test_preds, label='Predicted 10Y Yield', color='red', linestyle='--')
    plt.title('Test Set: Actual vs Predicted 10-Year Treasury Yield (Monthly)')
    plt.xlabel('Date')
    plt.ylabel('Yield (%)')
    plt.legend()
    plt.grid(True)
    plt.show()  # Show the plot instead of saving

    # Feature importance plot (using XGBoost model)
    if hasattr(model, 'named_estimators_') and 'xgb' in model.named_estimators_:
        xgb_model = model.named_estimators_['xgb']
        feature_importance = xgb_model.get_booster().get_score(importance_type='gain')
        # Largest gain first, truncated to the top 15
        sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:15]
        plt.figure(figsize=(12, 8))
        plt.barh([x[0] for x in sorted_importance], [x[1] for x in sorted_importance])
        plt.title('Top 15 Feature Importance (Monthly)')
        plt.xlabel('Gain')
        plt.gca().invert_yaxis()  # put the most important feature at the top
        plt.tight_layout()
        plt.show()  # Show the plot instead of saving

    return {
        'train_rmse': train_rmse,
        'train_mae': train_mae,
        'train_dir_acc': train_dir_acc,
        'test_rmse': test_rmse,
        'test_mae': test_mae,
        'test_dir_acc': test_dir_acc
    }


# 6. Forecast Next Month
def forecast_next_month(model, df_features, feature_cols):
    """Predict the target one month after the last row of ``df_features``.

    The last row is passed to ``model.predict`` as a one-row DataFrame so the
    column names travel with the data (a bare NumPy array would drop them).

    Parameters
    ----------
    model
        Fitted estimator with a ``predict`` method.
    df_features : pandas.DataFrame
        Feature frame whose last index entry is the most recent date.
    feature_cols : list of str
        Columns to feed to the model, in the order used for training.

    Returns
    -------
    tuple
        ``(forecast, next_month)``. ``next_month`` is the following month-end
        when the last index entry is itself a month-end (e.g. 04-30 -> 05-31);
        otherwise it is the same day one calendar month later.
    """
    latest_date = df_features.index[-1]

    # Keep a DataFrame (double brackets) rather than .values to preserve feature names
    X_latest = df_features.iloc[[-1]][feature_cols]
    forecast = model.predict(X_latest)[0]

    if getattr(latest_date, 'is_month_end', False):
        # Adding a plain month to a month-end stamp can land mid-month-end
        # (04-30 -> 05-30); step to the next month-end instead.
        next_month = latest_date + pd.offsets.MonthEnd(1)
    else:
        next_month = latest_date + pd.DateOffset(months=1)

    return forecast, next_month


# Main Execution
if __name__ == "__main__":
    # Fetch and resample data to monthly frequency.
    # No key is passed here: fetch_fred_data reads it from the FRED_API_KEY
    # environment variable and raises ValueError when it is missing, so the
    # credential never lives in the notebook source.
    # NOTE(review): a literal key was previously committed on this line; treat it
    # as compromised and rotate it.
    df = fetch_fred_data(resample_freq='M')

    # Create monthly features
    df_features = create_features(df)

    # Prepare data
    X_train, X_test, y_train, y_test, feature_cols = prepare_data(df_features)

    # Adjusted parameter grid for ensemble model (you might need to tune this further)
    # 2 values for each of 8 parameters -> 256 candidates per CV fold
    param_grid = {
        'xgb__n_estimators': [50, 150],
        'xgb__max_depth': [2, 4],
        'xgb__learning_rate': [0.01, 0.05],
        'xgb__subsample': [0.7, 0.9],
        'xgb__reg_alpha': [0.01, 0.1],
        'xgb__reg_lambda': [0.01, 0.1],
        'rf__n_estimators': [50, 150],
        'rf__max_depth': [3, 6]
    }

    # Train ensemble model
    ensemble_model, grid_search = train_ensemble_model(X_train, y_train, param_grid=param_grid)

    # Evaluate model
    metrics = evaluate_model(ensemble_model, X_train, y_train, X_test, y_test)

    # Forecast next month
    forecast, next_month_date = forecast_next_month(ensemble_model, df_features, feature_cols)
    print(f"\nForecasted 10Y Yield for {next_month_date.strftime('%Y-%m-%d')}: {forecast:.4f}%")

Fetching 3_year_yield data...
Fetching 5_year_yield data...
Fetching 7_year_yield data...
Fetching 10_year_yield data...
Fetching 20_year_yield data...
Fetching cpi data...
Fetching fed_funds_rate data...
Fetching vix data...
Fetching 2_year_yield data...
Fetching core_cpi data...
Fetching fed_assets data...
Fetching usd_index data...
Fetching gdp data...
Data fetched and resampled successfully to monthly. Shape: (305, 15)
Training data shape: (185, 51), Test data shape: (47, 51)
Fitting 5 folds for each of 256 candidates, totalling 1280 fits
