In [2]:
import pandas as pd
import numpy as np
from pandas_datareader import data as pdr
import yfinance as yf
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')


In [3]:

# Set random seed for reproducibility
np.random.seed(42)

# 1. Data Sourcing from FRED
def fetch_fred_data(start_date='2000-01-01', end_date=datetime.today().strftime('%Y-%m-%d')):
    """Fetch Treasury yield data from FRED."""
    series = {
        'DGS3': '3_year_yield',
        'DGS5': '5_year_yield',
        'DGS7': '7_year_yield',
        'DGS10': '10_year_yield',
        'DGS20': '20_year_yield',
        'CPIAUCSL': 'cpi',  # Consumer Price Index
        'FEDFUNDS': 'fed_funds',  # Federal Funds Rate
        'VIXCLS': 'vix'  # VIX Index
    }
    df = pdr.get_data_fred(list(series.keys()), start_date, end_date)
    df = df.rename(columns=series)
    df = df.dropna()  # Drop rows with any missing values
    return df


In [5]:
# 2. Feature Engineering
def create_features(df, lags=5):
    """Generate lagged yields, term spreads, and rolling metrics."""
    df_features = df.copy()
    
    # Lagged yield values
    for col in ['3_year_yield', '5_year_yield', '7_year_yield', '10_year_yield', '20_year_yield']:
        for lag in range(1, lags + 1):
            df_features[f'{col}_lag{lag}'] = df_features[col].shift(lag)
    
    # Term spreads
    df_features['spread_10y_3y'] = df_features['10_year_yield'] - df_features['3_year_yield']
    df_features['spread_20y_10y'] = df_features['20_year_yield'] - df_features['10_year_yield']
    df_features['spread_20y_3y'] = df_features['20_year_yield'] - df_features['3_year_yield']
    
    # Rolling metrics (7-day and 30-day)
    for col in ['10_year_yield', 'spread_10y_3y']:
        df_features[f'{col}_ma7'] = df_features[col].rolling(window=7).mean()
        df_features[f'{col}_ma30'] = df_features[col].rolling(window=30).mean()
        df_features[f'{col}_vol7'] = df_features[col].rolling(window=7).std()
        df_features[f'{col}_vol30'] = df_features[col].rolling(window=30).std()
    
    # Drop rows with NaN values due to lagging/rolling
    df_features = df_features.dropna()
    return df_features

In [8]:

# 3. Prepare Data for Modeling
def prepare_data(df_features, target_col='10_year_yield'):
    """Split features and target, create train/test sets."""
    # Target: Next day's 10-year yield
    df_features['target'] = df_features[target_col].shift(-1)
    df_features = df_features.dropna()
    
    # Features: All columns except target and original macro inputs
    feature_cols = [col for col in df_features.columns if col not in ['target', 'cpi', 'fed_funds', 'vix']]
    X = df_features[feature_cols]
    y = df_features['target']
    
    return X, y

In [10]:
# 4. Model Training and Evaluation
def train_model(X, y, param_grid=None):
    """Train XGBRegressor with TimeSeriesSplit and hyperparameter tuning."""
    model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
    
    # Default parameter grid if none provided
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.1],
            'subsample': [0.8, 1.0]
        }
    
    # Time-series cross-validation
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Grid search for hyperparameter tuning
    grid_search = GridSearchCV(
        model, param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1
    )
    grid_search.fit(X, y)
    
    # Best model
    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")
    
    return best_model, grid_search


In [12]:

# 5. Model Evaluation
def evaluate_model(model, X, y):
    """Evaluate model performance with RMSE, MAE, and directional accuracy."""
    predictions = model.predict(X)
    
    # Metrics
    rmse = np.sqrt(mean_squared_error(y, predictions))
    mae = mean_absolute_error(y, predictions)
    
    # Directional accuracy
    actual_diff = np.sign(y - y.shift(1).fillna(0))
    pred_diff = np.sign(pd.Series(predictions, index=y.index) - y.shift(1).fillna(0))
    directional_accuracy = np.mean(actual_diff == pred_diff) * 100
    
    # Print metrics
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Directional Accuracy: {directional_accuracy:.2f}%")
    
    # Plot predictions vs actual
    plt.figure(figsize=(12, 6))
    plt.plot(y.index, y, label='Actual 10Y Yield', color='blue')
    plt.plot(y.index, predictions, label='Predicted 10Y Yield', color='red', linestyle='--')
    plt.title('Actual vs Predicted 10-Year Treasury Yield')
    plt.xlabel('Date')
    plt.ylabel('Yield (%)')
    plt.legend()
    plt.grid(True)
    plt.savefig('yield_predictions.png')
    plt.close()
    
    # Feature importance plot
    feature_importance = model.get_booster().get_score(importance_type='gain')
    sorted_importance = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)[:10]
    plt.figure(figsize=(10, 6))
    plt.barh([x[0] for x in sorted_importance], [x[1] for x in sorted_importance])
    plt.title('Top 10 Feature Importance')
    plt.xlabel('Gain')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()
    
    return {'rmse': rmse, 'mae': mae, 'directional_accuracy': directional_accuracy}

In [14]:
# 6. Forecasting Function
def forecast_next_day(model, df_features, latest_date):
    """Generate next-day forecast based on latest data."""
    latest_data = df_features.loc[latest_date].copy()
    feature_cols = [col for col in df_features.columns if col not in ['target', 'cpi', 'fed_funds', 'vix']]
    X_latest = latest_data[feature_cols].values.reshape(1, -1)
    forecast = model.predict(X_latest)[0]
    print(f"Forecasted 10Y Yield for {latest_date + timedelta(days=1)}: {forecast:.4f}%")
    return forecast

In [16]:
# Main Execution
if __name__ == "__main__":
    # Fetch data
    df = fetch_fred_data()
    
    # Create features
    df_features = create_features(df)
    
    # Prepare data
    X, y = prepare_data(df_features)
    
    # Train model
    model, grid_search = train_model(X, y)
    
    # Evaluate model
    metrics = evaluate_model(model, X, y)
    
    # Generate next-day forecast
    latest_date = df_features.index[-1]
    forecast = forecast_next_day(model, df_features, latest_date)
    
    # Save performance report
    with open('performance_report.txt', 'w') as f:
        f.write("10-Year Treasury Yield Forecast Performance Report\n")
        f.write(f"Date: {datetime.today().strftime('%Y-%m-%d')}\n\n")
        f.write(f"Best Parameters: {grid_search.best_params_}\n")
        f.write(f"RMSE: {metrics['rmse']:.4f}\n")
        f.write(f"MAE: {metrics['mae']:.4f}\n")
        f.write(f"Directional Accuracy: {metrics['directional_accuracy']:.2f}%\n")
        f.write(f"Forecast for {latest_date + timedelta(days=1)}: {forecast:.4f}%\n")
        f.write("\nPlots saved: yield_predictions.png, feature_importance.png")

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}
RMSE: 0.0010
MAE: 0.0008
Directional Accuracy: 98.18%
Forecasted 10Y Yield for 2025-04-02 00:00:00: 3.9150%
