# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt

def load_and_preprocess_data(file_path, features, target, context_features):
    """Load a CSV file, fill gaps and min-max scale the model columns.

    Args:
        file_path: Path of the CSV file passed to ``pd.read_csv``.
        features: Column names used as model inputs.
        target: Column names to predict (may overlap with ``features``).
        context_features: Column names used as additional context inputs.

    Returns:
        A tuple ``(df, scaler_target, scaler_context)`` where ``df`` holds the
        scaled columns and the scalers were fitted on the *unscaled* target and
        context columns, so ``inverse_transform`` maps back to original units.
    """
    df = pd.read_csv(file_path)

    # Fill gaps forward first, then backward for any leading NaNs.
    # (``fillna(method=...)`` is deprecated in recent pandas releases.)
    if df.isnull().values.any():
        df = df.ffill().bfill()

    scaler_features = MinMaxScaler()
    scaler_target = MinMaxScaler()
    scaler_context = MinMaxScaler()

    # Fit every scaler on the raw columns BEFORE writing anything back.
    # Column groups may overlap (e.g. target columns that are also features);
    # scaling in place group by group would fit the later scalers on data that
    # is already in [0, 1], making their inverse_transform useless.
    scaled_features = scaler_features.fit_transform(df[features])
    scaled_target = scaler_target.fit_transform(df[target])
    scaled_context = scaler_context.fit_transform(df[context_features])

    df[features] = scaled_features
    df[target] = scaled_target
    df[context_features] = scaled_context

    # NOTE(review): the scalers see the whole file, including rows that later
    # end up in the test split; fit on the training rows only if strict
    # out-of-sample evaluation is required.
    return df, scaler_target, scaler_context

def create_sequences(data, target_data, context_data, n_timesteps):
    """Build flattened sliding windows and the rows that follow each window.

    For every start position ``i`` the sample consists of rows
    ``i .. i + n_timesteps - 1`` of ``data`` flattened into one vector; the
    label and the context vector are row ``i + n_timesteps`` of
    ``target_data`` and ``context_data`` respectively.

    Args:
        data: Feature table (DataFrame or array-like), one row per time step.
        target_data: Target table aligned row-by-row with ``data``.
        context_data: Context table aligned row-by-row with ``data``.
        n_timesteps: Number of past rows in each window (must be >= 0).

    Returns:
        ``(X, y, context)`` as NumPy arrays with one row per window. When
        ``data`` has too few rows for a single window, the arrays are empty
        but keep their second dimension so they can still be concatenated.

    Raises:
        ValueError: If ``n_timesteps`` is negative.
        IndexError: If ``target_data`` or ``context_data`` has fewer rows than
            the windows require.
    """
    if n_timesteps < 0:
        raise ValueError("n_timesteps must be non-negative")

    # Convert once instead of going through pandas indexing on every window.
    feature_values = np.asarray(data)
    target_values = np.asarray(target_data)
    context_values = np.asarray(context_data)

    n_samples = max(len(feature_values) - n_timesteps, 0)

    if n_samples:
        X = np.stack([
            feature_values[start:start + n_timesteps].ravel()
            for start in range(n_samples)
        ])
    else:
        # Keep a well-formed (0, width) array rather than a 1-D empty one.
        window_width = n_timesteps * int(np.prod(feature_values.shape[1:]))
        X = np.empty((0, window_width), dtype=feature_values.dtype)

    # Integer-array indexing raises IndexError on out-of-range rows, so
    # misaligned inputs fail loudly instead of being silently truncated.
    label_rows = np.arange(n_timesteps, n_timesteps + n_samples)
    return X, target_values[label_rows], context_values[label_rows]

def prepare_data(df, features, target, context_features, n_timesteps, test_size=0.2, *, shuffle=False):
    """Turn the preprocessed frame into train/test arrays.

    Each sample is the flattened window of the previous ``n_timesteps`` feature
    rows concatenated with the context columns of the row being predicted.

    Args:
        df: Preprocessed DataFrame containing all referenced columns.
        features: Column names used for the sliding windows.
        target: Column names to predict.
        context_features: Column names appended to every window.
        n_timesteps: Number of past rows per window.
        test_size: Fraction (or count) of samples reserved for testing.
        shuffle: Whether to shuffle before splitting. Defaults to ``False`` so
            the test set is the chronological tail of the series: consecutive
            windows overlap almost entirely, and a random split would leak
            near-duplicates of test samples into training.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X, y, context = create_sequences(df[features], df[target], df[context_features], n_timesteps)

    # Combine past data and context data into one input matrix.
    X_combined = np.concatenate((X, context), axis=1)

    # random_state only matters when shuffle=True.
    X_train, X_test, y_train, y_test = train_test_split(
        X_combined, y, test_size=test_size, shuffle=shuffle, random_state=42
    )
    return X_train, X_test, y_train, y_test

def train_xgboost(X_train, y_train, param_grid=None, early_stopping_rounds=10, validation_fraction=0.2):
    """Grid-search an XGBoost regressor, then refit it with early stopping.

    Args:
        X_train: Training inputs, one row per sample.
        y_train: Training targets aligned with ``X_train``.
        param_grid: Grid passed to ``GridSearchCV``; a default grid is used
            when ``None``.
        early_stopping_rounds: Stop boosting after this many rounds without
            improvement on the validation rows. A falsy value skips the
            early-stopping refit.
        validation_fraction: Share of the training rows (taken from the end)
            held out as the early-stopping validation set.

    Returns:
        The fitted ``XGBRegressor`` with the best grid-search parameters.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'tree_method': ['hist']
        }

    model = xgb.XGBRegressor(objective='reg:squarederror')
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)
    grid_search.fit(X_train, y_train)

    # With the default refit=True this estimator is already trained on all of
    # X_train using the best parameters.
    best_model = grid_search.best_estimator_

    n_rows = len(X_train)
    n_val = int(n_rows * validation_fraction)
    if not early_stopping_rounds or n_val < 1 or n_val >= n_rows:
        # Nothing sensible to validate on; keep the grid-search refit.
        return best_model

    # Early stopping needs data the booster is NOT trained on; monitoring the
    # training set itself almost never triggers a stop. Hold out the tail of
    # the training rows for that purpose.
    split_at = n_rows - n_val
    X_fit, X_val = X_train[:split_at], X_train[split_at:]
    y_fit, y_val = y_train[:split_at], y_train[split_at:]

    # Since xgboost 1.6 these are estimator parameters; passing them to fit()
    # was deprecated and later removed.
    best_model.set_params(early_stopping_rounds=early_stopping_rounds, eval_metric="rmse")
    best_model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=True)

    return best_model

def evaluate_xgboost(model, X_test, y_test, scaler_target):
    """Score the model on the test set in the target's original units.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test inputs.
        y_test: Scaled test targets, one row per sample.
        scaler_target: Fitted scaler whose ``inverse_transform`` undoes the
            target scaling.

    Returns:
        ``(y_pred_original_scale, y_test_original_scale)`` as 2-D arrays with
        one column per target.
    """
    y_test = np.asarray(y_test)
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)

    # Match the (n_samples, n_targets) layout of y_test. A blanket
    # reshape(-1, 1) would stack multi-target predictions into one column,
    # which no longer lines up with the scaler or with y_test.
    y_pred = np.asarray(model.predict(X_test)).reshape(y_test.shape)

    y_pred_original_scale = scaler_target.inverse_transform(y_pred)
    y_test_original_scale = scaler_target.inverse_transform(y_test)

    mse = mean_squared_error(y_test_original_scale, y_pred_original_scale)
    r2 = r2_score(y_test_original_scale, y_pred_original_scale)

    print(f'Mean Squared Error: {mse}')
    print(f'R^2 Score: {r2}')

    return y_pred_original_scale, y_test_original_scale

def plot_results(y_test, y_pred):
    """Plot the true values and the predictions on one figure and show it."""
    plt.figure(figsize=(10, 6))

    # Draw both series in the same order as before: truth first, then model.
    for series, series_label in ((y_test, 'True Values'), (y_pred, 'Predictions')):
        plt.plot(series, label=series_label)

    plt.title('True Values vs Predictions')
    plt.xlabel('Samples')
    plt.ylabel('Values')
    plt.legend()
    plt.show()

def main(file_path, features, target, context_features, n_timesteps, early_stopping_rounds):
    """Run the pipeline end to end: load, split, train, evaluate and plot."""
    # Load and scale; the context scaler is returned but not needed here.
    frame, target_scaler, _context_scaler = load_and_preprocess_data(
        file_path, features, target, context_features
    )

    # Build windowed samples and split them into train/test sets.
    train_inputs, test_inputs, train_labels, test_labels = prepare_data(
        frame, features, target, context_features, n_timesteps
    )

    # Hyperparameter search followed by the early-stopping fit.
    fitted_model = train_xgboost(
        train_inputs, train_labels, early_stopping_rounds=early_stopping_rounds
    )

    # Report metrics and get both series back in original units.
    predictions, actuals = evaluate_xgboost(
        fitted_model, test_inputs, test_labels, target_scaler
    )

    plot_results(actuals, predictions)

# Pipeline parameters
file_path = 'time_series_data.csv'
# Columns whose past n_timesteps rows form each input window.
features = ['rate_level_1', 'rate_level_2', 'days_to_end_of_month', 'days_to_ECB_meeting', 'days_to_Fed_meeting', 'ois_sofr_rate']
# Columns to predict; they also appear in `features` as lagged inputs.
target = ['rate_level_1', 'rate_level_2']
# Columns taken from the row being predicted and appended to each window.
context_features = ['stock_price', 'fx_rate', 'commodity_price']
n_timesteps = 12
early_stopping_rounds = 10  # Number of rounds for early stopping

# Run the pipeline only when executed as a script (or notebook cell), not
# when this module is imported.
if __name__ == "__main__":
    main(file_path, features, target, context_features, n_timesteps, early_stopping_rounds)