In [None]:
"""
This script builds a final, advanced stacked ensemble model for the GoQuant competition.

This version uses a non-linear meta-model to intelligently combine predictions
and includes a full suite of robust features.
"""

import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr

# --- Input / output locations (absolute paths on the author's machine) ---
ETH_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\ETH.csv"
ETH_TEST_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\test\ETH.csv"
BTC_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\BTC.csv"
SUB_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission.csv"
OUT_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission_final.csv"

# --- Base model: LightGBM regressor keyword arguments ---
LGBM_PARAMS = {
    "objective": "regression_l2",
    "learning_rate": 0.02,
    "num_leaves": 64,
    "min_child_samples": 100,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "n_estimators": 10000,  # upper bound; the CV loop stops earlier via early stopping
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}

# --- Base model: CatBoost regressor keyword arguments ---
CATBOOST_PARAMS = {
    "iterations": 10000,
    "learning_rate": 0.02,
    "depth": 8,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "verbose": 0,
    "early_stopping_rounds": 400,
}

# --- Meta-model: small LightGBM that combines the base-model predictions ---
META_MODEL_PARAMS = {
    "objective": "regression_l2",
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "num_leaves": 16,
    "random_state": 42,
    "n_jobs": -1,
}


def create_features(df: pd.DataFrame, is_primary_asset=True):
    """Return a copy of ``df`` with order-book features appended.

    Always added: ``wap`` (level-1 volume-cross-weighted price), ``log_return``
    (log of the row-over-row ``wap`` ratio) and ``realized_vol_100`` (rolling
    100-row standard deviation of ``log_return``).

    Added only when ``is_primary_asset`` is true: ``timestamp_dt``, ``hour``,
    ``hour_sin``, ``hour_cos``, ``bid_ask_spread``, ``total_volume``,
    ``full_obi`` and ``wap_diff_10``.

    Args:
        df: Frame with ``bid_price1``/``ask_price1``/``bid_volume1``/
            ``ask_volume1``; the primary-asset branch also reads ``timestamp``
            (parsed with ``unit='s'``) and ``bid_volume1..5``/``ask_volume1..5``.
        is_primary_asset: Whether to compute the extended feature set.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()

    if is_primary_asset:
        # Hour of day, plus a sine/cosine encoding of it over a 24-hour cycle.
        out['timestamp_dt'] = pd.to_datetime(out['timestamp'], unit='s')
        out['hour'] = out['timestamp_dt'].dt.hour
        hour_angle = 2 * np.pi * out['hour'] / 24.0
        out['hour_sin'] = np.sin(hour_angle)
        out['hour_cos'] = np.cos(hour_angle)

    best_bid, best_ask = out['bid_price1'], out['ask_price1']
    bid_qty, ask_qty = out['bid_volume1'], out['ask_volume1']
    # Each side's price is weighted by the opposite side's volume.
    out['wap'] = ((best_bid * ask_qty) + (best_ask * bid_qty)) / (bid_qty + ask_qty)
    out['log_return'] = np.log(out['wap'] / out['wap'].shift(1))

    if is_primary_asset:
        # Depth summed over the five visible book levels on each side.
        bid_depth = out[[f"bid_volume{level}" for level in range(1, 6)]].sum(axis=1)
        ask_depth = out[[f"ask_volume{level}" for level in range(1, 6)]].sum(axis=1)
        out['bid_ask_spread'] = best_ask - best_bid
        out['total_volume'] = bid_depth + ask_depth
        out['full_obi'] = (bid_depth - ask_depth) / out['total_volume']
        out['wap_diff_10'] = out['wap'].diff(10)

    out['realized_vol_100'] = out['log_return'].rolling(window=100).std()
    return out


def load_and_prepare_data(eth_train_path, eth_test_path, btc_train_path, sub_path):
    """Load the ETH/BTC CSVs and build the train/test model matrices.

    Pipeline: read the four CSVs, parse ``timestamp`` to datetimes (unparseable
    values become NaT and those rows are dropped from train/test/BTC), average
    rows that share a timestamp, convert timestamps to integer Unix seconds,
    engineer features with ``create_features``, left-join BTC ``log_return``
    and ``realized_vol_100`` on ``timestamp``, add a 100-row rolling ETH/BTC
    return correlation, and finally replace infinities / fill NaNs.

    Args:
        eth_train_path: CSV with the ETH training rows (must contain ``label``).
        eth_test_path: CSV with the ETH test rows.
        btc_train_path: CSV with the BTC rows used for cross-asset features.
        sub_path: CSV with the submission template.

    Returns:
        ``(X, y, X_test, submission_df)`` where ``X``/``X_test`` share the same
        feature columns, ``y`` is the ``label`` column (NaNs filled with its
        mean) and ``submission_df`` is the template with ``timestamp`` parsed
        to datetimes when that column exists.
    """
    # Load all raw data
    train_df = pd.read_csv(eth_train_path)
    test_df = pd.read_csv(eth_test_path)
    btc_df = pd.read_csv(btc_train_path)
    submission_df = pd.read_csv(sub_path)
    print(f"Loaded raw data: train={train_df.shape}, test={test_df.shape}, btc={btc_df.shape}")

    print("Converting timestamps from text to a numerical format...")
    for df in [train_df, test_df, btc_df, submission_df]:
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Drop any rows where timestamp conversion failed
    train_df = train_df.dropna(subset=['timestamp'])
    test_df = test_df.dropna(subset=['timestamp'])
    btc_df = btc_df.dropna(subset=['timestamp'])

    # Aggregate by timestamp so each timestamp appears once; duplicate keys
    # would otherwise multiply rows in the merges below.
    # NOTE(review): if the test file has duplicate or unparseable timestamps,
    # X_test ends up with fewer rows than the submission template - confirm.
    print("Aggregating data to ensure unique timestamps...")
    train_df = train_df.groupby('timestamp').mean().reset_index()
    test_df = test_df.groupby('timestamp').mean().reset_index()
    btc_df = btc_df.groupby('timestamp').mean().reset_index()

    # Convert the clean datetime objects to Unix timestamp integers.
    for df in [train_df, test_df, btc_df]:
        df['timestamp'] = df['timestamp'].astype(np.int64) // 10**9

    print("Timestamp conversion and de-duplication complete.")

    train_featured = create_features(train_df, is_primary_asset=True)
    test_featured = create_features(test_df, is_primary_asset=True)
    btc_featured = create_features(btc_df, is_primary_asset=False)

    # Merge cross-asset features.
    # NOTE(review): the BTC features come from the BTC *training* file only, so
    # test rows get them only where timestamps overlap; the rest are NaN until
    # the fill step below - confirm this is intended.
    btc_features_to_merge = btc_featured[['timestamp', 'log_return', 'realized_vol_100']].rename(
        columns={'log_return': 'log_return_btc', 'realized_vol_100': 'realized_vol_100_btc'}
    )
    train_featured = pd.merge(train_featured, btc_features_to_merge, on='timestamp', how='left')
    test_featured = pd.merge(test_featured, btc_features_to_merge, on='timestamp', how='left')

    # rolling correlation
    train_featured['eth_btc_corr_100'] = (
        train_featured['log_return'].rolling(window=100).corr(train_featured['log_return_btc'])
    )
    test_featured['eth_btc_corr_100'] = (
        test_featured['log_return'].rolling(window=100).corr(test_featured['log_return_btc'])
    )
    print("Feature engineering and merging complete.")

    # Prepare Final Model Matrices
    TARGET_COL = 'label'
    features_to_drop = ['timestamp', 'timestamp_dt', 'hour', TARGET_COL, 'log_return', 'log_return_btc']
    MODEL_FEATURES = [col for col in train_featured.columns if col not in features_to_drop]

    def _clean(frame):
        # Non-inplace chain: returns a fresh frame, so no chained-assignment
        # warning, and avoids the deprecated ``fillna(method=...)`` form.
        return frame.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    X = _clean(train_featured[MODEL_FEATURES])
    X_test = _clean(test_featured[MODEL_FEATURES])
    y = train_featured[TARGET_COL]
    y = y.fillna(y.mean())

    print(f"Data prepared for training with {len(MODEL_FEATURES)} features.")

    return X, y, X_test, submission_df
X, y, X_test, submission_df = load_and_prepare_data(
    ETH_TRAIN_PATH, ETH_TEST_PATH, BTC_TRAIN_PATH, SUB_PATH
)

# ---- Cross-validation of the base models ----
time_series_splitter = TimeSeriesSplit(n_splits=5)
lgbm_oof_preds = np.zeros(len(X))
cat_oof_preds = np.zeros(len(X))
# TimeSeriesSplit never places the earliest chunk of rows in a validation
# fold, so those rows keep the 0.0 placeholder. Track which rows actually
# received an out-of-fold prediction and use only those downstream.
oof_mask = np.zeros(len(X), dtype=bool)
# Trees kept by early stopping in each fold; reused to size the final models.
lgbm_best_iters = []
cat_best_iters = []

print("\nStarting cross-validation for base models...")
for fold, (train_idx, val_idx) in enumerate(time_series_splitter.split(X), 1):
    print(f"Fold {fold} ")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # NOTE(review): the validation fold both drives early stopping and is the
    # fold being scored, so the OOF metrics are mildly optimistic.
    lgbm = lgb.LGBMRegressor(**LGBM_PARAMS)
    lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(400, verbose=False)])
    lgbm_oof_preds[val_idx] = lgbm.predict(X_val)
    lgbm_best_iters.append(lgbm.best_iteration_ or LGBM_PARAMS['n_estimators'])

    cat = CatBoostRegressor(**CATBOOST_PARAMS)
    cat.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    cat_oof_preds[val_idx] = cat.predict(X_val)
    cat_best = cat.get_best_iteration()  # zero-based, or None if unavailable
    cat_best_iters.append(CATBOOST_PARAMS['iterations'] if cat_best is None else cat_best + 1)

    oof_mask[val_idx] = True


print("\n Training Stacking Meta-Model ")
meta_features_train = pd.DataFrame({'lgbm_pred': lgbm_oof_preds, 'cat_pred': cat_oof_preds})
meta_train_oof = meta_features_train[oof_mask]
y_oof = y[oof_mask]

# Use a simple but non-linear model to combine predictions; fit only on rows
# that carry genuine out-of-fold predictions.
meta_model = lgb.LGBMRegressor(**META_MODEL_PARAMS)
meta_model.fit(meta_train_oof, y_oof)

# Evaluate all models on the out-of-fold rows.
corr_lgbm_oof, _ = pearsonr(y_oof, lgbm_oof_preds[oof_mask])
corr_cat_oof, _ = pearsonr(y_oof, cat_oof_preds[oof_mask])
# NOTE: the meta-model is scored on the rows it was fitted on, so the stacked
# figure is an in-sample number, not a true cross-validated one.
stacked_preds_oof = meta_model.predict(meta_train_oof)
corr_stacked_oof, _ = pearsonr(y_oof, stacked_preds_oof)

print(f"LightGBM CV Pearson : {corr_lgbm_oof:.6f}")
print(f"CatBoost CV Pearson : {corr_cat_oof:.6f}")
print(f"Stacked CV Pearson  : {corr_stacked_oof:.6f}")


# ==============================================================================
#  5. Final Prediction
# ==============================================================================
print("\nTraining final base models on all data")
# There is no validation set when refitting on all data, so early stopping
# cannot apply; size the final models with the average best iteration from CV
# instead of growing all 10,000 trees.
final_lgbm_params = {**LGBM_PARAMS, 'n_estimators': max(1, int(np.mean(lgbm_best_iters)))}
final_lgbm = lgb.LGBMRegressor(**final_lgbm_params)
final_lgbm.fit(X, y)
lgbm_final_preds = final_lgbm.predict(X_test)

final_cat_params = {k: v for k, v in CATBOOST_PARAMS.items() if k != 'early_stopping_rounds'}
final_cat_params['iterations'] = max(1, int(np.mean(cat_best_iters)))
final_cat = CatBoostRegressor(**final_cat_params)
final_cat.fit(X, y)
cat_final_preds = final_cat.predict(X_test)

print("Generating final predictions with the trained meta-model")
meta_features_test = pd.DataFrame({'lgbm_pred': lgbm_final_preds, 'cat_pred': cat_final_preds})
final_predictions = meta_model.predict(meta_features_test)

# Fail with a clear message if the de-duplicated test set no longer lines up
# with the submission template.
if len(final_predictions) != len(submission_df):
    raise ValueError(
        f"Prediction count ({len(final_predictions)}) does not match "
        f"submission rows ({len(submission_df)})."
    )
submission_df['labels'] = final_predictions

try:
    submission_df.to_csv(OUT_PATH, index=False)
    print(f"\nStacked ensemble submission successfully saved to: {OUT_PATH}")
except OSError as e:
    print(f"Error saving submission file: {e}")

Loaded raw data: train=(631292, 23), test=(270548, 22), btc=(631292, 22)
Converting timestamps from text to a numerical format...


  df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')


Aggregating data to ensure unique timestamps...
Timestamp conversion and de-duplication complete.
Feature engineering and merging complete.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.replace([np.inf, -np.inf], np.nan, inplace=True)
  df.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method='bfill', inplace=True)


Data prepared for training with 31 features.

Starting cross-validation for base models...
--- Fold 1 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6943
[LightGBM] [Info] Number of data points in the train set: 105217, number of used features: 30
[LightGBM] [Info] Start training from score 0.000063
--- Fold 2 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011012 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 6953
[LightGBM] [Info] Number of data points in the train set: 210432, number of used features: 30
[LightGBM] [Info] Start training from score 0.000060
--- Fold 3 ---
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020313 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info

In [None]:
"""
This is the final, state-of-the-art script for the GoQuant assignment.

This version integrates a Long Short-Term Memory (LSTM) neural network into the
stacking ensemble, combining the strengths of tree-based models and sequence models.
"""

import os
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.model_selection import TimeSeriesSplit
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

# ==============================================================================
#  1. Configuration
# ==============================================================================
# Input / output locations (absolute paths on the author's machine).
ETH_TRAIN_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\train\ETH.csv"
ETH_TEST_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\test\ETH.csv"
SUB_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission.csv"
OUT_PATH = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission_final.csv"

# LightGBM base-model keyword arguments.
LGBM_PARAMS = {
    "objective": "regression_l2",
    "learning_rate": 0.02,
    "num_leaves": 64,
    "min_child_samples": 100,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "n_estimators": 10000,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}

# CatBoost base-model keyword arguments.
CATBOOST_PARAMS = {
    "iterations": 10000,
    "learning_rate": 0.02,
    "depth": 8,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "random_seed": 42,
    "verbose": 0,
    "early_stopping_rounds": 400,
}

# LightGBM meta-model keyword arguments (combines the base predictions).
META_MODEL_PARAMS = {
    "objective": "regression_l2",
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "num_leaves": 16,
    "random_state": 42,
    "n_jobs": -1,
}

# LSTM settings.
SEQUENCE_LENGTH = 60  # number of consecutive rows fed to the LSTM per sample
LSTM_EPOCHS = 20
LSTM_BATCH_SIZE = 256

# ==============================================================================
#  2. Data Preparation Pipeline
# ==============================================================================
def create_features(df: pd.DataFrame):
    """Return a copy of ``df`` with time-of-day and order-book features added.

    New columns, in order: ``timestamp_dt``, ``hour``, ``hour_sin``,
    ``hour_cos``, ``wap``, ``log_return``, ``bid_ask_spread``,
    ``total_volume``, ``full_obi``, ``wap_diff_10``, ``realized_vol_100``.

    Args:
        df: Frame with ``timestamp`` (parsed with ``unit='s'``),
            ``bid_price1``/``ask_price1`` and ``bid_volume1..5``/
            ``ask_volume1..5``.

    Returns:
        A new DataFrame; the input is not modified.
    """
    out = df.copy()

    # Hour of day, plus a sine/cosine encoding of it over a 24-hour cycle.
    out['timestamp_dt'] = pd.to_datetime(out['timestamp'], unit='s')
    out['hour'] = out['timestamp_dt'].dt.hour
    hour_angle = 2 * np.pi * out['hour'] / 24.0
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)

    best_bid, best_ask = out['bid_price1'], out['ask_price1']
    bid_qty, ask_qty = out['bid_volume1'], out['ask_volume1']
    # Each side's price is weighted by the opposite side's volume.
    out['wap'] = (best_bid * ask_qty + best_ask * bid_qty) / (bid_qty + ask_qty)
    out['log_return'] = np.log(out['wap'] / out['wap'].shift(1))
    out['bid_ask_spread'] = best_ask - best_bid

    # Depth summed over the five visible book levels on each side.
    bid_depth = out[[f"bid_volume{level}" for level in range(1, 6)]].sum(axis=1)
    ask_depth = out[[f"ask_volume{level}" for level in range(1, 6)]].sum(axis=1)
    out['total_volume'] = bid_depth + ask_depth
    out['full_obi'] = (bid_depth - ask_depth) / out['total_volume']

    out['wap_diff_10'] = out['wap'].diff(10)
    out['realized_vol_100'] = out['log_return'].rolling(window=100).std()
    return out

def load_and_prepare_data(paths):
    """Load the ETH CSVs and build the train/test model matrices.

    Both the train and test frames get the same treatment: ``timestamp`` is
    parsed to datetimes (unparseable values become NaT and are dropped), rows
    sharing a timestamp are averaged, and the timestamp is converted to integer
    Unix seconds. Features are then engineered with ``create_features`` and
    infinities / NaNs are cleaned.

    Args:
        paths: Mapping with keys ``'eth_train'``, ``'eth_test'`` and ``'sub'``
            pointing at the corresponding CSV files.

    Returns:
        ``(X, y, X_test, submission_df)`` where ``X``/``X_test`` share the same
        feature columns, ``y`` is the ``label`` column (NaNs filled with its
        mean) and ``submission_df`` is the submission template as read.

    Raises:
        ValueError: If the training feature matrix ends up empty.
    """
    train_df = pd.read_csv(paths['eth_train'])
    test_df = pd.read_csv(paths['eth_test'])
    submission_df = pd.read_csv(paths['sub'])
    print(f"Loaded raw data: train={train_df.shape}, test={test_df.shape}")

    def _parse_and_dedupe(frame):
        # One row per distinct timestamp, expressed as integer Unix seconds.
        frame['timestamp'] = pd.to_datetime(frame['timestamp'], errors='coerce')
        frame = frame.dropna(subset=['timestamp'])
        frame = frame.groupby('timestamp').mean().reset_index()
        frame['timestamp'] = frame['timestamp'].astype(np.int64) // 10**9
        return frame

    # NOTE(review): if the test file has duplicate or unparseable timestamps,
    # X_test ends up with fewer rows than the submission template - confirm.
    train_df = _parse_and_dedupe(train_df)
    test_df = _parse_and_dedupe(test_df)
    print("Timestamp conversion and de-duplication complete.")

    train_featured = create_features(train_df)
    test_featured = create_features(test_df)
    print("Feature engineering complete.")

    TARGET_COL = 'label'
    features_to_drop = ['timestamp', 'timestamp_dt', 'hour', TARGET_COL, 'log_return']
    MODEL_FEATURES = [col for col in train_featured.columns if col not in features_to_drop]

    def _clean(frame):
        # Non-inplace chain: returns a fresh frame, so no chained-assignment
        # warning, and avoids the deprecated ``fillna(method=...)`` form.
        return frame.replace([np.inf, -np.inf], np.nan).ffill().bfill()

    X = _clean(train_featured[MODEL_FEATURES])
    X_test = _clean(test_featured[MODEL_FEATURES])
    y = train_featured[TARGET_COL]
    y = y.fillna(y.mean())

    if X.empty:
        raise ValueError("Dataframe X is empty after preparation.")

    print(f"Data prepared for training with {len(MODEL_FEATURES)} features.")
    return X, y, X_test, submission_df

def create_sequences(X, y, sequence_length=60):
    """Build overlapping fixed-length windows for a sequence model.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of ``X`` and is
    paired with the target at row ``i + sequence_length`` (the row immediately
    after the window), giving ``len(X) - sequence_length`` samples.

    Args:
        X: 2-D feature table (DataFrame or array), one row per time step.
        y: 1-D targets aligned row-for-row with ``X``.
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        ``(X_seq, y_seq)`` with shapes
        ``(n_samples, sequence_length, n_features)`` and ``(n_samples,)``.
        When ``X`` has ``sequence_length`` rows or fewer, two empty 1-D arrays
        are returned.

    Raises:
        ValueError: If ``sequence_length`` is less than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer.")

    features = np.asarray(X)
    targets = np.asarray(y)
    n_samples = len(features) - sequence_length
    if n_samples <= 0:
        return np.array([]), np.array([])

    # sliding_window_view yields (n_windows, n_features, sequence_length) as a
    # strided view; this replaces a per-row Python loop over DataFrame slices.
    windows = np.lib.stride_tricks.sliding_window_view(features, sequence_length, axis=0)
    # The last window has no following target row, so drop it; then move the
    # time axis to the middle and materialize an owning, writable array.
    X_seq = np.ascontiguousarray(windows[:n_samples].transpose(0, 2, 1))
    y_seq = targets[sequence_length:].copy()
    return X_seq, y_seq

paths = {'eth_train': ETH_TRAIN_PATH, 'eth_test': ETH_TEST_PATH, 'sub': SUB_PATH}
X, y, X_test, submission_df = load_and_prepare_data(paths)

# ==============================================================================
#  3. Cross-Validation & Stacking
# ==============================================================================
time_series_splitter = TimeSeriesSplit(n_splits=5)
lgbm_oof_preds = np.zeros(len(X))
cat_oof_preds = np.zeros(len(X))
lstm_oof_preds = np.zeros(len(X))
# Explicit record of which rows received out-of-fold predictions. The earliest
# chunk is never validated by TimeSeriesSplit, and the LSTM additionally has no
# prediction for the first SEQUENCE_LENGTH rows of each validation fold.
tree_oof_mask = np.zeros(len(X), dtype=bool)
lstm_oof_mask = np.zeros(len(X), dtype=bool)
# Trees kept by early stopping in each fold; reused to size the final models.
lgbm_best_iters = []
cat_best_iters = []

print("\nStarting cross-validation for base models...")
for fold, (train_idx, val_idx) in enumerate(time_series_splitter.split(X), 1):
    print(f"--- Fold {fold} ---")
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # --- Train LightGBM & CatBoost ---
    # NOTE(review): the validation fold both drives early stopping and is the
    # fold being scored, so the OOF metrics are mildly optimistic.
    print("  Training tree models...")
    lgbm = lgb.LGBMRegressor(**LGBM_PARAMS)
    lgbm.fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(400, verbose=False)])
    lgbm_oof_preds[val_idx] = lgbm.predict(X_val)
    lgbm_best_iters.append(lgbm.best_iteration_ or LGBM_PARAMS['n_estimators'])

    cat = CatBoostRegressor(**CATBOOST_PARAMS)
    cat.fit(X_train, y_train, eval_set=[(X_val, y_val)])
    cat_oof_preds[val_idx] = cat.predict(X_val)
    cat_best = cat.get_best_iteration()  # zero-based, or None if unavailable
    cat_best_iters.append(CATBOOST_PARAMS['iterations'] if cat_best is None else cat_best + 1)
    tree_oof_mask[val_idx] = True

    # --- Train LSTM ---
    # NOTE(review): features are fed to the LSTM unscaled - confirm intended.
    print("  Preparing sequences for LSTM...")
    X_train_seq, y_train_seq = create_sequences(X_train, y_train, SEQUENCE_LENGTH)
    X_val_seq, y_val_seq = create_sequences(X_val, y_val, SEQUENCE_LENGTH)

    print("  Training LSTM...")
    lstm_model = Sequential([
        Input(shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
        LSTM(64, return_sequences=False),
        Dense(1)
    ])
    lstm_model.compile(optimizer='adam', loss='mse')
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    lstm_model.fit(X_train_seq, y_train_seq,
                   validation_data=(X_val_seq, y_val_seq),
                   epochs=LSTM_EPOCHS,
                   batch_size=LSTM_BATCH_SIZE,
                   callbacks=[early_stop],
                   verbose=0)

    # Sequence k targets validation row k + SEQUENCE_LENGTH, so the first
    # SEQUENCE_LENGTH rows of the fold get no LSTM prediction.
    lstm_preds = lstm_model.predict(X_val_seq).flatten()
    lstm_val_idx = val_idx[SEQUENCE_LENGTH:]
    lstm_oof_preds[lstm_val_idx] = lstm_preds
    lstm_oof_mask[lstm_val_idx] = True


print("\n--- Training Stacking Meta-Model ---")
meta_features_train = pd.DataFrame({
    'lgbm_pred': lgbm_oof_preds,
    'cat_pred': cat_oof_preds,
    'lstm_pred': lstm_oof_preds
})
# Keep only rows where every base model produced a real out-of-fold
# prediction; a value-based test (pred != 0) would also keep rows whose LSTM
# column is still the zero placeholder.
oof_mask = tree_oof_mask & lstm_oof_mask
meta_model = lgb.LGBMRegressor(**META_MODEL_PARAMS)
meta_model.fit(meta_features_train[oof_mask], y[oof_mask])

# ==============================================================================
#  4. Evaluation & Final Prediction
# ==============================================================================
y_for_oof_eval = y[oof_mask]
lgbm_oof_eval = lgbm_oof_preds[oof_mask]
cat_oof_eval = cat_oof_preds[oof_mask]
lstm_oof_eval = lstm_oof_preds[oof_mask]

# NOTE: the meta-model is scored on the rows it was fitted on, so the stacked
# figure is an in-sample number, not a true cross-validated one.
stacked_preds_oof = meta_model.predict(meta_features_train[oof_mask])

corr_lgbm_oof, _ = pearsonr(y_for_oof_eval, lgbm_oof_eval)
corr_cat_oof, _ = pearsonr(y_for_oof_eval, cat_oof_eval)
corr_lstm_oof, _ = pearsonr(y_for_oof_eval, lstm_oof_eval)
corr_stacked_oof, _ = pearsonr(y_for_oof_eval, stacked_preds_oof)

print("\n-------------------------------------------")
print(f"LightGBM CV Pearson : {corr_lgbm_oof:.6f}")
print(f"CatBoost CV Pearson : {corr_cat_oof:.6f}")
print(f"LSTM CV Pearson     : {corr_lstm_oof:.6f}")
print(f"Stacked CV Pearson  : {corr_stacked_oof:.6f} ✨")
print("-------------------------------------------")

# --- Final Model Training ---
print("\nTraining final base models on all data...")
# There is no validation set when refitting on all data, so early stopping
# cannot apply; size the final models with the average best iteration from CV
# instead of growing all 10,000 trees.
final_lgbm_params = {**LGBM_PARAMS, 'n_estimators': max(1, int(np.mean(lgbm_best_iters)))}
final_lgbm = lgb.LGBMRegressor(**final_lgbm_params).fit(X, y)
lgbm_final_preds = final_lgbm.predict(X_test)

final_cat_params = {k: v for k, v in CATBOOST_PARAMS.items() if k != 'early_stopping_rounds'}
final_cat_params['iterations'] = max(1, int(np.mean(cat_best_iters)))
final_cat = CatBoostRegressor(**final_cat_params).fit(X, y)
cat_final_preds = final_cat.predict(X_test)

print("Training final LSTM on recent data...")
X_train_seq_final, y_train_seq_final = create_sequences(X, y, SEQUENCE_LENGTH)
final_lstm_model = Sequential([
    Input(shape=(X_train_seq_final.shape[1], X_train_seq_final.shape[2])),
    LSTM(64, return_sequences=False),
    Dense(1)
])
final_lstm_model.compile(optimizer='adam', loss='mse')
final_lstm_model.fit(X_train_seq_final, y_train_seq_final, epochs=LSTM_EPOCHS, batch_size=LSTM_BATCH_SIZE, verbose=0)

# For final LSTM prediction, prepend the last `SEQUENCE_LENGTH` training rows
# so test row i is predicted from the SEQUENCE_LENGTH rows that precede it
# (the same window/target alignment create_sequences uses for training).
X_test_with_history = pd.concat([X.iloc[-SEQUENCE_LENGTH:], X_test])
test_windows = np.lib.stride_tricks.sliding_window_view(
    X_test_with_history.to_numpy(), SEQUENCE_LENGTH, axis=0
)
# One more window than test rows is produced; the last one has no target row.
X_test_seq_final = np.ascontiguousarray(test_windows[:len(X_test)].transpose(0, 2, 1))
lstm_final_preds = final_lstm_model.predict(X_test_seq_final).flatten()


print("Generating final predictions with the trained meta-model...")
meta_features_test = pd.DataFrame({
    'lgbm_pred': lgbm_final_preds,
    'cat_pred': cat_final_preds,
    'lstm_pred': lstm_final_preds
})
final_predictions = meta_model.predict(meta_features_test)

# Fail with a clear message if the de-duplicated test set no longer lines up
# with the submission template.
if len(final_predictions) != len(submission_df):
    raise ValueError(
        f"Prediction count ({len(final_predictions)}) does not match "
        f"submission rows ({len(submission_df)})."
    )
submission_df['labels'] = final_predictions

try:
    submission_df.to_csv(OUT_PATH, index=False)
    print(f"\nStacked ensemble submission successfully saved to: {OUT_PATH}")
except OSError as e:
    print(f"Error saving submission file: {e}")

--- Loading and preparing ETH data ---


FileNotFoundError: [Errno 2] No such file or directory: 'train\\ETH.csv'