In [None]:
# Import necessary libraries for data handling, modeling, and performance measurement
import os, gc, warnings, sys
import numpy as np
import pandas as pd
from pathlib import Path
from typing import List, Tuple
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

# Hide warning messages to keep the output clean.
# NOTE(review): this silences every warning for the whole session, including
# pandas date-parsing warnings that could be relevant to parse_ts below.
warnings.filterwarnings("ignore")

# Show more columns when printing dataframes
pd.set_option("display.max_columns", 120)

# Base working folder — the 'train' and 'test' folders live directly inside it.
BASE_DIR = r"C:\Users\HP\Downloads\gq-implied-volatility-forecasting"

# Derived from BASE_DIR so that moving the dataset only requires editing one path.
TRAIN_DIR = os.path.join(BASE_DIR, "train")
TEST_DIR = os.path.join(BASE_DIR, "test")

# List of cryptocurrencies we are working with (ETH is the prediction target,
# the others are merged onto it as extra features)
ASSETS = ["ETH", "BTC", "DOGE", "DOT", "LINK", "SHIB", "SOL"]

# Numeric type used for all feature columns (float32 halves memory vs. float64)
DTYPE_FLOAT = np.float32

def parse_ts(s: pd.Series) -> pd.Series:
    """Parse a column of timestamps, trying progressively looser formats.

    The strict ``YYYY-MM-DD HH:MM:SS`` layout is tried first. If more than 1%
    of the values fail, ``DD-MM-YYYY HH:MM`` is tried; it is used when it
    leaves fewer unparsed values, otherwise pandas' generic parser is used.
    Values that cannot be parsed become ``NaT``.
    """
    primary = pd.to_datetime(s, format="%Y-%m-%d %H:%M:%S", errors="coerce", utc=False)
    primary_missing = primary.isna().mean()

    # Good enough (at most 1% failures): keep the strict parse.
    if not primary_missing > 0.01:
        return primary

    alternate = pd.to_datetime(s, format="%d-%m-%Y %H:%M", errors="coerce", utc=False)
    if alternate.isna().mean() < primary_missing:
        return alternate

    # Neither explicit layout helped: let pandas infer the format.
    return pd.to_datetime(s, errors="coerce", utc=False)

def safe_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Cast every column except ``timestamp`` to ``DTYPE_FLOAT``.

    Values that cannot be interpreted as numbers become NaN. The frame is
    modified in place and also returned.
    """
    value_cols = [name for name in df.columns if name != "timestamp"]
    for name in value_cols:
        as_number = pd.to_numeric(df[name], errors="coerce")
        df[name] = as_number.astype(DTYPE_FLOAT)
    return df


def winsorize_df(df: pd.DataFrame, lower=0.01, upper=0.99) -> pd.DataFrame:
    """Clip each non-timestamp column to its own [lower, upper] quantile range.

    The quantiles are computed per column from the data passed in. The frame
    is modified in place and also returned.
    """
    value_cols = [name for name in df.columns if name != "timestamp"]
    for name in value_cols:
        column = df[name]
        floor = column.quantile(lower)
        ceiling = column.quantile(upper)
        df[name] = column.clip(lower=floor, upper=ceiling)
    return df

def ffill_bfill_min(df: pd.DataFrame) -> pd.DataFrame:
    """Sort by ``timestamp`` and fill gaps from neighbouring rows.

    Missing values take the last earlier value (forward fill). Anything still
    missing is back-filled from the next value, but only across a single row
    (``limit=1``), so longer leading gaps stay NaN.
    """
    ordered = df.sort_values("timestamp")
    carried_forward = ordered.ffill()
    return carried_forward.bfill(limit=1)

def pearson_corr(y_true, y_pred) -> float:
    """Return the Pearson correlation between true and predicted values.

    Both inputs are converted to float64 arrays. When the correlation is
    undefined (the normalising term is not strictly positive, e.g. one input
    is constant) the function returns 0.0 instead.
    """
    actual = np.asarray(y_true, dtype=np.float64)
    predicted = np.asarray(y_pred, dtype=np.float64)

    # Deviations from the mean of each series.
    actual_dev = actual - actual.mean()
    predicted_dev = predicted - predicted.mean()

    scale = np.sqrt((actual_dev**2).sum()) * np.sqrt((predicted_dev**2).sum())
    if not scale > 0:
        return 0.0
    return float((actual_dev * predicted_dev).sum() / scale)


def build_eth_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build the ETH feature table from raw order-book rows.

    The input is copied, its ``timestamp`` column is parsed, rows with an
    unparseable or duplicated timestamp are dropped, and the remaining columns
    are cast to numeric and sorted by time. Features derived from it:

    * ``spread_l1``   - best ask price minus best bid price
    * ``depth_bid_5`` / ``depth_ask_5`` - summed volume over levels 1-5
      (only the level columns that exist in the input are summed)
    * ``imbalance_5`` - (bid depth - ask depth) / (bid depth + ask depth + 1e-6)
    * ``microprice``  - level-1 prices cross-weighted by level-1 volumes
    * ``micro_spread`` - microprice minus ``mid_price``
    * ``ret_1s``      - row-to-row difference of log ``mid_price``
    * ``rv_{w}s``     - root of the sum of squared ``ret_1s`` over the last
      ``w`` rows, for w in 5, 10, 30, 60
    * lags 1, 2, 3, 5, 10 of ``ret_1s``, ``imbalance_5`` and ``spread_l1``

    The returned frame keeps ``timestamp``, ``label`` (when present),
    ``mid_price`` and the features above. Every non-timestamp column is
    winsorized at the 1st/99th percentile, gap-filled, and any remaining
    NaN/inf is replaced by 0.
    """
    rv_windows = [5, 10, 30, 60]
    lag_steps = [1, 2, 3, 5, 10]

    # Work on a private copy so the caller's frame is left untouched.
    book = df.copy()

    # Timestamp hygiene: parse, then drop unparseable and repeated timestamps.
    book["timestamp"] = parse_ts(book["timestamp"])
    book = book.dropna(subset=["timestamp"])
    book = book.drop_duplicates(subset=["timestamp"])

    # Everything else becomes numeric; rows end up in chronological order.
    book = safe_numeric(book)
    book = book.sort_values("timestamp").reset_index(drop=True)

    best_bid, best_ask = book["bid_price1"], book["ask_price1"]
    top_bid_vol, top_ask_vol = book["bid_volume1"], book["ask_volume1"]

    # --- Level-1 spread
    book["spread_l1"] = (best_ask - best_bid).astype(DTYPE_FLOAT)

    # --- Depth over the top five levels (tolerates missing level columns)
    bid_levels = [f"bid_volume{i}" for i in range(1, 6) if f"bid_volume{i}" in book.columns]
    ask_levels = [f"ask_volume{i}" for i in range(1, 6) if f"ask_volume{i}" in book.columns]
    book["depth_bid_5"] = book[bid_levels].sum(axis=1).astype(DTYPE_FLOAT)
    book["depth_ask_5"] = book[ask_levels].sum(axis=1).astype(DTYPE_FLOAT)

    # --- Depth imbalance; the 1e-6 keeps the ratio finite for an empty book
    bid_depth, ask_depth = book["depth_bid_5"], book["depth_ask_5"]
    book["imbalance_5"] = (
        (bid_depth - ask_depth) / (bid_depth + ask_depth + 1e-6)
    ).astype(DTYPE_FLOAT)

    # --- Microprice; a zero total volume yields NaN rather than a division by zero
    top_volume = (top_bid_vol + top_ask_vol).replace(0, np.nan)
    cross_weighted = best_ask * top_bid_vol + best_bid * top_ask_vol
    book["microprice"] = (cross_weighted / top_volume).astype(DTYPE_FLOAT)

    # --- Distance of the microprice from the mid price
    book["micro_spread"] = (book["microprice"] - book["mid_price"]).astype(DTYPE_FLOAT)

    # --- Log return between consecutive rows
    book["ret_1s"] = np.log(book["mid_price"]).diff().astype(DTYPE_FLOAT)

    # --- Realized volatility over trailing windows of rows
    def _root_sum_squares(window):
        return np.sqrt(np.sum(window**2))

    for span in rv_windows:
        rolled = book["ret_1s"].rolling(span).apply(_root_sum_squares, raw=True)
        book[f"rv_{span}s"] = rolled.astype(DTYPE_FLOAT)

    # --- Lagged copies of return, imbalance and spread
    for step in lag_steps:
        book[f"ret_1s_lag{step}"] = book["ret_1s"].shift(step).astype(DTYPE_FLOAT)
        book[f"imbalance_5_lag{step}"] = book["imbalance_5"].shift(step).astype(DTYPE_FLOAT)
        book[f"spread_l1_lag{step}"] = book["spread_l1"].shift(step).astype(DTYPE_FLOAT)

    # --- Output columns, in the order the model will see them
    wanted = ["timestamp", "label", "mid_price", "spread_l1", "depth_bid_5", "depth_ask_5",
              "imbalance_5", "microprice", "micro_spread", "ret_1s"]
    wanted.extend(f"rv_{span}s" for span in rv_windows)
    wanted.extend(f"ret_1s_lag{step}" for step in lag_steps)
    wanted.extend(f"imbalance_5_lag{step}" for step in lag_steps)
    wanted.extend(f"spread_l1_lag{step}" for step in lag_steps)

    # Skip anything the input did not provide (e.g. "label" may be absent).
    present = [name for name in wanted if name in book.columns]
    out = book[present].copy()

    # --- Cleanup: clip outliers, fill gaps, then zero out whatever is left
    out = winsorize_df(out, 0.01, 0.99)
    out = ffill_bfill_min(out)
    out = out.replace([np.inf, -np.inf], np.nan).fillna(0)

    float_cols = {name: DTYPE_FLOAT for name in out.columns if name != "timestamp"}
    return out.astype(float_cols)



In [None]:
# ==== Load ETH training data and create features ====
eth_train = pd.read_csv(f"{TRAIN_DIR}/ETH.csv")  # Raw ETH order-book rows
eth_feat = build_eth_features(eth_train)         # ETH feature table (see build_eth_features)

print("ETH feature shape:", eth_feat.shape)     # Show the size of the new ETH feature dataframe
print(eth_feat.head())                           # Display the first few rows to check data looks right

# ==== Add features from other crypto assets onto ETH data ====
Xy = eth_feat.copy()  # Start with ETH features as base

# Every asset except the ETH base frame. Derived from ASSETS so the asset
# universe is declared in exactly one place.
cross_assets = [name for name in ASSETS if name != "ETH"]

for asset in cross_assets:
    df = pd.read_csv(f"{TRAIN_DIR}/{asset}.csv")          # Load asset's raw data
    cross = build_cross_features(df, asset_prefix=asset)  # Per-asset feature columns keyed by timestamp

    before = Xy.shape[1]      # Column count before the merge, for the log line below

    # Left merge keeps every ETH row; timestamps absent from `cross` get NaN
    Xy = Xy.merge(cross, on="timestamp", how="left")

    # Re-sort by timestamp and fill the NaNs introduced by the merge
    Xy = ffill_bfill_min(Xy)

    # Print how many new columns were added and how many missing values remain
    print(f"Merged {asset}: +{Xy.shape[1] - before} cols → shape={Xy.shape}; NaN%={Xy.isna().mean().mean():.4f}")

    # Release the per-asset frames before loading the next one
    del df, cross
    gc.collect()

# ==== Final data cleanup and prepare for modeling ====
# Replace infinite values with NaN, then fill all NaNs with zero
Xy = Xy.replace([np.inf, -np.inf], np.nan).fillna(0)

# Extract target variable (what we want to predict)
y = Xy["label"].astype(DTYPE_FLOAT).values

# Select all columns except 'timestamp' and 'label' as input features.
# The order of this list defines the column order of X.
feature_cols = [c for c in Xy.columns if c not in ["timestamp", "label"]]
X = Xy[feature_cols].astype(DTYPE_FLOAT).values  # Convert features to numpy array for model

print("Final train X shape:", X.shape, " y shape:", y.shape)  # Show final shapes of data

# Fail loudly if anything non-finite survived the cleanup (the cast to
# DTYPE_FLOAT above can still overflow to inf). An explicit raise is used
# instead of `assert` so the check also runs under `python -O`.
if not (np.isfinite(X).all() and np.isfinite(y).all()):
    raise ValueError("Training features or target contain NaN/inf after cleanup")


ETH feature shape: (631292, 29)
            timestamp     label    mid_price  spread_l1  depth_bid_5  \
0 2024-09-25 18:13:28  0.000060  2581.604980    0.01001  3361.200195   
1 2024-09-25 18:13:29  0.000057  2581.284912    0.01001  1533.900024   
2 2024-09-25 18:13:30  0.000080  2581.284912    0.01001   957.200012   
3 2024-09-25 18:13:31  0.000087  2581.104980    0.01001  2510.899902   
4 2024-09-25 18:13:32  0.000090  2581.104980    0.01001  2526.399902   

   depth_ask_5  imbalance_5   microprice  micro_spread    ret_1s     rv_5s  \
0   373.000031     0.800225  2581.609131      0.004150 -0.000124  0.000000   
1  1201.699951     0.121436  2581.286377      0.001465 -0.000124  0.000000   
2  1186.900024    -0.107131  2581.285156      0.000244  0.000000  0.000000   
3  1292.599976     0.320310  2581.107178      0.002197 -0.000070  0.000000   
4  1222.099976     0.347952  2581.107178      0.002197  0.000000  0.000157   

   rv_10s  rv_30s  rv_60s  ret_1s_lag1  ret_1s_lag2  ret_1s_lag3  

In [None]:
from xgboost import XGBRegressor
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# ==== Factory for the cross-validation regressor ====
def make_reg():
    """Return a new, unfitted XGBRegressor configured for the CV runs."""
    params = {
        # Boosting budget: up to 5000 trees, cut short by early stopping
        "n_estimators": 5000,
        "early_stopping_rounds": 200,   # Stop once 200 rounds pass without improvement
        "eval_metric": "rmse",          # Metric monitored on the eval set
        # Tree shape and step size
        "learning_rate": 0.05,
        "max_depth": 6,
        "tree_method": "hist",          # Histogram-based tree construction
        # Row / column subsampling per tree
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        # Fixed seed for reproducibility
        "random_state": 42,
    }
    return XGBRegressor(**params)

# ==== Prepare cross-validation ====
# The rows are a time series (X is expected to be in timestamp order, as left
# by the merge step) and the features contain lags and rolling windows.
# A shuffled K-fold would place neighbouring rows in train and validation at
# the same time, leaking information and producing optimistic scores.
# TimeSeriesSplit always trains on the past and validates on the block that
# follows it.
# NOTE(review): if the label looks ahead over several rows, consider passing
# `gap=<horizon>` to TimeSeriesSplit so boundary rows do not overlap.
kf = TimeSeriesSplit(n_splits=5)

fold_rmse, fold_corr, fold_best_iter = [], [], []

# ==== Train and validate model on each fold ====
for fold, (tr_idx, va_idx) in enumerate(kf.split(X), start=1):
    # Split data into training (earlier rows) and validation (later rows)
    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_va, y_va = X[va_idx], y[va_idx]

    # Create a fresh model instance
    model = make_reg()

    # Train with early stopping monitored on the validation block.
    # NOTE: the same block is scored below, so the reported numbers are
    # slightly optimistic with respect to truly unseen data.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=False
    )

    # Predict on validation data
    y_hat = model.predict(X_va)

    # RMSE plus Pearson correlation (helper defined earlier in this file)
    rmse = float(np.sqrt(mean_squared_error(y_va, y_hat)))
    corr = pearson_corr(y_va, y_hat)

    # Best boosting round found by early stopping; useful for choosing the
    # number of trees of the final model. Older xgboost versions may not
    # expose the attribute, hence the guarded lookup.
    best_iter = getattr(model, "best_iteration", None)

    fold_rmse.append(rmse)
    fold_corr.append(corr)
    fold_best_iter.append(best_iter)

    print(f"Fold {fold}: RMSE: {rmse}  Pearson: {corr:.4f}  best_iteration: {best_iter}")

print(f"CV mean RMSE: {np.mean(fold_rmse)}  mean Pearson: {np.mean(fold_corr):.4f}")
print("Best iteration per fold:", fold_best_iter)


RMSE: 3.647864217456325e-05
RMSE: 3.6195230340358164e-05
RMSE: 3.621968854342578e-05
RMSE: 3.613182701149182e-05
RMSE: 3.603873021470018e-05


In [None]:
# ==== Train the final model on all data ====
# Fixed number of boosting rounds for the final fit; there is no validation
# set here, so no early stopping is applied.
best_nround = 2000

final_params = {
    "n_estimators": best_nround,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "tree_method": "hist",
    "eval_metric": "rmse",
}
final_model = XGBRegressor(**final_params)

# Fit on the complete training matrix
final_model.fit(X, y, verbose=False)
print("✅ Final model trained on all data")

# ==== Prepare ETH test data features ====
eth_test = pd.read_csv(TEST_DIR + "/ETH.csv")  # Load ETH test dataset
eth_test_feat = build_eth_features(eth_test)  # Create features like we did for training

# build_eth_features passes a target column through under the name "label";
# drop it (and the "labels" spelling) if the test file happens to carry one.
eth_test_feat = eth_test_feat.drop(columns=["label", "labels"], errors="ignore")

print("ETH test feat shape:", eth_test_feat.shape)  # Show test features shape

# ==== Add features from other assets to test data ====
Xtest = eth_test_feat.copy()

for asset in ["BTC", "DOGE", "DOT", "LINK", "SHIB", "SOL"]:
    # Test rows must be joined with cross-asset data from the TEST period.
    # Reading the TRAIN file here would leave nothing to match on timestamp,
    # and every cross-asset feature would silently end up zero-filled.
    test_path = f"{TEST_DIR}/{asset}.csv"
    if os.path.exists(test_path):
        asset_path = test_path
    else:
        # Keep the pipeline running, but make the train/test mismatch visible.
        asset_path = f"{TRAIN_DIR}/{asset}.csv"
        print(f"[TEST] WARNING: {test_path} not found; using {asset_path} instead — "
              f"{asset} features will only be filled where timestamps overlap")

    df = pd.read_csv(asset_path)
    cross = build_cross_features(df, asset_prefix=asset)  # Per-asset feature columns keyed by timestamp

    before = Xtest.shape[1]   # Columns count before merge

    # Share of test rows that find a cross-asset row with the same timestamp;
    # a value near 0% means the merged columns are pure fill values.
    matched = Xtest["timestamp"].isin(cross["timestamp"]).mean()

    # Merge asset features with ETH test features on timestamp (keep all ETH rows)
    Xtest = Xtest.merge(cross, on="timestamp", how="left")

    # Fill missing data created by merge
    Xtest = ffill_bfill_min(Xtest)

    print(f"[TEST] merged {asset}: +{Xtest.shape[1] - before} cols → shape={Xtest.shape}; "
          f"timestamp match={matched:.2%}")

    # Free memory
    del df, cross
    gc.collect()

# Replace infinite values with NaN, then fill all NaNs with zero
Xtest = Xtest.replace([np.inf, -np.inf], np.nan).fillna(0)

# ==== Align test features with training features ====
test_feature_cols = [name for name in Xtest.columns if name != "timestamp"]

# Columns the model was trained on but the test frame lacks, and vice versa
absent = [name for name in feature_cols if name not in test_feature_cols]
extra = [name for name in test_feature_cols if name not in feature_cols]

# Drop the surplus columns and add the absent ones as constant zeros
Xtest = Xtest.drop(columns=extra).assign(**dict.fromkeys(absent, 0.0))

# Same column order as during training, with timestamp kept in front
Xtest = Xtest.loc[:, ["timestamp"] + feature_cols]

# Feature matrix for prediction
Xt = Xtest[feature_cols].astype(DTYPE_FLOAT).values

# ==== Make predictions and save results ====
pred = final_model.predict(Xt).astype(np.float32)  # Predict using the trained model

# build_eth_features drops rows whose timestamp is unparseable or duplicated,
# so the submission can end up shorter than the raw test file. Report it
# instead of letting rows disappear silently.
if len(pred) != len(eth_test):
    print(f"WARNING: {len(eth_test) - len(pred)} test rows were dropped during feature building "
          f"({len(eth_test)} raw rows → {len(pred)} predictions)")

# Keep timestamps as integers. Datetime values are converted to nanoseconds
# since the epoch. The pandas dtype helper is used because np.issubdtype
# raises on pandas extension dtypes such as timezone-aware datetimes.
timestamps = Xtest["timestamp"]
if not pd.api.types.is_integer_dtype(timestamps):
    timestamps = pd.to_datetime(timestamps).astype("int64")

# Prepare final submission DataFrame with timestamp and predicted labels
submission_final = pd.DataFrame({
    "timestamp": timestamps.astype("int64"),
    "labels": pred
}).sort_values("timestamp")  # Sort by timestamp for consistency

# Save next to the data folders; derived from BASE_DIR so the location is
# configured in one place.
save_path = os.path.join(BASE_DIR, "submission_final.csv")
submission_final.to_csv(save_path, index=False)

print("✅ Saved:", save_path)
print(submission_final.dtypes)  # Print data types to confirm correctness
print(submission_final.head())  # Show first few rows of submission file


✅ Final model trained on all data
ETH test feat shape: (270548, 28)
[TEST] merged BTC: +5 cols → shape=(270548, 33)
[TEST] merged DOGE: +5 cols → shape=(270548, 38)
[TEST] merged DOT: +5 cols → shape=(270548, 43)
[TEST] merged LINK: +5 cols → shape=(270548, 48)
[TEST] merged SHIB: +5 cols → shape=(270548, 53)
[TEST] merged SOL: +5 cols → shape=(270548, 58)
✅ Saved: C:\Users\HP\Downloads\gq-implied-volatility-forecasting\submission_final.csv
timestamp      int64
labels       float32
dtype: object
   timestamp    labels
0          1  0.000024
1          2  0.000024
2          3  0.000050
3          4  0.000041
4          5  0.000062
