In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drw-crypto-market-prediction/sample_submission.csv
/kaggle/input/drw-crypto-market-prediction/train.parquet
/kaggle/input/drw-crypto-market-prediction/test.parquet


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import catboost as cb
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load full training data
columns = ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume'] + [f"X{i}" for i in range(1,781)] + ['label']
train = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/train.parquet', columns=columns, engine='pyarrow')
test = pd.read_parquet('/kaggle/input/drw-crypto-market-prediction/test.parquet', engine='pyarrow')

In [4]:
# Optimize memory usage
train = train.astype({col: 'float32' for col in train.columns if col != 'timestamp'})
test = test.astype({col: 'float32' for col in test.columns if col != 'ID'})

In [5]:
# Standardize features
scaler = StandardScaler()
x_columns = [f"X{i}" for i in range(1,781)] + ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']
train[x_columns] = scaler.fit_transform(train[x_columns])
test[x_columns] = scaler.transform(test[x_columns])

In [6]:
# Feature Engineering
def engineer_features(df, has_label=True):
    # Lagged features
    lags = [1, 5, 10]
    features_lag = ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']
    for feature in features_lag:
        for lag in lags:
            df[f"{feature}_lag_{lag}"] = df[feature].shift(lag)

    # Rolling statistics
    window_sizes = [5, 10]
    features_rolling = ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty', 'volume']
    for feature in features_rolling:
        for window in window_sizes:
            df[f"{feature}_rollmean_{window}"] = df[feature].rolling(window).mean().shift(1)
            df[f"{feature}_rollstd_{window}"] = df[feature].rolling(window).std().shift(1)

    # Interactions
    df['buy_sell_imbalance'] = df['buy_qty'] - df['sell_qty']
    df['order_imbalance'] = df['bid_qty'] - df['ask_qty']
    df['volume_momentum_ratio'] = df['volume_rollmean_5'] / (df['volume_rollmean_10'] + 1e-6)
    df['price_pressure'] = df['bid_qty'] / (df['ask_qty'] + 1e-6)
    df['volume_x1_product'] = df['volume'] * df['X1']
    df['volume_order_imbalance'] = df['volume'] * df['order_imbalance']

    # X1-X20 features
    x_features = [f'X{i}' for i in range(1, 21)]
    for feature in x_features:
        df[f"{feature}_rollmean_10"] = df[feature].rolling(10).mean().shift(1)
        df[f"{feature}_volatility_10"] = df[feature].rolling(10).std().shift(1)
        df[f"{feature}_momentum_5"] = df[feature] - df[feature].shift(5)

    # Additional interactions
    df['x1_x2_ratio'] = df['X1'] / (df['X2'] + 1e-6)
    df['x1_x2_product'] = df['X1'] * df['X2']
    df['x1_x3_product'] = df['X1'] * df['X3']
    df['x1_volume_ratio'] = df['X1'] / (df['volume'] + 1e-6)

    if has_label:
        df['label_rollmean_10'] = df['label'].rolling(10).mean().shift(1)
        df['label_volatility_10'] = df['label'].rolling(10).std().shift(1)

    return df

In [7]:
# Apply feature engineering
train = engineer_features(train, has_label=True)
train.dropna(inplace=True)
test = engineer_features(test, has_label=False)
test = test.fillna(0)

In [8]:
# Prepare data
X = train.drop(columns=['label'])
y = train['label']

In [9]:
# Feature selection: Correlation-based pre-filtering
corr = X.corrwith(y).abs()
corr_df = pd.DataFrame({'Feature': X.columns, 'Correlation': corr}).sort_values(by='Correlation', ascending=False)
# Exclude label-dependent features
label_dependent_features = ['label_rollmean_10', 'label_volatility_10']
top_corr_features = corr_df[corr_df['Correlation'] > 0.05]['Feature'].tolist()
top_corr_features = [f for f in top_corr_features if f not in label_dependent_features]
X = X[top_corr_features]
test = test[top_corr_features]

In [10]:
# Further selection with LightGBM
initial_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.05, max_depth=5, verbosity=-1)
initial_model.fit(X, y)
importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': initial_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
top_features = importances['Feature'].head(100).tolist()
top_features = [f for f in top_features if f not in label_dependent_features]  # Ensure exclusion
X = X[top_features]
test = test[top_features]
print("Top 10 Features by LightGBM Importance:")
print(importances.head(10))

Top 10 Features by LightGBM Importance:
   Feature  Importance
19    X614         315
11    X219         303
20    X218         211
25    X285         189
21    X751         180
0     X752         159
4     X759         149
9      X22         145
17    X753         141
12    X756          94


In [11]:
# Define Pearson correlation objective for Optuna
def objective(trial, model_type):
    if model_type == 'lgb':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
            'max_depth': trial.suggest_int('max_depth', 5, 8),
            'num_leaves': trial.suggest_int('num_leaves', 20, 50)
        }
        model = lgb.LGBMRegressor(**params, verbosity=-1)
    elif model_type == 'cb':
        params = {
            'iterations': trial.suggest_int('iterations', 100, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
            'depth': trial.suggest_int('depth', 5, 8)
        }
        model = cb.CatBoostRegressor(**params, verbose=0)
    else:  # xgb
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 300),
            'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15),
            'max_depth': trial.suggest_int('max_depth', 5, 8)
        }
        model = xgb.XGBRegressor(**params, verbosity=0)
    
    tscv = TimeSeriesSplit(n_splits=7)
    corr_scores = []
    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        corr, _ = pearsonr(y_val, y_pred)
        corr_scores.append(corr)
    return np.mean(corr_scores)

In [12]:
# Optimize models
study_lgb = optuna.create_study(direction='maximize')
study_lgb.optimize(lambda trial: objective(trial, 'lgb'), n_trials=10)
best_params_lgb = study_lgb.best_params
print(f"Best LightGBM Parameters: {best_params_lgb}")

study_cb = optuna.create_study(direction='maximize')
study_cb.optimize(lambda trial: objective(trial, 'cb'), n_trials=10)
best_params_cb = study_cb.best_params
print(f"Best CatBoost Parameters: {best_params_cb}")

study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(lambda trial: objective(trial, 'xgb'), n_trials=10)
best_params_xgb = study_xgb.best_params
print(f"Best XGBoost Parameters: {best_params_xgb}")

[I 2025-07-18 23:11:06,657] A new study created in memory with name: no-name-e43d4dea-350e-4d28-92f1-0b129dd429ed
[I 2025-07-18 23:11:39,295] Trial 0 finished with value: 0.04223162253995243 and parameters: {'n_estimators': 193, 'learning_rate': 0.1306745809679591, 'max_depth': 5, 'num_leaves': 27}. Best is trial 0 with value: 0.04223162253995243.
[I 2025-07-18 23:12:14,134] Trial 1 finished with value: 0.06366498370989128 and parameters: {'n_estimators': 179, 'learning_rate': 0.06749056212285767, 'max_depth': 6, 'num_leaves': 43}. Best is trial 1 with value: 0.06366498370989128.
[I 2025-07-18 23:12:54,767] Trial 2 finished with value: 0.06407592389616301 and parameters: {'n_estimators': 199, 'learning_rate': 0.05248299976936053, 'max_depth': 7, 'num_leaves': 49}. Best is trial 2 with value: 0.06407592389616301.
[I 2025-07-18 23:13:31,441] Trial 3 finished with value: 0.05137684800649977 and parameters: {'n_estimators': 211, 'learning_rate': 0.09457674286517789, 'max_depth': 7, 'num_le

Best LightGBM Parameters: {'n_estimators': 107, 'learning_rate': 0.05087733354087658, 'max_depth': 6, 'num_leaves': 21}


[I 2025-07-18 23:17:47,117] Trial 0 finished with value: 0.05858200213157549 and parameters: {'iterations': 295, 'learning_rate': 0.10609750316529071, 'depth': 7}. Best is trial 0 with value: 0.05858200213157549.
[I 2025-07-18 23:18:21,073] Trial 1 finished with value: 0.07237830655406044 and parameters: {'iterations': 143, 'learning_rate': 0.08808440465915453, 'depth': 6}. Best is trial 1 with value: 0.07237830655406044.
[I 2025-07-18 23:19:04,963] Trial 2 finished with value: 0.06337339418622544 and parameters: {'iterations': 224, 'learning_rate': 0.1301546602847755, 'depth': 5}. Best is trial 1 with value: 0.07237830655406044.
[I 2025-07-18 23:20:35,768] Trial 3 finished with value: 0.06432864074192245 and parameters: {'iterations': 273, 'learning_rate': 0.07805312633903283, 'depth': 7}. Best is trial 1 with value: 0.07237830655406044.
[I 2025-07-18 23:21:25,759] Trial 4 finished with value: 0.06181492458606265 and parameters: {'iterations': 131, 'learning_rate': 0.12322696241328622

Best CatBoost Parameters: {'iterations': 221, 'learning_rate': 0.05094752638861734, 'depth': 5}


[I 2025-07-18 23:27:03,051] Trial 0 finished with value: 0.05387601628899574 and parameters: {'n_estimators': 256, 'learning_rate': 0.05083683021026119, 'max_depth': 7}. Best is trial 0 with value: 0.05387601628899574.
[I 2025-07-18 23:27:31,468] Trial 1 finished with value: 0.05788189545273781 and parameters: {'n_estimators': 159, 'learning_rate': 0.08636883559225761, 'max_depth': 6}. Best is trial 1 with value: 0.05788189545273781.
[I 2025-07-18 23:27:53,330] Trial 2 finished with value: 0.050317052751779556 and parameters: {'n_estimators': 146, 'learning_rate': 0.10803145644870879, 'max_depth': 5}. Best is trial 1 with value: 0.05788189545273781.
[I 2025-07-18 23:28:23,076] Trial 3 finished with value: 0.07282233983278275 and parameters: {'n_estimators': 173, 'learning_rate': 0.054235686471163974, 'max_depth': 6}. Best is trial 3 with value: 0.07282233983278275.
[I 2025-07-18 23:29:19,694] Trial 4 finished with value: 0.06199156120419502 and parameters: {'n_estimators': 189, 'learni

Best XGBoost Parameters: {'n_estimators': 173, 'learning_rate': 0.054235686471163974, 'max_depth': 6}


In [13]:
# Train final models and compute CV scores
tscv = TimeSeriesSplit(n_splits=7)
lgb_scores, cb_scores, xgb_scores = [], [], []

In [14]:
# LightGBM
lgb_model = lgb.LGBMRegressor(**best_params_lgb, verbosity=-1)
for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    lgb_model.fit(X_train, y_train)
    y_pred = lgb_model.predict(X_val)
    corr, _ = pearsonr(y_val, y_pred)
    lgb_scores.append(corr)

In [15]:
# CatBoost
cb_model = cb.CatBoostRegressor(**best_params_cb, verbose=0)
for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    cb_model.fit(X_train, y_train)
    y_pred = cb_model.predict(X_val)
    corr, _ = pearsonr(y_val, y_pred)
    cb_scores.append(corr)


In [16]:
# XGBoost
xgb_model = xgb.XGBRegressor(**best_params_xgb, verbosity=0)
for train_idx, val_idx in tscv.split(X):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_val)
    corr, _ = pearsonr(y_val, y_pred)
    xgb_scores.append(corr)

In [17]:
# Ensemble predictions with dynamic weights based on CV scores
avg_lgb_score = np.mean(lgb_scores)
avg_cb_score = np.mean(cb_scores)
avg_xgb_score = np.mean(xgb_scores)
total_score = avg_lgb_score + avg_cb_score + avg_xgb_score
weights = {
    'lgb': avg_lgb_score / total_score if total_score > 0 else 0.4,
    'cb': avg_cb_score / total_score if total_score > 0 else 0.3,
    'xgb': avg_xgb_score / total_score if total_score > 0 else 0.3
}

lgb_model.fit(X, y)
cb_model.fit(X, y)
xgb_model.fit(X, y)
lgb_pred = lgb_model.predict(test)
cb_pred = cb_model.predict(test)
xgb_pred = xgb_model.predict(test)
final_pred = weights['lgb'] * lgb_pred + weights['cb'] * cb_pred + weights['xgb'] * xgb_pred

In [18]:
print(f"Average CV Pearson Correlation (LightGBM): {avg_lgb_score:.6f}")
print(f"Average CV Pearson Correlation (CatBoost): {avg_cb_score:.6f}")
print(f"Average CV Pearson Correlation (XGBoost): {avg_xgb_score:.6f}")
print(f"Ensemble Weights: LightGBM={weights['lgb']:.3f}, CatBoost={weights['cb']:.3f}, XGBoost={weights['xgb']:.3f}")

Average CV Pearson Correlation (LightGBM): 0.073789
Average CV Pearson Correlation (CatBoost): 0.083567
Average CV Pearson Correlation (XGBoost): 0.072822
Ensemble Weights: LightGBM=0.321, CatBoost=0.363, XGBoost=0.316


In [19]:
# Standardize predictions to match label distribution
label_mean, label_std = train['label'].mean(), train['label'].std()
final_pred = (final_pred - final_pred.mean()) / final_pred.std() * label_std + label_mean

In [20]:
# Create submission
submission = pd.DataFrame({
    'ID': test.index,
    'prediction': final_pred
})
submission.to_csv('/kaggle/working/submission.csv', index=False)
print("Submission saved to /kaggle/working/submission.csv")

Submission saved to /kaggle/working/submission.csv
