In [50]:
import sys
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from scipy.stats import pearsonr
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [51]:
def feature_engineering(df):
    df = df.copy()
    df['order_flow_imbalance'] = (df['buy_qty'] - df['sell_qty']) / (df['buy_qty'] + df['sell_qty'] + 1e-8)
    df['liquidity_imbalance'] = (df['bid_qty'] - df['ask_qty']) / (df['bid_qty'] + df['ask_qty'] + 1e-8)
    df['bid_ask_spread'] = df['ask_qty'] - df['bid_qty']
    df['log_volume'] = np.log1p(df['volume'])
    
    df = df.replace([np.inf, -np.inf], np.nan)
    return df

In [52]:
train_df = pd.read_parquet("/kaggle/input/drw-crypto-market-prediction/train.parquet")
train_df.head(10)

Unnamed: 0,bid_qty,ask_qty,buy_qty,sell_qty,volume,X1,X2,X3,X4,X5,...,X772,X773,X774,X775,X776,X777,X778,X779,X780,label
2023-03-01 00:00:00,15.283,8.425,176.405,44.984,221.389,0.181844,-0.63786,0.006652,0.13687,0.116698,...,0.333753,-0.009992,-0.695595,-0.444077,-0.191238,-0.184251,-0.471897,-0.625428,-0.553991,0.562539
2023-03-01 00:01:00,38.59,2.336,525.846,321.95,847.796,0.489497,-0.075619,0.431594,0.5224,0.475255,...,0.333657,-0.01004,-0.696226,-0.452866,-0.200082,-0.188929,-0.472842,-0.625832,-0.554426,0.533686
2023-03-01 00:02:00,0.442,60.25,159.227,136.369,295.596,0.260121,-0.444684,0.100695,0.224729,0.203282,...,0.333667,-0.010037,-0.696832,-0.461383,-0.208786,-0.193571,-0.473785,-0.626236,-0.55486,0.546505
2023-03-01 00:03:00,4.865,21.016,335.742,124.963,460.705,0.099976,-0.666728,-0.123858,0.019197,0.014459,...,0.333174,-0.010279,-0.697391,-0.469628,-0.21735,-0.198175,-0.474726,-0.626639,-0.555294,0.357703
2023-03-01 00:04:00,27.158,3.451,98.411,44.407,142.818,0.270893,-0.325973,0.116336,0.234311,0.214073,...,0.333171,-0.010283,-0.69794,-0.477622,-0.22578,-0.202745,-0.475666,-0.627043,-0.555728,0.362452
2023-03-01 00:05:00,29.454,5.54,309.371,97.46,406.831,-0.558479,-1.655676,-1.03354,-0.814882,-0.756889,...,0.332504,-0.01061,-0.698384,-0.485338,-0.234063,-0.207272,-0.476601,-0.627445,-0.556162,0.100448
2023-03-01 00:06:00,3.478,29.514,100.608,233.525,334.133,-0.452937,-1.332166,-0.854345,-0.668494,-0.627297,...,0.332695,-0.010519,-0.698833,-0.492821,-0.242217,-0.211766,-0.477535,-0.627847,-0.556595,0.178679
2023-03-01 00:07:00,16.591,3.23,46.034,218.806,264.84,0.464475,0.292593,0.441009,0.498789,0.449966,...,0.333397,-0.010178,-0.699211,-0.500053,-0.250233,-0.21622,-0.478466,-0.628248,-0.557028,0.463684
2023-03-01 00:08:00,5.56,30.919,40.259,82.518,122.777,1.000941,1.145275,1.168142,1.166786,1.073158,...,0.333931,-0.009919,-0.699591,-0.507064,-0.258124,-0.22064,-0.479395,-0.628649,-0.557461,0.686066
2023-03-01 00:09:00,12.697,14.827,125.65,90.75,216.4,1.093672,1.193154,1.25861,1.26454,1.172654,...,0.333782,-0.009994,-0.699884,-0.51383,-0.265876,-0.225019,-0.48032,-0.629049,-0.557893,0.637539


In [53]:
train_df.shape

(525886, 786)

In [54]:
train_df.isnull().sum()

bid_qty     0
ask_qty     0
buy_qty     0
sell_qty    0
volume      0
           ..
X777        0
X778        0
X779        0
X780        0
label       0
Length: 786, dtype: int64

In [55]:
test_df = pd.read_parquet("/kaggle/input/drw-crypto-market-prediction/test.parquet")

In [56]:
test_df.shape

(538150, 786)

In [57]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split

from scipy.stats import pearsonr

base_feature = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]

top_100_feature = ["X683", "X140", "X758", "X425", "X752", "X344", "X292", "X646",
                   "X287", "X134", "X385", "X647", "X682", "X279", "X345", "X466",
                   "X381", "X778", "X283", "X739", "X427", "X272", "X684", "X301",
                   "X198", "X465", "X608", "X738", "X384", "X137", "X386", "X581",
                   "X734", "X180", "X589", "X421", "X610", "X780", "X387", "X772",
                   "X654", "X428", "X96", "X779", "X426", "X98", "X591", "X650",
                   "X613", "X566", "X605", "X181", "X750", "X174", "X288", "X607",
                   "X579", "X176", "X508", "X178", "X419", "X219", "X343", "X89",
                   "X678", "X588", "X40", "X293", "X411", "X757", "X337", "X285",
                   "X295", "X341", "X443", "X179", "X575", "X751", "X92", "X562",
                   "X769", "X776", "X501", "X298", "X375", "X95", "X590", "X611",
                   "X94", "X270", "X424", "X86", "X587", "X434", "X638", "X170",
                   "X297", "X136", "X97", "X572"]


top_30_feature = top_100_feature[:30]

FEATURES = [*base_feature, *top_30_feature]

LABEL_COLUMN = "label"

#X=train_df[FEATURES]

#X_test = test_df[FEATURES]

X =  feature_engineering(train_df[FEATURES])
X_test =  feature_engineering(test_df[FEATURES])
y=train_df[LABEL_COLUMN]



In [58]:
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
splits = list(tscv.split(X))
train_idx, valid_idx = splits[-1]
X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]


In [59]:
X_train.shape

(394415, 39)

In [60]:
X_valid.shape

(131471, 39)

In [61]:
y_train.shape

(394415,)

In [62]:
y_valid.shape

(131471,)

In [63]:
# Hyperparameter tuning
def objective(trial):
    

    params = {
        "tree_method": "gpu_hist",
        "device": "cuda",
        "verbosity": 0,
        "random_state": 42,
        "n_jobs": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "max_leaves": trial.suggest_int("max_leaves", 8, 64),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 30),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.3, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.3, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 50.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 80.0),
        "n_estimators": 1667
    }

    scores = []
    
    for train_idx, valid_idx in tscv.split(X):
        
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = xgb.XGBRegressor(early_stopping_rounds=50, **params)
        
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
        
        preds = model.predict(X_valid)
        r, _ = pearsonr(y_valid, preds)
        scores.append(r)

    return np.mean(scores)
    


# Create and run Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

#  results
print("Best Pearson correlation:", study.best_value)
print("Best Parameters:", study.best_params)


[I 2025-07-25 16:27:40,610] A new study created in memory with name: no-name-78039086-ffac-45b1-b0ee-e6f9ec1a261d
[I 2025-07-25 16:27:44,922] Trial 0 finished with value: 0.07071738634390724 and parameters: {'learning_rate': 0.031191328016175378, 'max_depth': 20, 'max_leaves': 21, 'min_child_weight': 28, 'subsample': 0.7023333460249905, 'colsample_bytree': 0.8110917139233953, 'colsample_bylevel': 0.9014535352460094, 'colsample_bynode': 0.4079119169579721, 'gamma': 3.146102448471719, 'reg_alpha': 7.290715600785997, 'reg_lambda': 43.83540744344116}. Best is trial 0 with value: 0.07071738634390724.
[I 2025-07-25 16:27:48,731] Trial 1 finished with value: 0.08036510828978448 and parameters: {'learning_rate': 0.033685055779316034, 'max_depth': 10, 'max_leaves': 8, 'min_child_weight': 30, 'subsample': 0.6198263124834795, 'colsample_bytree': 0.708074168870628, 'colsample_bylevel': 0.7615156634579913, 'colsample_bynode': 0.8765049148226713, 'gamma': 4.013010871098742, 'reg_alpha': 43.766569652

Best Pearson correlation: 0.11735128353020946
Best Parameters: {'learning_rate': 0.09258057678967159, 'max_depth': 3, 'max_leaves': 34, 'min_child_weight': 22, 'subsample': 0.5979983554389867, 'colsample_bytree': 0.9451406846342363, 'colsample_bylevel': 0.7268190491493042, 'colsample_bynode': 0.3357242329928344, 'gamma': 2.3660829037739393, 'reg_alpha': 16.635256680506135, 'reg_lambda': 5.266093237811045}


In [64]:
best_params = study.best_params

# Add required non-tuned params
best_params.update({
    "tree_method": "hist",
    "device": "cuda",
    "random_state": 42,
    "n_jobs": -1,
    "early_stopping_rounds": 50,
    "eval_metric": "rmse"
})

final_model = xgb.XGBRegressor(**best_params)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)


In [65]:
from sklearn.preprocessing import MinMaxScaler

In [66]:
scaler_y = MinMaxScaler()
scaler_y.fit(y_train.values.reshape(-1, 1))


In [67]:
y_train

2023-03-01 00:00:00    0.562539
2023-03-01 00:01:00    0.533686
2023-03-01 00:02:00    0.546505
2023-03-01 00:03:00    0.357703
2023-03-01 00:04:00    0.362452
                         ...   
2023-11-30 11:41:00    0.370834
2023-11-30 11:42:00    0.279049
2023-11-30 11:43:00    0.280720
2023-11-30 11:44:00    0.321828
2023-11-30 11:45:00    0.273688
Name: label, Length: 394415, dtype: float64

In [68]:
preds = final_model.predict(X_test)

In [69]:
scaled_predictions = scaler_y.fit(preds.reshape(-1, 1))
preds

array([ 0.0246935 ,  0.01550626,  0.01550626, ...,  0.01550626,
       -0.00460351,  0.0246935 ], dtype=float32)

In [70]:
submission_df = pd.read_csv("/kaggle/input/drw-crypto-market-prediction/sample_submission.csv")
if 'label' in submission_df.columns:
    submission_df = submission_df.drop(columns=['label'])
submission_df["prediction"] = preds
submission_df.to_csv("submission.csv", index=False)
    
print("\nSubmission file 'submission.csv' created successfully.")
print(submission_df.head())


Submission file 'submission.csv' created successfully.
   ID  prediction
0   1    0.024694
1   2    0.015506
2   3    0.015506
3   4   -0.006885
4   5    0.015506
