In [1]:
import cudf as pd #Change1
import numpy as np
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBRegressor
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')

print('We will use RAPIDS version',pd.__version__)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error

# Read the competition training table (via the `pd` alias imported above).
train = pd.read_csv(
    '/kaggle/input/optiver-trading-at-the-close/train.csv'
)
# Larger default font for every matplotlib figure drawn later.
plt.rcParams['font.size'] = 16

We will use RAPIDS version 23.10.00


In [2]:
# Model inputs: every column of `train` other than row_id, time_id,
# date_id and the target itself.
features = [
    name
    for name in train.columns
    if name not in ('row_id', 'time_id', 'date_id', 'target')
]
print(features)

['stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']


In [3]:
# Split of the feature names into categorical-like and numeric columns.
cat_cols = [
    'stock_id',
    'imbalance_buy_sell_flag',
]
num_cols = [
    'seconds_in_bucket',
    'imbalance_size',
    'reference_price',
    'matched_size',
    'far_price',
    'near_price',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
]


In [4]:
# Impute missing values in `train`, report what is still missing, then drop
# the rows that could not be filled.

# far_price / near_price get constant fills (0 and 1 respectively).
# The result is assigned back explicitly: `train[col].fillna(..., inplace=True)`
# mutates a temporary column selection and only reaches `train` when that
# selection happens to be a view, which is not guaranteed (e.g. under
# copy-on-write semantics).
train['far_price'] = train['far_price'].fillna(0)
train['near_price'] = train['near_price'].fillna(1)

# Every other listed column is filled with the median of that column over the
# rows sharing the same (date_id, seconds_in_bucket).
# NOTE(review): the list also contains stock_id, imbalance_buy_sell_flag and the
# grouping key seconds_in_bucket; a median fill is questionable for those --
# confirm they never contain nulls.
cols_group_by = ['date_id', 'seconds_in_bucket']
cols_fill_nan = [
    'imbalance_size', 'reference_price', 'matched_size', 'wap',
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']
train_grouped_median = train.groupby(cols_group_by)[cols_fill_nan].transform('median')

train[cols_fill_nan] = train[cols_fill_nan].fillna(train_grouped_median)

# Remaining null counts per column; in the recorded run only `target`
# still had nulls (88 rows), which the dropna below removes.
display(train.isnull().sum())
print(f"before drop dataset size: {len(train)}")
train = train.dropna()
print(f"after drop dataset size: {len(train)}")
                                                                                

stock_id                    0
date_id                     0
seconds_in_bucket           0
imbalance_size              0
imbalance_buy_sell_flag     0
reference_price             0
matched_size                0
far_price                   0
near_price                  0
bid_price                   0
bid_size                    0
ask_price                   0
ask_size                    0
wap                         0
target                     88
time_id                     0
row_id                      0
dtype: int64

before drop dataset size: 5237980
after drop dataset size: 5237892


In [5]:
# Independent (deep, the default for .copy()) copies of the feature columns
# and the target taken from `train`.
# Note: the cross-validation loop further down rebinds X_train / y_train.
X_train = train[features].copy()
y_train = train['target'].copy()

In [6]:
# Replace any remaining missing entries in the features and target with 0.
X_train = X_train.fillna(0)
y_train = y_train.fillna(0)





In [7]:
# Convert `train` to pandas ONCE and derive everything from that single copy;
# the previous version called train.to_pandas() three times, repeating the
# full conversion of the whole frame for each of X, y and groups.
train_pd = train.to_pandas()

X = train_pd.drop(columns=['target'])  # all columns except the target
y = train_pd['target'].copy()          # target variable
groups = train_pd['date_id'].copy()    # date_id of each row, used as the CV group label

# The intermediate frame is no longer needed; X, y and groups own their data.
del train_pd

In [8]:
# Give `groups` a fresh 0..n-1 index, discarding the old index values,
# then display it as the cell output.
groups = groups.reset_index(drop=True)

groups

0            0
1            0
2            0
3            0
4            0
          ... 
5237887    480
5237888    480
5237889    480
5237890    480
5237891    480
Name: date_id, Length: 5237892, dtype: int64

In [18]:

from tqdm.notebook import tqdm

# Cross-validate an XGBoost regressor with GroupKFold, using `groups` as the
# group label so that rows sharing a group value never sit on both sides of a
# split. For every fold we record the number of trees kept by early stopping
# and the per-feature importances, then average the importances over folds.
# NOTE(review): GroupKFold does not keep folds in chronological order, and
# TimeSeriesSplit is imported but unused -- confirm look-ahead is acceptable.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

best_iteration_xgb = []
importance_dict = {}
fold_importances = []  # one importance frame per fold, concatenated once below

xgb_params = {
    #'booster': 'gbtree',
    'objective': 'reg:absoluteerror',
    'tree_method': 'gpu_hist', #Change4
    'eval_metric':'mae',
    'learning_rate': 0.01,
    #'alpha': 8,
    'max_depth': 6,
    'n_estimators': 1000,
    'early_stopping_rounds': 90,
    #'subsample':0.8,
    #'colsample_bytree': 0.5,
    'seed': 42
}

# Perform the split.
# tqdm wraps the split generator (not enumerate) and is told the fold count,
# so the bar can show real progress instead of "0it [00:00, ?it/s]".
for i, (train_index, test_index) in enumerate(
        tqdm(gkf.split(X, y, groups), total=n_splits, desc='CV folds')):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # TRAIN MODEL: the held-out fold doubles as the early-stopping eval set.
    reg = XGBRegressor(**xgb_params)
    reg.fit(X_train[features], y_train.values,
            eval_set=[(X_test[features], y_test.values)],
            verbose=10)
    # Number of trees up to and including the best round. Equivalent to the
    # former `best_ntree_limit` (with one tree per round), which newer
    # XGBoost releases no longer provide.
    best_iteration_xgb.append(reg.best_iteration + 1)

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = reg.feature_importances_
    fold_importance_df["fold"] = i + 1
    fold_importances.append(fold_importance_df)

# Single concat instead of growing the frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)
feature_importance_df = feature_importance_df.groupby(['feature'])['importance'].agg(['mean']).sort_values(by='mean', ascending=False)
feature_importance_df

0it [00:00, ?it/s]

[0]	validation_0-mae:6.37997
[10]	validation_0-mae:6.34804
[20]	validation_0-mae:6.34515
[30]	validation_0-mae:6.34540
[40]	validation_0-mae:6.34974
[50]	validation_0-mae:6.35375
[60]	validation_0-mae:6.35825
[70]	validation_0-mae:6.36277
[80]	validation_0-mae:6.36592
[90]	validation_0-mae:6.36949
[100]	validation_0-mae:6.37273
[110]	validation_0-mae:6.37592
[112]	validation_0-mae:6.37639
[0]	validation_0-mae:6.50231
[10]	validation_0-mae:6.46847
[20]	validation_0-mae:6.46491
[30]	validation_0-mae:6.46746
[40]	validation_0-mae:6.46959
[50]	validation_0-mae:6.47364
[60]	validation_0-mae:6.47583
[70]	validation_0-mae:6.48240
[80]	validation_0-mae:6.48589
[90]	validation_0-mae:6.49011
[100]	validation_0-mae:6.49463
[105]	validation_0-mae:6.49597
[0]	validation_0-mae:6.41681
[10]	validation_0-mae:6.38182
[20]	validation_0-mae:6.38016
[30]	validation_0-mae:6.38120
[40]	validation_0-mae:6.38525
[50]	validation_0-mae:6.39261
[60]	validation_0-mae:6.39644
[70]	validation_0-mae:6.39909
[80]	val

Unnamed: 0_level_0,mean
feature,Unnamed: 1_level_1
bid_size,0.189837
ask_size,0.170331
imbalance_buy_sell_flag,0.094566
wap,0.080076
seconds_in_bucket,0.071004
bid_price,0.060143
ask_price,0.059133
reference_price,0.052605
imbalance_size,0.048951
matched_size,0.046431


In [19]:
# Per-fold tree counts recorded during cross-validation (one entry per fold).
best_iteration_xgb

[23, 17, 18, 15, 18]

In [41]:
# Tree budget for the final model: median of the per-fold values plus one.
# NOTE(review): the per-fold values are already tree counts, so the +1 adds
# one extra tree -- confirm this margin is intended.
n_est = int(np.median(best_iteration_xgb) + 1)

# Same settings as the cross-validation run, but with the fixed tree budget
# above and no early stopping (there is no validation set at this stage).
# booster, alpha, subsample and colsample_bytree stay at the XGBoost defaults.
xgb_params = dict(
    objective='reg:absoluteerror',
    tree_method='gpu_hist',  # Change4
    eval_metric='mae',
    learning_rate=0.01,
    max_depth=6,
    n_estimators=n_est,
    seed=42,
)



In [42]:
# Fit the final regressor on every row of X (feature columns only) against y.
reg = XGBRegressor(**xgb_params)
reg.fit(X=X.loc[:, features], y=y, verbose=1)