In [1]:
import cudf as pd #Change1
import numpy as np
import pandas as pdx
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBRegressor
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')

print('We will use RAPIDS version',pd.__version__)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error

train = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
plt.rcParams.update({'font.size': 16})

We will use RAPIDS version 23.10.01


In [2]:
features = [col for col in train.columns if col not in ['row_id', 'time_id', 'date_id', 'target']]
print(features)

['stock_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']


In [3]:
cat_cols = ['stock_id', 'imbalance_buy_sell_flag']
num_cols = ['seconds_in_bucket', 'imbalance_size',  'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size', 'wap']


cols_fill_nan = [
    'imbalance_size', 'reference_price', 'matched_size', 'wap',
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag']

In [4]:
train['far_price'].fillna(0, inplace=True)
train['near_price'].fillna(1, inplace=True)
train['bid_size_not_norm']=train['bid_size']
train['ask_size_not_norm']=train['ask_size']


cols_group_by = ['date_id', 'seconds_in_bucket']
cols_fill_nan = [
    'imbalance_size', 'reference_price', 'matched_size', 'wap',
    'bid_price', 'bid_size', 'ask_price', 'ask_size', 
    'stock_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'bid_size_not_norm', 'ask_size_not_norm']
train_grouped_median = train.groupby(cols_group_by)[cols_fill_nan].transform('median')

train[cols_fill_nan] = train[cols_fill_nan].fillna(train_grouped_median)

display(train.isnull().sum())
print(f"before drop dataset size: {len(train)}")
train.dropna(inplace=True)
print(f"after drop dataset size: {len(train)}")
                                                                                

stock_id                    0
date_id                     0
seconds_in_bucket           0
imbalance_size              0
imbalance_buy_sell_flag     0
reference_price             0
matched_size                0
far_price                   0
near_price                  0
bid_price                   0
bid_size                    0
ask_price                   0
ask_size                    0
wap                         0
target                     88
time_id                     0
row_id                      0
bid_size_not_norm           0
ask_size_not_norm           0
dtype: int64

before drop dataset size: 5237980
after drop dataset size: 5237892


In [5]:
from sklearn.preprocessing import RobustScaler, QuantileTransformer

X = train.to_pandas().drop(columns=['target'])  # Your features
#X[cat_cols] = X[cat_cols].astype("category")
y = train.to_pandas()['target']  # Your target variable

num_cols = [feature for feature in features if feature not in cat_cols]
print(len(num_cols))

X[num_cols] = X[num_cols].astype('float32')


robust_scaler = QuantileTransformer(output_distribution='normal')
X[num_cols] = robust_scaler.fit_transform(X[num_cols])

groups = train.to_pandas()['date_id'] # Extracting just the 'time_id' column for grouping

11


In [6]:
groups.reset_index(inplace=True, drop=True)

groups

0            0
1            0
2            0
3            0
4            0
          ... 
5237887    480
5237888    480
5237889    480
5237890    480
5237891    480
Name: date_id, Length: 5237892, dtype: int64

In [7]:

from tqdm.notebook import tqdm

# Initialize GroupKFold
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

best_iteration_xgb = []
importance_dict = {}
feature_importance_df = pdx.DataFrame()

models = []

xgb_params = {
    #'booster': 'gbtree',
    'objective': 'reg:absoluteerror',
    'tree_method': 'hist', #Change4
    'eval_metric':'mae',
    'learning_rate': 0.01,
    'grow_policy': 'lossguide',
    #'alpha': 8,
    'max_depth': 6,
    'n_estimators': 10000,
    'early_stopping_rounds': 90,
    #'subsample':0.8,
    #'colsample_bytree': 0.5,
    'seed': 42
}

# Perform the split
for i, (train_index, test_index) in enumerate(gkf.split(X, y, groups)):
    print("==========={i}===========")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # TRAIN MODEL        
    reg =  XGBRegressor( device="cuda" , **xgb_params)

    reg.fit(X_train[features], y_train.values,
            eval_set=[(X_test[features], y_test.values)],
            verbose=200)
    models.append(reg)
    best_iteration_xgb.append(reg.best_iteration)
    
    fold_importance_df = pdx.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = reg.feature_importances_
    fold_importance_df["fold"] = i + 1
    feature_importance_df = pdx.concat([feature_importance_df, fold_importance_df], axis=0)

feature_importance_df = feature_importance_df.groupby(['feature'])['importance'].agg(['mean']).sort_values(by='mean', ascending=False)
feature_importance_df

[0]	validation_0-mae:6.43606
[200]	validation_0-mae:6.36772
[400]	validation_0-mae:6.34863
[600]	validation_0-mae:6.33989
[800]	validation_0-mae:6.33486
[1000]	validation_0-mae:6.33022
[1200]	validation_0-mae:6.32701
[1400]	validation_0-mae:6.32400
[1600]	validation_0-mae:6.32156
[1800]	validation_0-mae:6.31985
[2000]	validation_0-mae:6.31855
[2200]	validation_0-mae:6.31722
[2400]	validation_0-mae:6.31612
[2600]	validation_0-mae:6.31521
[2800]	validation_0-mae:6.31436
[3000]	validation_0-mae:6.31374
[3200]	validation_0-mae:6.31306
[3400]	validation_0-mae:6.31245
[3600]	validation_0-mae:6.31203
[3800]	validation_0-mae:6.31154
[4000]	validation_0-mae:6.31113
[4200]	validation_0-mae:6.31066
[4400]	validation_0-mae:6.31029
[4600]	validation_0-mae:6.31005
[4800]	validation_0-mae:6.30979
[5000]	validation_0-mae:6.30958
[5200]	validation_0-mae:6.30945
[5400]	validation_0-mae:6.30919
[5600]	validation_0-mae:6.30901
[5800]	validation_0-mae:6.30884
[6000]	validation_0-mae:6.30864
[6200]	validati

Unnamed: 0_level_0,mean
feature,Unnamed: 1_level_1
ask_size,0.172057
bid_size,0.168658
imbalance_buy_sell_flag,0.094896
seconds_in_bucket,0.072854
wap,0.068957
bid_price,0.067881
ask_price,0.065639
reference_price,0.060084
matched_size,0.048515
imbalance_size,0.048513


In [8]:
best_iteration_xgb

[6935, 5481, 7308, 6016, 7328]

In [9]:
n_est = int(np.median(best_iteration_xgb)+1)
xgb_params = {
    #'booster': 'gbtree',
    'objective': 'reg:absoluteerror',
    'tree_method': 'hist', #Change4
    'eval_metric':'mae',
    'learning_rate': 0.01,
    'grow_policy': 'lossguide',
    #'alpha': 8,
    'max_depth': 6,
    'n_estimators': n_est,

    #'subsample':0.8,
    #'colsample_bytree': 0.5,
    'seed': 42
}



In [10]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, valid_idx = next(gss.split(X, groups=X['date_id']))

train_X = X.iloc[train_idx]
valid_X = X.iloc[valid_idx]

train_y = y.iloc[train_idx]
valid_y = y.iloc[valid_idx]



In [11]:
reg =  XGBRegressor(device="cuda", **xgb_params)
reg.fit(train_X[features], train_y, eval_set=[(valid_X[features], valid_y.values)],
            verbose=200)


[0]	validation_0-mae:6.18835
[200]	validation_0-mae:6.12209
[400]	validation_0-mae:6.10476
[600]	validation_0-mae:6.09739
[800]	validation_0-mae:6.09275
[1000]	validation_0-mae:6.08856
[1200]	validation_0-mae:6.08562
[1400]	validation_0-mae:6.08280
[1600]	validation_0-mae:6.08019
[1800]	validation_0-mae:6.07846
[2000]	validation_0-mae:6.07714
[2200]	validation_0-mae:6.07584
[2400]	validation_0-mae:6.07492
[2600]	validation_0-mae:6.07409
[2800]	validation_0-mae:6.07331
[3000]	validation_0-mae:6.07276
[3200]	validation_0-mae:6.07215
[3400]	validation_0-mae:6.07158
[3600]	validation_0-mae:6.07114
[3800]	validation_0-mae:6.07076
[4000]	validation_0-mae:6.07038
[4200]	validation_0-mae:6.07001
[4400]	validation_0-mae:6.06974
[4600]	validation_0-mae:6.06944
[4800]	validation_0-mae:6.06924
[5000]	validation_0-mae:6.06901
[5200]	validation_0-mae:6.06883
[5400]	validation_0-mae:6.06869
[5600]	validation_0-mae:6.06851
[5800]	validation_0-mae:6.06836
[6000]	validation_0-mae:6.06824
[6200]	validati

In [12]:
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out
    
y_min, y_max = -64, 64

# Making predictions on the test set
predictions = reg.predict(valid_X[features])
predictions_to_clip = zero_sum(predictions, valid_X['bid_size_not_norm'] + valid_X['ask_size_not_norm'])

clipped_predictions = np.clip(predictions_to_clip, y_min, y_max)


# Calculate the mean absolute and squared error
print("Mean Absolute Error:", mean_absolute_error(valid_y, predictions))
print("Mean Relative Error:", mean_absolute_percentage_error(valid_y, predictions))

print("Mean Absolute Error CP:", mean_absolute_error(valid_y, clipped_predictions))
print("Mean Relative Error CP:", mean_absolute_percentage_error(valid_y, clipped_predictions))

Mean Absolute Error: 6.068054336514252
Mean Relative Error: 2389667017144.372
Mean Absolute Error CP: 6.067656325515407
Mean Relative Error CP: 2379568568383.4927


In [13]:
reg =  XGBRegressor( device="cuda", **xgb_params)
reg.fit(X[features], y,
            verbose=1)

In [14]:

import xgboost as xgb

reg.feature_importances_

array([0.03790003, 0.07318993, 0.04607042, 0.09661287, 0.05918073,
       0.04598477, 0.04018564, 0.04483659, 0.06681271, 0.173595  ,
       0.06525116, 0.18054007, 0.06984007], dtype=float32)

In [15]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()


from tqdm.auto import tqdm

TRAIN_TARGET = "target"

def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out
    
y_min, y_max = -64, 64

# Making predictions on the test set

train_grouped_median_pd = train_grouped_median.to_pandas()


counter = 0
for (df_test, revealed_targets, sample_prediction) in tqdm(iter_test):
#     display(df_test); display(revealed_targets); display(sample_prediction)
    test_nan_ = df_test.isnull().sum()
    df_test['far_price'].fillna(0, inplace=True)
    df_test['near_price'].fillna(1, inplace=True)
    df_test['bid_size_not_norm']=df_test['bid_size']
    df_test['ask_size_not_norm']=df_test['ask_size']
    
    df_test[cols_fill_nan] = df_test[cols_fill_nan].fillna(train_grouped_median_pd)
    
    test_nan = pdx.DataFrame(dict(before=test_nan_, after=df_test.isnull().sum()))
    
    #df_test[cat_cols] = df_test[cat_cols].astype("category")
    df_test[num_cols] = df_test[num_cols].astype('float32')

    
    df_test[num_cols] = robust_scaler.fit_transform(df_test[num_cols])
    
    X_test = df_test[features]
    #X_test = scaler.transform(X_test)

    df_test[TRAIN_TARGET] = reg.predict(X_test)
    if counter < 5:
        display(df_test.head())
        display(test_nan.T)

    preds = zero_sum(df_test[TRAIN_TARGET], df_test['bid_size_not_norm'] + df_test['ask_size_not_norm'])
    clipped_predictions = np.clip(preds, y_min, y_max)
    sample_prediction['target'] = clipped_predictions
    prediction = sample_prediction
    env.predict(prediction)
    counter += 1

ModuleNotFoundError: No module named 'optiver2023'