In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import gc
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, log_evaluation, record_evaluation
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
#from sklearn.impute import IterativeImputer
import pickle
import optuna
from optuna.visualization import plot_slice, plot_param_importances
import shap
import random

# Make sure the garbage collector is on for this memory-heavy session.
gc.enable()

# pandas display: show up to 500 columns and never truncate cell contents.
# (The earlier `pd.options.display.max_columns = None` was immediately overwritten
# by the call below, so only the effective setting is kept.)
#pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# polars display: show every column and allow long strings.
#pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

# Only surface Optuna warnings and errors (silences the per-trial INFO lines).
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
# Root folder of the competition data. Defaults to the original local location but can
# be redirected with the JANE_STREET_DATA_DIR environment variable so the notebook is
# runnable on other machines. os.path.join(..., '') guarantees a trailing separator
# because later cells build file names with plain string concatenation.
path = os.path.join(
    os.environ.get('JANE_STREET_DATA_DIR', 'I:/Kaggle/jane-street-real-time-market-data-forecasting/'),
    '',
)

In [3]:
# List the entries found under `path`.
os.listdir(path)

['features.csv',
 'kaggle_evaluation',
 'lags.parquet',
 'my_folder',
 'responders.csv',
 'sample_submission.csv',
 'team_folder',
 'test.parquet',
 'top_100000_rows_sorted_by_weight_descending.parquet',
 'top_10000_rows_sorted_by_weight_descending.parquet',
 'train.parquet']

In [4]:
# Load the training parquet, keeping responder_6 as the only responder column:
# every other responder and the partition key are dropped, then column dtypes are shrunk.
train_df = (
    pl.read_parquet(path + 'train.parquet/')
    .drop([f'responder_{i}' for i in range(9) if i != 6] + ['partition_id'])
    .select(pl.all().shrink_dtype())
)
print(train_df.shape)
train_df.head()

(47127338, 84)


date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_6
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,1,3.889038,,,,,,0.851033,0.242971,0.2634,-0.891687,11,7,76,-0.883028,0.003067,-0.744703,,-0.169586,,-1.335938,-1.707803,0.91013,,1.636431,1.522133,-1.551398,-0.229627,,,1.378301,-0.283712,0.123196,,,,0.28118,0.269163,0.349028,-0.012596,-0.225932,,-1.073602,,,-0.181716,,,,0.564021,2.088506,0.832022,,0.204797,,,-0.808103,,-2.037683,0.727661,,-0.989118,-0.345213,-1.36224,,,,,,-1.251104,-0.110252,-0.491157,-1.02269,0.152241,-0.659864,,,-0.261412,-0.211486,-0.335556,-0.281498,0.775981
0,0,7,1.370613,,,,,,0.676961,0.151984,0.192465,-0.521729,11,7,76,-0.865307,-0.225629,-0.582163,,0.317467,,-1.250016,-1.682929,1.412757,,0.520378,0.744132,-0.788658,0.641776,,,0.2272,0.580907,1.128879,,,,-1.512286,-1.414357,-1.823322,-0.082763,-0.184119,,,,,,,,,-10.835207,-0.002704,-0.621836,,1.172836,,,-1.625862,,-1.410017,1.063013,,0.888355,0.467994,-1.36224,,,,,,-1.065759,0.013322,-0.592855,-1.052685,-0.393726,-0.741603,,,-0.281207,-0.182894,-0.245565,-0.302441,0.703665
0,0,9,2.285698,,,,,,1.056285,0.187227,0.249901,-0.77305,11,7,76,-0.675719,-0.199404,-0.586798,,-0.814909,,-1.296782,-2.040234,0.639589,,1.597359,0.657514,-1.350148,0.364215,,,-0.017751,-0.317361,-0.122379,,,,-0.320921,-0.95809,-2.436589,0.070999,-0.245239,,,,,,,,,-1.420632,-3.515137,-4.67776,,0.535897,,,-0.72542,,-2.29417,1.764551,,-0.120789,-0.063458,-1.36224,,,,,,-0.882604,-0.072482,-0.617934,-0.86323,-0.241892,-0.709919,,,0.377131,0.300724,-0.106842,-0.096792,2.109352
0,0,10,0.690606,,,,,,1.139366,0.273328,0.306549,-1.262223,42,5,150,-0.694008,3.004091,0.114809,,-0.251882,,-1.902009,-0.979447,0.241165,,-0.392359,-0.224699,-2.129397,-0.855287,,,0.404142,-0.578156,0.105702,,,,0.544138,-0.087091,-1.500147,-0.201288,-0.038042,,,,,,,,,0.382074,2.669135,0.611711,,2.413415,,,1.313203,,-0.810125,2.939022,,3.988801,1.834661,-1.36224,,,,,,-0.697595,1.074309,-0.206929,-0.530602,4.765215,0.571554,,,-0.226891,-0.251412,-0.215522,-0.296244,1.114137
0,0,14,0.44057,,,,,,0.9552,0.262404,0.344457,-0.613813,44,3,16,-0.947351,-0.030018,-0.502379,,0.646086,,-1.844685,-1.58656,-0.182024,,-0.969949,-0.673813,-1.282132,-1.399894,,,0.043815,-0.320225,-0.031713,,,,-0.08842,-0.995003,-2.635336,-0.196461,-0.618719,,,,,,,,,-2.0146,-2.321076,-3.711265,,1.253902,,,0.476195,,-0.771732,2.843421,,1.379815,0.411827,-1.36224,,,,,,-0.948601,-0.136814,-0.447704,-1.141761,0.099631,-0.661928,,,3.678076,2.793581,2.61825,3.418133,-3.57282


In [5]:
# First 2,000,000 rows of train_df; show its shape and its last rows.
slice_df = train_df.head(2000000)
print(slice_df.shape)
slice_df.tail()

(2000000, 84)


date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_6
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
173,457,30,0.567516,,,,,,0.375326,-0.319825,1.029079,-0.15302,81,2,534,0.606772,-0.284212,0.058182,0.173863,1.345357,0.574207,1.142812,-2.047774,-0.387229,,-1.226608,-1.43103,-1.89651,-0.441322,,,0.491788,-0.187928,0.266939,,-1.159581,-1.975904,-0.670448,-0.407861,0.379695,0.570265,0.409624,0.88381,2.234918,1.323364,0.592798,2.19508,2.312556,-0.113432,-0.1772,-1.286193,0.101208,0.865276,1.319221,0.260566,1.276351,0.438635,1.550548,-0.135334,-0.379588,0.206222,-0.176699,-0.005426,0.825014,0.060741,0.936034,2.569681,1.665185,1.096207,0.255152,0.993454,-0.312718,-0.033129,0.1425,-0.381973,0.140116,0.38296,0.327697,-0.148496,-0.144118,1.023528,0.615022,1.70001
173,457,33,1.315456,,,,,,0.343326,-0.418899,1.237633,-0.329882,11,7,76,-0.705248,-0.270001,-0.434605,0.362582,-0.184644,0.215689,1.149253,-0.235989,1.786524,,-0.065922,-0.068221,-1.776328,-0.752469,,,0.379974,-0.20868,0.448382,,-0.70977,-0.757513,-0.774077,-0.057243,0.405395,0.384512,0.744361,0.868108,0.171783,-0.552638,1.048226,0.633133,0.756537,-1.371079,-0.876449,-0.06624,0.177198,1.023339,0.750569,0.513293,1.591298,0.31925,1.463332,-0.424356,-0.946484,-1.00053,-0.038264,-0.036337,1.019238,0.060741,0.831873,0.32249,0.216685,1.262692,-0.581339,-0.557206,-0.307113,-0.418475,-0.828474,-0.271117,-0.302529,-0.044075,-0.071534,-0.343982,-0.238794,0.121667,0.067188,-0.071422
173,457,38,1.770896,,,,,,0.351517,-0.208007,0.574333,-0.234759,50,1,522,-0.735548,-0.250054,-0.043207,0.014195,0.071237,-0.12816,-1.567486,-1.00677,1.128188,,0.322964,0.278065,-0.874487,-0.06031,,,1.181455,-0.336926,-0.051936,,-0.654526,0.624864,-0.217913,-0.523524,2.158123,-1.234442,-0.802911,0.865537,1.034676,1.833992,1.592091,1.325713,1.018681,1.224255,2.200923,2.230081,0.805427,3.317573,1.912813,-0.581211,2.239192,0.667625,-1.103449,-0.319881,1.069179,-0.186434,2.065614,0.749513,3.983626,0.060741,-0.061943,0.278523,0.150814,-0.141675,1.52034,-0.685136,-0.17973,-0.472897,-0.451451,-0.178881,0.223801,-0.081331,-0.069009,0.886316,1.006435,0.338686,0.435197,-0.347843
173,458,0,1.807736,,,,,,0.614981,0.635133,1.882468,-0.327886,11,7,76,-0.709815,-0.507204,-0.473125,0.205299,-0.039872,-0.092011,0.573536,-0.004837,0.386396,,0.923311,0.237342,-1.558329,-1.313407,,,1.581155,-0.396477,0.22506,,-0.185647,0.817457,-0.592454,0.088044,0.640236,0.021879,0.015603,0.119185,1.737825,0.890449,1.691442,1.162424,1.314714,0.243937,-0.894177,0.510681,-0.176766,0.204046,0.427734,-1.011595,-1.091748,0.288567,0.841142,-0.674711,-0.35751,-0.212526,0.002593,-0.500916,-0.133224,0.060741,0.104819,-0.31754,-0.378839,-1.422157,0.593636,-0.904677,-0.262607,-0.460612,-0.605673,-0.581912,-0.253963,-0.205238,-0.229314,-0.362538,-0.271852,-0.251717,-0.257713,-1.23062
173,458,1,2.91537,,,,,,0.488421,0.772387,1.515856,-0.251704,11,7,76,-0.972408,-0.486752,-0.628382,-0.012652,-0.574241,-0.08046,-0.601459,-0.369373,0.625507,,1.440213,1.144646,-2.168929,-0.764314,,,1.78276,-0.608452,-0.010369,,-0.811327,0.229316,-0.490882,-0.606857,-0.094037,-0.321685,-0.066831,-0.022027,-0.406428,-0.676852,1.428394,-0.044953,0.968642,-0.465706,0.638096,0.262707,-9e-06,0.63441,0.675668,0.357317,1.101804,1.709052,1.386229,0.743044,-0.235305,0.073975,0.403365,0.160144,0.485487,0.060741,-0.145374,0.042278,-0.047062,-1.013508,0.102645,-0.671402,-0.323251,-0.573669,-0.992679,-0.363933,-0.484326,-0.232865,-0.26996,-0.251529,-0.271913,-0.203798,-0.37782,0.234755


In [5]:
# The date_id column of the training frame as a standalone Series.
date_id_sr = train_df.get_column('date_id')

In [6]:
# date_id values of the last half of the rows (half measured by row count).
half_sr = date_id_sr[-(len(date_id_sr) // 2):]

In [7]:
# Check whether the second half of the rows starts at date_id 1060.
half_sr[0] == 1060

True

In [8]:
# Estimated size of train_df divided by 1e9 (gigabytes, given that estimated_size() reports bytes by default).
train_df.estimated_size() / 1e9

15.282841602

In [9]:
# Output folder for this run's artifacts; created (with parents) if it does not exist yet.
# exist_ok=True replaces the separate exists() check and avoids the check-then-create race.
models_path = path + 'my_folder/models/20250109_01/'
os.makedirs(models_path, exist_ok=True)

In [10]:
def lgb_sliding_window(train_data, optuna_n_trials, n_train_rows=1000000, seed=None):
    """Tune LightGBM hyper-parameters with Optuna on randomly placed time windows.

    Every trial picks a random ``date_id`` from the range that starts at the first
    ``date_id`` of the second half of ``train_data`` (half measured by row count) and
    ends at the largest ``date_id``. The model is validated on the
    ``date_id_window_size`` dates ending at that ``date_id`` and trained on a random
    sample of up to ``n_train_rows`` rows taken from the ``window_size`` rows that
    precede the validation window. Both window sizes are tuned together with the
    LightGBM parameters.

    Parameters
    ----------
    train_data : polars.DataFrame
        Frame with the columns ``date_id``, ``time_id``, ``symbol_id``, ``weight``,
        ``responder_6`` plus the feature columns. All other columns are used as features.
    optuna_n_trials : int
        Number of Optuna trials to run.
    n_train_rows : int, default 1000000
        Maximum number of rows sampled for training in each trial. Capped at the
        number of rows actually available so small frames do not raise.
    seed : int or None, default None
        When given, seeds the validation-date choice, the row sampling, the Optuna
        sampler and LightGBM's ``random_state``. ``None`` keeps the unseeded behaviour
        (the date choice then uses the module-level ``random`` generator).

    Returns
    -------
    optuna.study.Study
        The study, minimising the validation ``l2`` reported by LightGBM (the
        validation weights are passed through ``eval_sample_weight``). If the search
        is interrupted with Ctrl-C / "interrupt kernel", the study holding the trials
        finished so far is still returned.
    """
    key_cols = ['date_id', 'time_id', 'symbol_id']
    non_feature_cols = key_cols + ['weight', 'responder_6']

    # Candidate validation end dates are derived from the frame that was passed in,
    # not from notebook globals, so the function also works on subsets of the data.
    date_id_sr = train_data['date_id']
    first_candidate_date_id = date_id_sr[-(len(date_id_sr) // 2):][0]
    unique_date_ids = list(range(first_candidate_date_id, date_id_sr.max() + 1))
    date_ids_df = date_id_sr.to_frame()

    # With a seed use a private generator; without one keep using the module-level one.
    rng = random.Random(seed) if seed is not None else random
    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None

    def objective(trial):
        window_size = trial.suggest_int('window_size', 7000000, 10000000)
        date_id = rng.choice(unique_date_ids)
        date_id_window_size = trial.suggest_int('date_id_window_size', 10, 100)
        sample_seed = rng.randrange(2 ** 32) if seed is not None else None

        # Validation rows: the `date_id_window_size` dates ending at `date_id`
        # (or simply the first dates when `date_id` is smaller than the window).
        if date_id < date_id_window_size:
            date_id_df = train_data.filter(pl.col('date_id') < date_id_window_size)
        else:
            date_id_df = train_data.filter(
                (pl.col('date_id') > date_id - date_id_window_size)
                & (pl.col('date_id') <= date_id)
            )

        # Training candidates: the last `window_size` rows before the validation window.
        n_prior_rows = date_ids_df.filter(
            pl.col('date_id') < date_id - date_id_window_size
        ).shape[0]
        if n_prior_rows < window_size:
            # Not enough history: take the head of the frame and remove the validation rows.
            # NOTE(review): this head can contain rows dated after the validation
            # window, i.e. future data relative to validation - confirm this is intended.
            candidates_df = train_data[:window_size + date_id_df.shape[0]].join(
                date_id_df, on=key_cols, how='anti'
            )
        else:
            candidates_df = train_data.filter(
                pl.col('date_id') < date_id - date_id_window_size + 1
            )[-window_size:]

        # Sampling more rows than available raises in polars, hence the cap.
        window_df = candidates_df.sample(
            n=min(n_train_rows, candidates_df.shape[0]), seed=sample_seed
        ).sort(by=key_cols)

        base_params = {
            'verbosity': -1,
            'device': 'gpu',
            'early_stopping_round': 20,
        }

        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.07),
            'max_depth': trial.suggest_int('max_depth', 10, 25),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 300),
            'num_leaves': trial.suggest_int('num_leaves', 50, 10000),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 0.3),
            'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
            'lambda_l2': trial.suggest_float('lambda_l2', 50, 2000),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.8, 1),
        }

        # n_estimators is deliberately huge; early stopping decides the real tree count.
        model = LGBMRegressor(
            **base_params,
            **params_to_tune,
            n_estimators=100000,
            random_state=seed,
        )

        X_train = window_df.drop(non_feature_cols).select(pl.all().shrink_dtype()).to_pandas()
        X_val = date_id_df.drop(non_feature_cols).select(pl.all().shrink_dtype()).to_pandas()

        y_train = window_df['responder_6'].to_pandas()
        y_val = date_id_df['responder_6'].to_pandas()

        weights_train = window_df['weight'].to_pandas()
        weights_val = date_id_df['weight'].to_pandas()

        # The second eval set is the validation window, reported by LightGBM as 'valid_1'.
        model.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_sample_weight=[weights_train, weights_val],
        )

        return model.best_score_['valid_1']['l2']

    study = optuna.create_study(direction="minimize", sampler=sampler)

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar once per finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        try:
            study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])
        except KeyboardInterrupt:
            # Keep the hours of finished trials instead of losing the study object.
            print(f'Optimization interrupted after {len(study.trials)} trials; returning the partial study.')

    return study

In [11]:
# Run the Optuna search on train_df for 300 trials.
lgb_study = lgb_sliding_window(train_df, 300)

[W 2025-01-09 05:50:43,467] Trial 150 failed with parameters: {'window_size': 9213705, 'date_id_window_size': 16, 'learning_rate': 0.0352658871858313, 'max_depth': 25, 'min_data_in_leaf': 59, 'num_leaves': 8009, 'min_gain_to_split': 0.10178316334575258, 'lambda_l1': 4.4051458764237035, 'lambda_l2': 725.4384255379523, 'feature_fraction': 0.9722056779875711} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "I:\Kaggle\kaggle_venvs\ml\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\peppe\AppData\Local\Temp\ipykernel_26224\2671156396.py", line 66, in objective
    model.fit(X_train, y_train, sample_weight=weights_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_sample_weight=[weights_train, weights_val])#, callbacks=[log_evaluation(period=10)])
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [None]:
# Draw one Optuna slice plot per tuned parameter.
for param_name in lgb_study.best_params:
    plot_slice(lgb_study, params=[param_name]).show()

In [None]:
# Show Optuna's parameter-importance plot for the study.
plot_param_importances(lgb_study)

In [None]:
# Parameters of the best trial.
lgb_study.best_params

In [None]:
# Objective value reached by the best trial.
lgb_study.best_value

In [None]:
# Print every best parameter as "name value", one per line.
best_params = lgb_study.best_params
for param_name in best_params:
    print(param_name, best_params[param_name])

In [None]:
# One-row pandas frame with a column per best parameter.
lgb_params_df = pd.DataFrame(
    {param_name: [param_value] for param_name, param_value in lgb_study.best_params.items()}
)

In [None]:
# Display the one-row frame of best parameters.
lgb_params_df

In [None]:
# Write the best parameters next to the models, without the index column.
lgb_params_df.to_csv(f'{models_path}lgb_params.csv', index=False)