In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import gc
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, log_evaluation, record_evaluation
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
#from sklearn.impute import IterativeImputer
import pickle
import optuna
import optunahub
from optuna.visualization import plot_slice, plot_param_importances
import shap
import random

# Make sure the cyclic garbage collector is on; large frames are freed explicitly later.
gc.enable()

# pandas display: show up to 500 columns and never truncate cell contents.
# (A previous `pd.options.display.max_columns = None` was immediately overridden
# by this call, so only the effective setting is kept.)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', None)

# polars display: a negative column count prints every column; allow long strings.
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

polars.config.Config

In [2]:
# Root directory of the competition data. Defaults to the original local location
# but can be redirected with the JANE_STREET_DATA_DIR environment variable so the
# notebook runs on other machines without editing code.
path = os.environ.get('JANE_STREET_DATA_DIR') or 'I:/Kaggle/jane-street-real-time-market-data-forecasting/'
# Later cells build file names with `path + '...'`, so guarantee a trailing separator.
if not path.endswith(('/', '\\')):
    path += '/'

In [3]:
# os.listdir returns entries in arbitrary order; sort for a stable, comparable listing.
sorted(os.listdir(path))

['db.sqlite3',
 'features.csv',
 'imputed_train_ffill.parquet',
 'kaggle_evaluation',
 'lags.parquet',
 'my_folder',
 'responders.csv',
 'sample_submission.csv',
 'team_folder',
 'test.parquet',
 'top_100000_rows_sorted_by_weight_descending.parquet',
 'top_10000_rows_sorted_by_weight_descending.parquet',
 'train.parquet']

In [4]:
# Load the full training set and downcast every column via shrink_dtype.
train_df = pl.read_parquet(path + 'train.parquet/').select(pl.all().shrink_dtype())

# Lag frame: adding 1 to date_id makes the responders of day d line up with the
# rows of day d+1 on the (date_id, time_id, symbol_id) join keys below.
feature_cols = [name for name in train_df.columns if 'feature' in name]
lag_names = {f'responder_{i}': f'responder_{i}_lag_1' for i in range(9)}
lags_df = (
    train_df
    .with_columns(pl.col('date_id') + 1)
    .drop(['weight', 'partition_id'] + feature_cols)
    .rename(lag_names)
)

# Keep responder_6 as the only same-day responder; drop the others and partition_id.
unused_cols = [f'responder_{i}' for i in range(9) if i != 6] + ['partition_id']
train_df = train_df.drop(unused_cols).select(pl.all().shrink_dtype())

# Left join keeps every training row; rows without a previous-day match get null lags.
train_df = (
    train_df
    .join(lags_df, on=['date_id', 'time_id', 'symbol_id'], how='left')
    .select(pl.all().shrink_dtype())
)

# Drop the lag frame and collect garbage before displaying the joined frame.
del lags_df
gc.collect()
train_df

date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_6,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
0,0,1,3.889038,,,,,,0.851033,0.242971,0.2634,-0.891687,11,7,76,-0.883028,0.003067,-0.744703,,-0.169586,,-1.335938,-1.707803,0.91013,,1.636431,1.522133,-1.551398,-0.229627,,,1.378301,-0.283712,0.123196,,,,0.28118,0.269163,0.349028,-0.012596,-0.225932,,-1.073602,,,-0.181716,,,,0.564021,2.088506,0.832022,,0.204797,,,-0.808103,,-2.037683,0.727661,,-0.989118,-0.345213,-1.36224,,,,,,-1.251104,-0.110252,-0.491157,-1.02269,0.152241,-0.659864,,,-0.261412,-0.211486,-0.335556,-0.281498,0.775981,,,,,,,,,
0,0,7,1.370613,,,,,,0.676961,0.151984,0.192465,-0.521729,11,7,76,-0.865307,-0.225629,-0.582163,,0.317467,,-1.250016,-1.682929,1.412757,,0.520378,0.744132,-0.788658,0.641776,,,0.2272,0.580907,1.128879,,,,-1.512286,-1.414357,-1.823322,-0.082763,-0.184119,,,,,,,,,-10.835207,-0.002704,-0.621836,,1.172836,,,-1.625862,,-1.410017,1.063013,,0.888355,0.467994,-1.36224,,,,,,-1.065759,0.013322,-0.592855,-1.052685,-0.393726,-0.741603,,,-0.281207,-0.182894,-0.245565,-0.302441,0.703665,,,,,,,,,
0,0,9,2.285698,,,,,,1.056285,0.187227,0.249901,-0.77305,11,7,76,-0.675719,-0.199404,-0.586798,,-0.814909,,-1.296782,-2.040234,0.639589,,1.597359,0.657514,-1.350148,0.364215,,,-0.017751,-0.317361,-0.122379,,,,-0.320921,-0.95809,-2.436589,0.070999,-0.245239,,,,,,,,,-1.420632,-3.515137,-4.67776,,0.535897,,,-0.72542,,-2.29417,1.764551,,-0.120789,-0.063458,-1.36224,,,,,,-0.882604,-0.072482,-0.617934,-0.86323,-0.241892,-0.709919,,,0.377131,0.300724,-0.106842,-0.096792,2.109352,,,,,,,,,
0,0,10,0.690606,,,,,,1.139366,0.273328,0.306549,-1.262223,42,5,150,-0.694008,3.004091,0.114809,,-0.251882,,-1.902009,-0.979447,0.241165,,-0.392359,-0.224699,-2.129397,-0.855287,,,0.404142,-0.578156,0.105702,,,,0.544138,-0.087091,-1.500147,-0.201288,-0.038042,,,,,,,,,0.382074,2.669135,0.611711,,2.413415,,,1.313203,,-0.810125,2.939022,,3.988801,1.834661,-1.36224,,,,,,-0.697595,1.074309,-0.206929,-0.530602,4.765215,0.571554,,,-0.226891,-0.251412,-0.215522,-0.296244,1.114137,,,,,,,,,
0,0,14,0.44057,,,,,,0.9552,0.262404,0.344457,-0.613813,44,3,16,-0.947351,-0.030018,-0.502379,,0.646086,,-1.844685,-1.58656,-0.182024,,-0.969949,-0.673813,-1.282132,-1.399894,,,0.043815,-0.320225,-0.031713,,,,-0.08842,-0.995003,-2.635336,-0.196461,-0.618719,,,,,,,,,-2.0146,-2.321076,-3.711265,,1.253902,,,0.476195,,-0.771732,2.843421,,1.379815,0.411827,-1.36224,,,,,,-0.948601,-0.136814,-0.447704,-1.141761,0.099631,-0.661928,,,3.678076,2.793581,2.61825,3.418133,-3.57282,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1698,967,34,3.242493,2.52516,-0.721981,2.544025,2.477615,0.417557,0.785812,1.117796,2.199436,0.415427,42,5,150,0.804403,1.157257,1.031543,-0.671189,-0.3286,-0.486132,1.730176,-0.006173,-0.001144,-0.213062,0.932618,1.367338,-0.238197,-0.692615,-0.121163,1.090798,1.444294,-0.675626,-1.013264,-0.242888,3.427639,-0.958278,3.139836,3.416278,-1.655316,-0.59944,-0.932876,2.493458,0.969462,1.102016,0.158982,-0.496177,0.036177,1.309866,0.828025,1.577955,1.040802,1.255398,2.577441,0.057455,0.953005,1.377051,-0.396358,0.520262,1.179617,1.127657,2.231928,0.614652,2.412886,-1.101531,-0.384833,-0.275818,-0.40804,2.427115,-0.108427,0.739734,0.830205,0.366287,1.33325,1.075499,1.798264,-0.183443,-0.190222,0.234211,0.347142,-0.044463,0.016936,-0.132337,0.501321,0.905332,-0.819582,-0.564046,-0.223018,-0.283954,-0.045938,0.009797,-0.102538
1698,967,35,1.079139,1.857906,-0.790646,2.745439,2.339877,0.845065,0.65137,1.180301,1.966379,0.321543,25,7,195,-0.075294,-0.152726,-0.20417,-0.421137,0.21708,-0.258775,1.874978,0.19988,-0.199219,-0.125619,-1.004547,-0.051933,0.450905,0.009246,0.164127,-0.939974,-1.143421,-0.320071,-0.379835,-0.142429,3.862469,-1.451786,3.477489,2.861663,0.763459,0.075972,-0.119677,0.626035,0.148815,0.653281,0.059313,-0.845099,0.098528,0.409564,-0.675728,-0.011334,0.930534,0.83198,0.808955,0.219276,-0.315776,0.687755,-1.189577,0.180146,-0.175486,-1.60435,-0.209283,0.249847,0.288816,-1.101531,-0.343868,-0.253991,-0.278832,2.050639,-0.059506,-0.029396,-0.101381,-0.187759,-0.180839,-0.0861,-0.153405,-0.196077,-0.175292,1.04578,0.739733,0.03372,0.05086,-0.249584,-1.113053,0.69719,-1.619031,-1.222743,-0.706082,-0.291133,0.167733,0.099704,0.32461
1698,967,36,1.033172,2.515527,-0.672298,2.28925,2.521592,0.255077,0.919892,1.172018,2.180496,0.24846,49,7,297,1.026715,-0.096892,0.224309,-0.528109,-0.704952,-0.704818,2.312482,0.32804,-0.108193,,-0.945684,-0.244173,0.205989,-0.357343,,,-1.11075,-0.580242,-0.400568,,2.397877,-0.637258,3.260638,3.046786,0.440965,0.234842,-0.17558,1.022406,-0.500069,2.071033,0.413488,-0.450016,-0.156616,-0.253755,-0.769588,0.066086,0.047826,1.713707,0.772772,-0.549192,1.338474,0.933568,0.032978,-0.519118,-0.290343,-0.806786,0.106295,0.183461,1.830421,-1.101531,-0.341991,-0.249132,-0.34365,2.251358,0.601888,1.035051,-0.283241,0.107244,0.86016,0.024223,0.374852,-0.220933,-0.161584,0.032771,0.036888,0.168908,0.152333,-0.065355,-1.019353,-0.460962,-2.026678,-0.848606,-0.305448,-1.256913,-0.109359,-0.027474,-0.253956
1698,967,37,1.243116,2.663298,-0.889112,2.313155,3.101428,0.324454,0.618944,1.185663,1.599724,0.319719,34,4,214,0.759314,0.284057,0.41716,-0.611075,-0.513717,-0.891423,1.84994,0.406756,-1.608196,-0.252663,-0.271574,-0.051405,0.098146,-0.653961,0.173676,-0.016497,-0.404509,-0.577262,-0.731429,-0.21646,3.018564,-0.472061,3.13922,3.065858,0.842925,0.053283,-0.074403,0.500129,0.08263,0.336223,0.643934,-0.422367,-0.418195,0.203037,-0.702278,0.543305,-0.195764,0.693364,0.953293,0.352567,0.471775,1.876459,-0.143377,0.845516,0.301135,-0.395703,0.738038,-0.04124,1.270645,-1.101531,-0.358106,-0.141883,-0.255192,2.489247,0.537652,0.982107,-0.158009,0.137389,0.478357,0.782692,0.581421,-0.106056,-0.111017,0.163867,0.169331,-0.037563,-0.029483,-0.148711,0.23585,0.556479,0.618944,-0.243765,-0.108361,-0.260777,-0.486923,-0.275566,-1.020708


In [5]:
# Estimated in-memory size of train_df, shown in units of 1e9 bytes.
train_size_bytes = train_df.estimated_size()
train_size_bytes / 1e9

17.032444032

In [6]:
# Output directory for this run's artifacts (created on first use).
models_path = path + 'my_folder/models/20250112_02/'
# exist_ok=True replaces the separate existence check and is safe to re-run.
os.makedirs(models_path, exist_ok=True)

In [7]:
def lgb_sliding_window(
    train_data,
    optuna_n_trials,
    test_window_days=180,
    storage="sqlite:///C:/Users/peppe/Python Notebooks/janestreet/optuna_study/db.sqlite3",
    study_name="js_last_180_days_model_param_tune_20250112_02",
):
    """Tune LightGBM hyperparameters for ``responder_6`` with Optuna.

    The rows of ``train_data`` are split by ``date_id``:

    * test  -- rows with ``date_id > max(date_id) - test_window_days``;
    * val   -- rows on the largest remaining ``date_id`` (passed to ``fit`` as an
      evaluation set, together with the training data);
    * train -- all rows before the validation date.

    Every trial fits an ``LGBMRegressor`` on the training rows (sample-weighted by
    ``weight``) and is scored by the weighted R2 on the test rows; the study
    maximizes that score.

    Parameters
    ----------
    train_data : polars.DataFrame
        Must contain ``date_id``, ``time_id``, ``symbol_id``, ``weight`` and
        ``responder_6``; every other column is used as a model feature.
    optuna_n_trials : int
        Number of trials to run in this call.
    test_window_days : int, default 180
        Width of the trailing ``date_id`` window used for scoring.
    storage : str
        Optuna storage URL where the study is persisted.
    study_name : str
        Name of the study inside ``storage``.

    Returns
    -------
    optuna.Study
        The study after ``optuna_n_trials`` additional trials.

    Notes
    -----
    The study is created with ``load_if_exists=True``: calling the function again
    with the same ``storage``/``study_name`` (e.g. after an interrupted run)
    resumes the stored study instead of raising a duplicate-study error.

    NOTE(review): hyperparameters are selected on the same window that is called
    "test", so ``study.best_value`` is an optimistic estimate -- confirm that a
    separate untouched hold-out exists elsewhere.
    """
    non_feature_cols = ['date_id', 'time_id', 'symbol_id', 'weight', 'responder_6']

    # Date-based split: trailing window -> test, last remaining day -> val, rest -> train.
    test_date_id_cut = train_data['date_id'].max() - test_window_days
    test_dataset_df = train_data.filter(pl.col('date_id') > test_date_id_cut)
    train_dataset_df = train_data.filter(pl.col('date_id') <= test_date_id_cut)
    train_date_id_max = train_dataset_df['date_id'].max()
    val_dataset_df = train_dataset_df.filter(pl.col('date_id') == train_date_id_max)
    training_df = train_dataset_df.filter(pl.col('date_id') < train_date_id_max)

    def _to_model_inputs(frame):
        """Return (features, target, sample weights) of ``frame`` as pandas objects."""
        features = frame.drop(non_feature_cols).select(pl.all().shrink_dtype()).to_pandas()
        return features, frame['responder_6'].to_pandas(), frame['weight'].to_pandas()

    # The model inputs do not depend on the trial, so convert them once here
    # instead of repeating the polars -> pandas conversion inside every trial.
    X_train, y_train, weights_train = _to_model_inputs(training_df)
    X_val, y_val, weights_val = _to_model_inputs(val_dataset_df)
    X_test, y_test, weights_test = _to_model_inputs(test_dataset_df)

    # The polars slices are no longer needed once the pandas copies exist.
    del train_dataset_df, training_df, val_dataset_df, test_dataset_df
    gc.collect()

    def objective(trial: optuna.Trial) -> float:
        """Fit one LightGBM model with sampled hyperparameters; return weighted test R2."""
        base_params = {
            'verbosity': -1,
            'device': 'gpu',
            'early_stopping_round': 20,
        }
        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.06),
            'max_depth': trial.suggest_int('max_depth', 6, 25),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 300),
            'num_leaves': trial.suggest_int('num_leaves', 2000, 15000),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 0.3),
            'lambda_l1': trial.suggest_float('lambda_l1', 1, 10),
            'lambda_l2': trial.suggest_float('lambda_l2', 1000, 10000),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.8, 1),
        }

        # n_estimators is only an upper bound; early_stopping_round is set above.
        model = LGBMRegressor(
            **base_params,
            **params_to_tune,
            n_estimators=100000
        )

        model.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_sample_weight=[weights_train, weights_val],
        )

        test_preds = model.predict(X_test)
        return r2_score(y_test, test_preds, sample_weight=weights_test)

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar by one after every finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        study = optuna.create_study(
            direction="maximize",
            sampler=optunahub.load_module("samplers/auto_sampler").AutoSampler(),
            storage=storage,
            study_name=study_name,
            # Resume instead of failing when the study already exists in storage.
            load_if_exists=True,
        )
        study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])

    return study

In [8]:
# Run the hyperparameter search on the joined training frame for 100 trials.
lgb_study = lgb_sliding_window(train_data=train_df, optuna_n_trials=100)

[I 2025-01-12 21:46:35,295] A new study created in RDB with name: js_last_180_days_model_param_tune_20250112_02trial/s]
[I 2025-01-12 21:53:11,058] Trial 0 finished with value: 0.00919586266775052 and parameters: {'learning_rate': 0.032643120366868277, 'max_depth': 10, 'min_data_in_leaf': 197, 'num_leaves': 7024, 'min_gain_to_split': 0.02080491798649725, 'lambda_l1': 6.0285833607284305, 'lambda_l2': 1687.278795920493, 'feature_fraction': 0.8611806001114815}. Best is trial 0 with value: 0.00919586266775052.
  return GPSampler(seed=seed)
[I 2025-01-12 21:58:12,317] Trial 1 finished with value: 0.008442027668334573 and parameters: {'learning_rate': 0.03260861263224332, 'max_depth': 7, 'min_data_in_leaf': 233, 'num_leaves': 6652, 'min_gain_to_split': 0.08126064217332375, 'lambda_l1': 7.455036465200563, 'lambda_l2': 6876.153593889046, 'feature_fraction': 0.8624253578664131}. Best is trial 0 with value: 0.00919586266775052.
[I 2025-01-12 22:09:16,944] Trial 2 finished with value: 0.008602346

KeyboardInterrupt: 

In [None]:
# One slice plot per tuned hyperparameter.
for param_name in lgb_study.best_params:
    plot_slice(lgb_study, params=[param_name]).show()

In [None]:
# Show Optuna's hyperparameter-importance plot for the study.
plot_param_importances(lgb_study)

In [None]:
# Hyperparameters of the best trial in the study.
lgb_study.best_params

In [None]:
# Objective value of the best trial (the objective returns the weighted test R2).
lgb_study.best_value

In [None]:
# Print the best hyperparameters, one "name value" pair per line.
for name, value in lgb_study.best_params.items():
    print(f'{name} {value}')

In [None]:
# One-row frame with one column per best hyperparameter.
lgb_params_df = pd.DataFrame([lgb_study.best_params])

In [None]:
# Display the one-row frame of best hyperparameters.
lgb_params_df

In [None]:
# Persist the best hyperparameters next to this run's other artifacts.
lgb_params_csv = os.path.join(models_path, 'lgb_params.csv')
lgb_params_df.to_csv(lgb_params_csv, index=False)