In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
import gc
import seaborn as sns
from tqdm import tqdm
from sklearn.model_selection import KFold, StratifiedKFold
import xgboost as xgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, log_evaluation, record_evaluation
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
#from sklearn.impute import IterativeImputer
import pickle
import optuna
from optuna.visualization import plot_slice, plot_param_importances
import shap
import random

# Make sure automatic garbage collection is switched on.
gc.enable()

# pandas display limits. Note that max_columns is first set to None and then
# immediately set to 500, so 500 is the value that ends up in effect.
pd.options.display.max_columns = None
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = None

# polars display limits: -1 for rows/columns, 10000 characters for strings.
pl.Config.set_tbl_rows(-1)
pl.Config.set_tbl_cols(-1)
pl.Config.set_fmt_str_lengths(10000)

polars.config.Config

In [2]:
# Raise Optuna's logging threshold to WARNING.
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [3]:
# Root folder of the competition data. The rest of the notebook builds file
# names with `path + '<name>'`, so the value must end with a path separator;
# joining with '' guarantees that without changing a value that already has one.
# Set the JANE_STREET_DATA_DIR environment variable to run on another machine;
# when it is unset the original hard-coded location is used.
path = os.path.join(
    os.environ.get(
        'JANE_STREET_DATA_DIR',
        'I:/Kaggle/jane-street-real-time-market-data-forecasting/',
    ),
    '',
)

In [4]:
# Show the entries of the data folder.
os.listdir(path)

['features.csv',
 'kaggle_evaluation',
 'lags.parquet',
 'my_folder',
 'responders.csv',
 'sample_submission.csv',
 'team_folder',
 'test.parquet',
 'top_100000_rows_sorted_by_weight_descending.parquet',
 'top_10000_rows_sorted_by_weight_descending.parquet',
 'train.parquet']

In [5]:
# Lazily scan the training parquet data, dropping the partition column and
# downcasting every column with shrink_dtype.
train_df = (
    pl.scan_parquet(path + 'train.parquet/')
    .drop('partition_id')
    .select(pl.all().shrink_dtype())
)

# Lag table: the responders of each (date_id, time_id, symbol_id) row, with
# date_id shifted forward by one so that they join onto the following date.
feature_cols = [col for col in train_df.collect_schema().names() if 'feature' in col]
lag_renames = {f'responder_{x}': f'responder_{x}_lag_1' for x in range(9)}
lags_df = (
    train_df
    .with_columns(pl.col('date_id') + 1)
    .drop(['weight'] + feature_cols)
    .rename(lag_renames)
)

# Keep responder_6 only; every other same-row responder is dropped.
other_responders = [f'responder_{x}' for x in range(9) if x != 6]
train_df = train_df.drop(other_responders).select(pl.all().shrink_dtype())

# Attach the lagged responders; rows without a match keep nulls (left join).
train_df = (
    train_df
    .join(lags_df, on=['date_id', 'time_id', 'symbol_id'], how='left')
    .select(pl.all().shrink_dtype())
)

del lags_df, feature_cols, lag_renames, other_responders
gc.collect()

  lags_df = train_df.with_columns(pl.col('date_id') + 1).drop(['weight'] + [col for col in train_df.columns if 'feature' in col]).rename({f'responder_{x}': f'responder_{x}_lag_1' for x in range(9)})


0

In [6]:
# Lazy scans of the unmodified train and test parquet data.
train_scan = pl.scan_parquet(path + 'train.parquet/')
test_scan = pl.scan_parquet(path + 'test.parquet/')

In [7]:
# Sorted distinct symbol ids seen in train and in test.
train_symbol_ids_list = sorted(
    train_scan.select('symbol_id').unique().collect().get_column('symbol_id').to_list()
)
test_symbol_ids_list = sorted(
    test_scan.select('symbol_id').unique().collect().get_column('symbol_id').to_list()
)

# Sorted union of the two lists.
unique_symbol_ids_list = sorted(set(train_symbol_ids_list) | set(test_symbol_ids_list))

In [8]:
def one_hot_cat_cols(df):
    """Add one indicator column per known symbol id.

    For every value ``v`` in the module-level ``unique_symbol_ids_list`` a
    column ``symbol_id_<v>`` is added, holding the comparison
    ``symbol_id == v`` cast to Int8. The ``symbol_id`` column itself is kept.

    Returns the frame with ``shrink_dtype`` applied to every column.
    """
    indicator_exprs = [
        (pl.col('symbol_id') == symbol).cast(pl.Int8).alias(f'symbol_id_{symbol}')
        for symbol in tqdm(unique_symbol_ids_list)
    ]
    return df.with_columns(indicator_exprs).select(pl.all().shrink_dtype())

In [9]:
# Load the feature and responder metadata tables eagerly, downcasting every
# column with shrink_dtype.
features_df = pl.read_csv(path + 'features.csv').select(pl.all().shrink_dtype())
responders_df = pl.read_csv(path + 'responders.csv').select(pl.all().shrink_dtype())

In [10]:
# Every column of features_df except 'feature' is treated as a tag name.
tags_list = features_df.columns
tags_list.remove('feature')

In [11]:
# Every column of responders_df except 'responder' is treated as a tag name.
responder_tags_list = responders_df.columns
responder_tags_list.remove('responder')

In [12]:
def create_tag_features(df):
    """Add row-wise aggregates of the feature columns that share each tag.

    For every tag in the module-level ``tags_list``, the names in the
    ``feature`` column of ``features_df`` whose tag value equals True are
    looked up, and four columns are added to ``df``:
    ``feature_<tag>_sum`` (a fold with ``+``), ``feature_<tag>_mean``,
    ``feature_<tag>_min`` and ``feature_<tag>_max`` (polars horizontal
    aggregations).

    Returns the frame with ``shrink_dtype`` applied to every column.
    """
    for tag in tqdm(tags_list):
        members = features_df.filter(pl.col(tag) == True).get_column("feature").to_list()
        prefix = 'feature_' + tag

        row_sum = pl.reduce(lambda total, col: total + col, members).alias(prefix + '_sum')
        row_mean = pl.mean_horizontal(members).alias(prefix + '_mean')
        row_min = pl.min_horizontal(members).alias(prefix + '_min')
        row_max = pl.max_horizontal(members).alias(prefix + '_max')

        df = df.with_columns(row_sum, row_mean, row_min, row_max)

    return df.select(pl.all().shrink_dtype())

In [13]:
def create_responders_tag_features(df):
    """Add row-wise aggregates of the lagged responders that share each tag.

    For every tag in the module-level ``responder_tags_list``, the names in
    the ``responder`` column of ``responders_df`` whose tag value equals True
    are looked up and suffixed with ``_lag_1``. Four columns are then added to
    ``df``: ``responder_<tag>_sum`` (a fold with ``+``),
    ``responder_<tag>_mean``, ``responder_<tag>_min`` and
    ``responder_<tag>_max`` (polars horizontal aggregations).

    Returns the frame with ``shrink_dtype`` applied to every column.
    """
    for tag in tqdm(responder_tags_list):
        tagged = responders_df.filter(pl.col(tag) == True).get_column("responder").to_list()
        lag_cols = [f'{name}_lag_1' for name in tagged]
        prefix = 'responder_' + tag

        row_sum = pl.reduce(lambda total, col: total + col, lag_cols).alias(prefix + '_sum')
        row_mean = pl.mean_horizontal(lag_cols).alias(prefix + '_mean')
        row_min = pl.min_horizontal(lag_cols).alias(prefix + '_min')
        row_max = pl.max_horizontal(lag_cols).alias(prefix + '_max')

        df = df.with_columns(row_sum, row_mean, row_min, row_max)

    return df.select(pl.all().shrink_dtype())

In [14]:
# Extend the training frame with the per-tag feature aggregates, then with the
# per-tag lagged-responder aggregates.
train_df = (
    train_df
    .pipe(create_tag_features)
    .pipe(create_responders_tag_features)
)
#print(train_df.shape)
#train_df.head()

100%|████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 1688.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 5000.36it/s]


In [15]:
#train_df.estimated_size() / 1e9

In [16]:
# Output folder for this run; the tuned parameters are written here as
# lgb_params.csv at the end of the notebook.
models_path = path + 'my_folder/models/20250106_04/'
# exist_ok=True creates the folder (and any missing parents) when needed and is
# a no-op when it already exists, replacing the separate check-then-create step.
os.makedirs(models_path, exist_ok=True)

In [17]:
def lgb_sliding_window(train_data, optuna_n_trials, seed=None):
    """Tune LightGBM hyperparameters and the training-window size with Optuna.

    Each trial samples a ``window_size`` (in rows) and one validation
    ``date_id`` at random, trains an ``LGBMRegressor`` on a window of rows and
    scores it on the rows of the validation date.

    Parameters
    ----------
    train_data : polars.LazyFrame
        Frame containing ``date_id``, ``time_id``, ``symbol_id``, ``weight``
        and ``responder_6``; every other column is used as a model input.
        The windowing takes the first/last rows of the frame, so it
        presumably expects rows ordered by ``date_id`` -- TODO confirm.
    optuna_n_trials : int
        Number of Optuna trials to run.
    seed : int, optional
        When given, seeds both the validation-date sampling and Optuna's TPE
        sampler. When ``None`` (default) the global ``random`` module and
        Optuna's default sampler are used, as before.

    Returns
    -------
    optuna.study.Study
        The study; its objective is the ``l2`` score of the validation date
        at the best iteration (minimised).

    Notes
    -----
    Every trial validates on its own randomly drawn date, so trial scores are
    measured on different days.
    """
    key_cols = ['date_id', 'time_id', 'symbol_id']
    non_feature_cols = key_cols + ['weight', 'responder_6']

    # Materialise only the date_id column once; all row counts below are taken
    # from this eager frame because a LazyFrame has no length.
    date_ids_df = train_data.select("date_id").collect()
    unique_date_ids = sorted(date_ids_df.get_column("date_id").unique().to_list())

    # Fall back to the global `random` module when no seed is requested so that
    # existing callers keep exactly the previous sampling behaviour.
    rng = random if seed is None else random.Random(seed)

    def objective(trial):
        window_size = trial.suggest_int('window_size', 5000000, 15000000)

        date_id = rng.choice(unique_date_ids)

        date_id_df = train_data.filter(pl.col('date_id') == date_id)
        n_prior_rows = date_ids_df.filter(pl.col('date_id') < date_id).height

        if n_prior_rows < window_size:
            # Not enough rows before the validation date: take the first
            # window_size rows plus room for the validation day, then remove
            # the validation rows. This window can contain rows from dates
            # after the validation date.
            # The day's row count comes from the eager date_id column; the
            # previous `date_id_df.shape[0]` failed here because `date_id_df`
            # is lazy.
            n_val_rows = date_ids_df.filter(pl.col('date_id') == date_id).height
            window_df = (
                train_data
                .head(window_size + n_val_rows)
                .join(date_id_df, on=key_cols, how='anti')
            )
        else:
            # Last window_size rows strictly before the validation date.
            window_df = train_data.filter(pl.col('date_id') < date_id).tail(window_size)

        base_params = {
            'verbosity': -1,
            'device': 'gpu',
            'early_stopping_round': 30,
        }

        params_to_tune = {
            'learning_rate': trial.suggest_float('learning_rate', 0.03, 0.07),
            'max_depth': trial.suggest_int('max_depth', 4, 50),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 300),
            'num_leaves': trial.suggest_int('num_leaves', 50, 10000),
            'min_gain_to_split': trial.suggest_float('min_gain_to_split', 0, 0.3),
            'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
            'lambda_l2': trial.suggest_float('lambda_l2', 50, 2000),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.7, 1),
        }

        model = LGBMRegressor(
            **base_params,
            **params_to_tune,
            n_estimators=100000
        )

        # Run each lazy plan once and split the result, instead of collecting
        # it separately for X, y and the weights. Besides avoiding repeated
        # scans, this guarantees the three pieces come from the same
        # materialisation and therefore share one row order.
        train_frame = window_df.collect()
        X_train = train_frame.drop(non_feature_cols).select(pl.all().shrink_dtype()).to_pandas()
        y_train = train_frame.get_column('responder_6').to_pandas()
        weights_train = train_frame.get_column('weight').to_pandas()
        del train_frame

        val_frame = date_id_df.collect()
        X_val = val_frame.drop(non_feature_cols).select(pl.all().shrink_dtype()).to_pandas()
        y_val = val_frame.get_column('responder_6').to_pandas()
        weights_val = val_frame.get_column('weight').to_pandas()
        del val_frame

        model.fit(
            X_train,
            y_train,
            sample_weight=weights_train,
            eval_set=[(X_train, y_train), (X_val, y_val)],
            eval_sample_weight=[weights_train, weights_val],
        )

        # 'valid_1' is the second eval_set entry, i.e. the validation date.
        return model.best_score_['valid_1']['l2']

    with tqdm(total=optuna_n_trials, desc="Optimizing", unit="trial") as pbar:

        # Advance the progress bar once per finished trial.
        def progress_bar_callback(study, trial):
            pbar.update(1)

        # sampler=None lets Optuna use its default sampler.
        sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
        study = optuna.create_study(direction="minimize", sampler=sampler)
        study.optimize(objective, n_trials=optuna_n_trials, callbacks=[progress_bar_callback])

    return study

In [18]:
%%time
# Run the Optuna search with 10 trials on the engineered training frame.
lgb_study = lgb_sliding_window(train_df, 10)

[W 2025-01-06 23:48:35,339] Trial 1 failed with parameters: {'window_size': 9693618, 'learning_rate': 0.03609232955815071, 'max_depth': 22, 'min_data_in_leaf': 115, 'num_leaves': 7338, 'min_gain_to_split': 0.20664245856598326, 'lambda_l1': 5.630118968031885, 'lambda_l2': 1144.5208674299765, 'feature_fraction': 0.9158395771238041} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "I:\Kaggle\kaggle_venvs\ml\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\peppe\AppData\Local\Temp\ipykernel_19400\3227260707.py", line 64, in objective
    model.fit(X_train, y_train, sample_weight=weights_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_sample_weight=[weights_train, weights_val])#, callbacks=[log_evaluation(period=10)])
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

In [19]:
# One slice plot per tuned hyperparameter.
for param_name in lgb_study.best_params:
    plot_slice(lgb_study, params=[param_name]).show()

NameError: name 'lgb_study' is not defined

In [None]:
# Hyperparameter importance plot for the study.
plot_param_importances(lgb_study)

In [None]:
# Parameters of the best trial.
lgb_study.best_params

In [None]:
# Objective value of the best trial (the study minimises it).
lgb_study.best_value

In [None]:
# Print each best parameter as "<name> <value>", one per line.
for param_name, param_value in lgb_study.best_params.items():
    print(param_name, param_value)

In [None]:
# Single-row table with one column per best parameter.
lgb_params_df = pd.DataFrame([dict(lgb_study.best_params)])

In [None]:
# Display the best-parameter table.
lgb_params_df

In [None]:
# Save the best parameters to lgb_params.csv in the run's models folder, without the index column.
lgb_params_df.to_csv(models_path + 'lgb_params.csv', index=False)