# Models training

In this notebook I'll show how to train a simple lgbm model.
If you want you have to possibility to train several lgbm model, here I'm just using one combination of params for trainings.

In [1]:
# Imports
import polars as pl
import lightgbm as lgb
import itertools
import glob
import numpy as np
import os
import copy
from pathlib import Path
from sklearn.model_selection import train_test_split

In [2]:
# define columns to read
feature_cols = [f'feature_{x:02}' for x in range(79)]
responder_cols = [f'responder_{i}' for i in range(9)]
responder_lags = [f'responder_{i}_lag_1' for i in range(9)]

In [3]:
# define base dir
DATA_DIR = Path('/kaggle/input/')
N_PARTITION = 10

In [4]:
class CONFIG:
    seed = 42
    target_col = "responder_6"
    feature_cols = ["symbol_id", "time_id"] + [f"feature_{idx:02d}" for idx in range(79)]+ [f"responder_{idx}_lag_1" for idx in range(9)]
    all_cols =  ["date_id","time_id", "symbol_id", "weight"] + [f"feature_{idx:02d}" for idx in range(79)]+ [f"responder_{idx}_lag_1" for idx in range(9)] + [target_col]

    data_paths = [
        "/kaggle/input/js24-preprocessing-create-lags/training.parquet/",
    ]

# Model training

In [5]:
pl_train = pl.concat([pl.read_parquet(_f, columns=CONFIG.all_cols) for _f in glob.glob(os.path.join(CONFIG.data_paths[0], "*/*parquet"))])

In [6]:
pl_train.head()

date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,…,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1,responder_6
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
1459,0,0,6.272114,0.284779,1.367876,0.061011,0.611989,2.428839,-0.306044,0.327402,0.14567,0.241236,11,7,76,-0.774271,1.470566,-0.181446,,-0.226076,,-1.109849,-0.86537,0.402435,0.163787,1.829081,0.792077,2.429708,0.84077,1.354493,1.090451,0.313082,-0.603189,-0.758272,0.187623,,…,,,-1.358412,,-0.645864,1.096121,,1.875256,0.926568,1.103987,-0.261278,-0.369544,-0.326938,-1.401343,-1.358529,-0.658659,1.48607,-0.224669,-1.081992,1.097029,-0.27703,,,5.860481,8.190721,2.706833,3.22771,0.989091,0.311066,0.238182,-0.182826,-0.078809,-0.045538,-0.073532,-0.007304,-0.155109,0.518053
1459,0,1,3.552529,0.338234,1.45932,-0.472118,0.544349,2.820604,-0.280947,0.325028,0.165659,0.140202,11,7,76,-1.065184,0.858682,-0.608783,,-0.349334,,-1.313855,-2.731537,0.690834,0.64343,0.324025,-0.150489,2.388658,0.383097,-1.133539,-0.612681,0.35688,-0.538027,-0.501963,0.558729,,…,,,-1.483555,,-1.867761,1.240135,,-2.539496,-1.600528,1.103987,-0.314283,-0.231416,-0.297536,-1.10211,-2.13699,-0.831114,0.484567,-0.489232,-0.934581,0.555128,-0.48204,,,2.322282,3.203155,0.728853,0.424349,1.221226,1.233946,0.54823,0.005668,0.006927,0.03315,0.056857,0.048867,0.064007,0.028188
1459,0,2,3.43617,0.714179,1.393869,-0.100954,0.060362,2.67652,-0.192269,0.301498,0.179047,0.20748,81,2,59,-0.833855,0.696863,-0.378966,,-0.391663,,-1.15009,-1.588253,-0.208019,0.033058,0.145311,-0.266151,2.266596,0.106538,0.756999,0.390671,-0.219139,-0.922623,-0.773966,0.044262,,…,,,0.211052,,-2.191152,2.085952,,-0.146265,-0.095684,1.103987,-0.553068,-0.297724,-0.36485,-1.430172,-1.972257,-0.83755,0.498855,-0.468964,-0.697338,0.859621,-0.298157,,,0.103668,0.187409,-0.152425,-0.198911,2.498147,2.075224,-0.819492,-0.066036,-0.028982,-0.15746,0.135269,0.079109,0.213121,0.392735
1459,0,3,3.708833,0.608647,1.410406,0.304514,0.316107,2.89652,-0.280715,0.462354,0.120939,0.22711,4,3,11,-0.776751,0.761279,-0.164175,,-0.435497,,-1.262971,-2.139015,-0.724918,0.154513,-0.215643,-1.092684,2.380904,0.478625,-0.053792,-0.481114,-0.767898,-0.932684,-1.290758,0.210172,,…,,,-1.218683,,-1.830643,1.231215,,-1.14774,-0.736113,1.103987,-0.143147,-0.235812,-0.331154,-1.712278,-2.390939,-1.159261,0.217245,-0.473246,-0.851354,0.791675,-0.440235,,,4.014511,5.286469,-0.229585,-0.300696,1.394534,0.784929,0.48956,-0.322424,-0.144407,0.295834,0.242603,0.106816,0.498997,0.424594
1459,0,4,1.987908,0.51705,1.169389,0.305554,0.003111,3.448445,-0.233972,0.252148,0.170875,0.156473,15,1,9,-0.817543,1.892761,-0.147779,,-0.421498,,-1.145212,-1.343885,-0.99354,0.072836,-0.523466,-1.038173,1.651809,0.717405,-1.038091,-0.902733,-0.348741,-0.602181,-0.916636,0.031108,,…,,,-1.314001,,-1.23573,1.265067,,-1.532822,-0.791146,1.103987,-0.37807,-0.088048,-0.289169,-1.190867,-1.546377,-0.602668,1.077879,-0.18245,-0.453072,2.622363,-0.067118,,,2.709814,4.295229,0.130722,0.101925,-0.285501,0.402817,1.763028,-0.039663,-0.012079,0.545526,0.159254,0.07332,0.21008,-1.001556


In [7]:
pl_train = pl_train.sort("date_id")

In [None]:
pl_train = pl_train.sort("time_id")

In [10]:
X = pl_train.select(CONFIG.feature_cols).to_numpy()
y = pl_train.select(CONFIG.target_col).to_numpy().flatten()

In [11]:
weights = pl_train.select(["weight"]).to_numpy().flatten()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
weights_train, weights_test = train_test_split(weights, test_size=0.2, random_state=42)

In [17]:
# Create a LightGBM Dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

In [13]:
# Train many lightgbm models and save them
num_leaves = [31, 63, 127]
feature_fraction = [0.6, 0.8]
n_estimators = [50, 100]
learning_rate = [0.05, 0.1]

# Generate all combinations
param_combinations = list(itertools.product(num_leaves, feature_fraction, n_estimators, learning_rate))

# Convert to a list of dictionaries (optional, for LightGBM compatibility)
param_dicts = [
    {"num_leaves": nl, "feature_fraction": ff, "n_estimators": ne, "learning_rate": lr}
    for nl, ff, ne, lr in param_combinations
]

# Print the combinations
for params in param_dicts:
    print(params)

{'num_leaves': 31, 'feature_fraction': 0.6, 'n_estimators': 50, 'learning_rate': 0.05}
{'num_leaves': 31, 'feature_fraction': 0.6, 'n_estimators': 50, 'learning_rate': 0.1}
{'num_leaves': 31, 'feature_fraction': 0.6, 'n_estimators': 100, 'learning_rate': 0.05}
{'num_leaves': 31, 'feature_fraction': 0.6, 'n_estimators': 100, 'learning_rate': 0.1}
{'num_leaves': 31, 'feature_fraction': 0.8, 'n_estimators': 50, 'learning_rate': 0.05}
{'num_leaves': 31, 'feature_fraction': 0.8, 'n_estimators': 50, 'learning_rate': 0.1}
{'num_leaves': 31, 'feature_fraction': 0.8, 'n_estimators': 100, 'learning_rate': 0.05}
{'num_leaves': 31, 'feature_fraction': 0.8, 'n_estimators': 100, 'learning_rate': 0.1}
{'num_leaves': 63, 'feature_fraction': 0.6, 'n_estimators': 50, 'learning_rate': 0.05}
{'num_leaves': 63, 'feature_fraction': 0.6, 'n_estimators': 50, 'learning_rate': 0.1}
{'num_leaves': 63, 'feature_fraction': 0.6, 'n_estimators': 100, 'learning_rate': 0.05}
{'num_leaves': 63, 'feature_fraction': 0.6,

In [14]:
f"Nb of combinations: {len(param_dicts)}"

'Nb of combinations: 24'

In [15]:
# Use just one set of param for this notebook
param_dicts = [{"num_leaves": 31, "feature_fraction": 0.8, "n_estimators": 100, "learning_rate": 0.1}]

In [18]:
for i in range(len(param_dicts)):
    # model params
    input_params = param_dicts[i]
    
    # Define Parameters
    params = {
        'objective': 'regression',
        'metric': 'rmse',                                      # Root Mean Squared Error
        'boosting_type': 'gbdt',                               # Gradient Boosted Decision Trees
        'num_leaves': input_params['num_leaves'],
        'learning_rate': input_params['learning_rate'],
        'feature_fraction': input_params['feature_fraction'],
        'n_estimators': input_params['n_estimators']      
    }
    
    # Train the model
    lgbm_model = lgb.train(
        params,
        train_data,
        valid_sets=[train_data, test_data],
        num_boost_round=50
    )

    lgbm_model.save_model(f"lgbm_model_{i}.json")



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 10.977232 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22031
[LightGBM] [Info] Number of data points in the train set: 16817644, number of used features: 90
[LightGBM] [Info] Start training from score -0.000975


In [20]:
os.listdir()

['lgbm_model_0.json', '.virtual_documents']

In [None]:
import kaggle_evaluation.jane_street_inference_server

In [None]:

# Replace this function with your inference code.
# You can return either a Pandas or Polars dataframe, though Polars is recommended.
# Each batch of predictions (except the very first) must be returned within 1 minute of the batch features being provided.
def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Make a prediction."""
    # All the responders from the previous day are passed in at time_id == 0. We save them in a global variable for access at every time_id.
    # Use them as extra features, if you like.
    global lags_
    if lags is not None:
        lags_ = lags

    # Replace this section with your own predictions
    # predictions = test.select(
    #     'row_id',
    #     pl.lit(0.0).alias('responder_6'),
    # )
    predictions = predict_xgb(test,lags).to_pandas()

    
    if isinstance(predictions, pl.DataFrame):
        assert predictions.columns == ['row_id', 'responder_6']
    elif isinstance(predictions, pd.DataFrame):
        assert (predictions.columns == ['row_id', 'responder_6']).all()
    else:
        raise TypeError('The predict function must return a DataFrame')
    # Confirm has as many rows as the test data.
    assert len(predictions) == len(test)

    return predictions

In [None]:
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )