# Ungrouped CV randomized search for the LightGBM model for French MTPL data

- Rerunning takes long
- Results might depend on seed
- Almost copy of Chapter 3 code

In [1]:
import json

import lightgbm as lgb
import pandas as pd
from joblib import load
from lightgbm import LGBMRegressor
from sklearn.model_selection import ParameterSampler, train_test_split
from sklearn.pipeline import Pipeline
from tqdm import tqdm

grid_file = "grid_nongrouped_lgb.txt"

train, test, *_, xvars, prep_lgb = load("data.joblib")
df = pd.concat([train, test], axis=0)
df.shape

(678013, 16)

In [2]:
# Random split (same split as for the ungrouped GLM in the main notebook)
X_train_u, X_test_u, y_train_u, y_test_u, w_train_u, w_test_u = train_test_split(
    df[xvars], df.Freq, df.Exposure, train_size=0.8, random_state=90
)

# Data interface of LightGBM
dtrain_u = lgb.Dataset(
    prep_lgb.fit_transform(X_train_u),
    label=y_train_u.to_numpy(),
    weight=w_train_u.to_numpy(),
    params={"feature_pre_filter": False},
)

# STEP 1: Model for expected claims frequency -> Poisson deviance

# STEP 2: Select learning rate so that optimal number of rounds by early stopping is
# somewhere between 100 and 1000
params = {
    "objective": "poisson",
    "learning_rate": 0.05,
    "verbose": -1,  # play with learning_rate
}

In [3]:
cvm = lgb.cv(
    params=params,
    train_set=dtrain_u,
    num_boost_round=5000,
    nfold=5,
    stratified=False,
    eval_train_metric=True,
    callbacks=[lgb.log_evaluation(period=0), lgb.early_stopping(20, verbose=0)],
)

# A LR of 0.05 provides about 550 trees, which is convenient
cvm = pd.DataFrame(cvm)
print("Best boosting round with default params:\n", len(cvm))
cvm.tail(1)

Best boosting round with default params:
 551


Unnamed: 0,train poisson-mean,train poisson-stdv,valid poisson-mean,valid poisson-stdv
550,0.299765,0.000581,0.309713,0.002385


In [23]:
# STEP 3: Iterate randomized SearchCV for regularization parameters
if True:
    # Or ParameterGrid(...) if grid is small enough to check all combinations
    grid = ParameterSampler(
        {
            "objective": ["poisson"],
            "learning_rate": [0.05],
            "num_leaves": [15, 31, 63],
            "reg_lambda": [0, 2.5, 5, 7.5],
            "reg_alpha": [0, 4],
            "colsample_bynode": [0.8, 1],
            "subsample": [0.8, 1],
            "min_child_samples": [20, 50, 100],
            "poisson_max_delta_step": [0.1, 0.7],
            "verbose": [-1],
        },
        n_iter=50,
        random_state=94,
    )

    # Iterate over grid and save relevant information on disk
    search = []
    for g in tqdm(grid):
        cvm = lgb.cv(
            params=g,
            train_set=dtrain_u,
            num_boost_round=5000,
            nfold=5,
            stratified=False,
            seed=82,
            eval_train_metric=True,
            callbacks=[lgb.log_evaluation(period=0), lgb.early_stopping(20, verbose=0)],
        )
        # Keep number of rounds, cv score, train score, and parameters
        cvm = pd.DataFrame(cvm)
        search.append((len(cvm), *cvm.iloc[-1, [2, 0]], g))

        with open(grid_file, "w") as f:
            json.dump(search, f)

# Load grid and check (A) sort order and (B) if grid ranges were set reasonable
with open(grid_file) as f:
    search = json.load(f)

search_df = pd.DataFrame.from_records(
    search, columns=["num_boost_round", "cv_score", "train_score", "params"]
).sort_values("cv_score")

with pd.option_context("display.max_colwidth", None):
    display(search_df.head())

100%|██████████| 50/50 [53:20<00:00, 64.00s/it]


Unnamed: 0,num_boost_round,cv_score,train_score,params
32,380,0.309547,0.295104,"{'verbose': -1, 'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha': 0, 'poisson_max_delta_step': 0.7, 'objective': 'poisson', 'num_leaves': 63, 'min_child_samples': 20, 'learning_rate': 0.05, 'colsample_bynode': 0.8}"
42,345,0.309624,0.297844,"{'verbose': -1, 'subsample': 1, 'reg_lambda': 0, 'reg_alpha': 0, 'poisson_max_delta_step': 0.7, 'objective': 'poisson', 'num_leaves': 63, 'min_child_samples': 50, 'learning_rate': 0.05, 'colsample_bynode': 0.8}"
45,344,0.309716,0.299935,"{'verbose': -1, 'subsample': 1, 'reg_lambda': 5, 'reg_alpha': 0, 'poisson_max_delta_step': 0.7, 'objective': 'poisson', 'num_leaves': 63, 'min_child_samples': 50, 'learning_rate': 0.05, 'colsample_bynode': 0.8}"
18,303,0.309748,0.301403,"{'verbose': -1, 'subsample': 1, 'reg_lambda': 7.5, 'reg_alpha': 0, 'poisson_max_delta_step': 0.7, 'objective': 'poisson', 'num_leaves': 63, 'min_child_samples': 100, 'learning_rate': 0.05, 'colsample_bynode': 0.8}"
11,146,0.309769,0.30311,"{'verbose': -1, 'subsample': 1, 'reg_lambda': 2.5, 'reg_alpha': 4, 'poisson_max_delta_step': 0.1, 'objective': 'poisson', 'num_leaves': 63, 'min_child_samples': 20, 'learning_rate': 0.05, 'colsample_bynode': 1}"


In [24]:
# Best parameters
best = search_df.iloc[0]
best.num_boost_round, best.params

(380,
 {'verbose': -1,
  'subsample': 0.8,
  'reg_lambda': 0,
  'reg_alpha': 0,
  'poisson_max_delta_step': 0.7,
  'objective': 'poisson',
  'num_leaves': 63,
  'min_child_samples': 20,
  'learning_rate': 0.05,
  'colsample_bynode': 0.8})

In [26]:
# Fit on best parameters
lgbr = LGBMRegressor(**best.params, n_estimators=best.num_boost_round, random_state=59)
model_lgb = Pipeline([("preprocessor", prep_lgb), ("model", lgbr)])
_ = model_lgb.fit(X=X_train_u, y=y_train_u, model__sample_weight=w_train_u)