# Randomized search CV for the additive LightGBM model

- Rerunning takes long
- Results might depend on seed
- Basically a copy of Chapter 3 code

In [1]:
# Rerunning takes ~30 minutes; results might depend on seed
import json

import lightgbm as lgb
import pandas as pd
from joblib import load
from lightgbm import LGBMRegressor
from sklearn.model_selection import GroupKFold, ParameterSampler  # , ParameterGrid
from sklearn.pipeline import Pipeline
from tqdm import tqdm

grid_file = "grid_add_lgb.txt"

train, test, X_train, X_test, y_train, y_test, w_train, w_test, xvars, prep_lgb = load(
    "data.joblib"
)

# Data interface of LightGBM
dtrain = lgb.Dataset(
    prep_lgb.fit_transform(X_train),
    label=y_train,
    weight=w_train,
    params={"feature_pre_filter": False},
)

In [2]:
# STEP 1: Model for expected claims frequency -> Poisson deviance

# STEP 2: Select learning rate so that optimal number of rounds by early stopping is
# somewhere between 100 and 1000
params = {
    "objective": "poisson",
    "learning_rate": 0.5,  # much higher than for a more flexibel model
    "num_leaves": 2,  # to get additive model
}

In [None]:
# k-fold grouped cross-validation to see how many trees are required by early stopping
folds = list(GroupKFold(n_splits=5).split(X=X_train, groups=train.group_id))

cvm = lgb.cv(
    params=params,
    train_set=dtrain,
    num_boost_round=5000,
    folds=folds,
    # nfold = 5,  # for the ungrouped case, plus stratified=False,
    eval_train_metric=True,
    callbacks=[lgb.log_evaluation(period=0), lgb.early_stopping(20, verbose=0)],
)

# A LR of 0.5 provides about 400 trees, which is convenient
cvm = pd.DataFrame(cvm)
print("Best boosting round with default params:\n", len(cvm))
cvm.tail(1)

In [None]:
# STEP 3: Iterate randomized SearchCV for regularization parameters
if True:
    # Or ParameterGrid(...) if grid is small enough to check all combinations
    grid = ParameterSampler(
        {
            "objective": ["poisson"],
            "learning_rate": [0.5],
            "num_leaves": [2],  # to get additive model
            "reg_lambda": [0, 2.5, 5, 7.5],
            "reg_alpha": [0, 4],
            "colsample_bynode": [0.8, 1],
            "subsample": [0.8, 1],
            "min_child_samples": [20, 50, 100],
            "poisson_max_delta_step": [0.1, 0.7],
            "verbose": [-1],
        },
        n_iter=50,
        random_state=94,
    )

    # Iterate over grid and save relevant information on disk
    search = []
    for g in tqdm(grid):
        cvm = lgb.cv(
            params=g,
            train_set=dtrain,
            num_boost_round=5000,
            folds=folds,
            seed=82,
            eval_train_metric=True,
            callbacks=[lgb.log_evaluation(period=0), lgb.early_stopping(20, verbose=0)],
        )
        # Keep number of rounds, cv score, train score, and parameters
        cvm = pd.DataFrame(cvm)
        search.append((len(cvm), *cvm.iloc[-1, [2, 0]], g))

        with open(grid_file, "w") as f:
            json.dump(search, f)

# Load grid and check (A) sort order and (B) if grid ranges were set reasonable
with open(grid_file) as f:
    search = json.load(f)

search_df = pd.DataFrame.from_records(
    search, columns=["num_boost_round", "cv_score", "train_score", "params"]
).sort_values("cv_score")

with pd.option_context("display.max_colwidth", None):
    display(search_df.head())

In [17]:
# Best parameters
best = search_df.iloc[0]
best.num_boost_round, best.params

(568,
 {'verbose': -1,
  'subsample': 1,
  'reg_lambda': 7.5,
  'reg_alpha': 0,
  'poisson_max_delta_step': 0.7,
  'objective': 'poisson',
  'num_leaves': 2,
  'min_child_samples': 50,
  'learning_rate': 0.5,
  'colsample_bynode': 1})

In [None]:
# Fit on best parameters
lgbr = LGBMRegressor(**best.params, n_estimators=best.num_boost_round, random_state=59)
model_lgb = Pipeline([("preprocessor", prep_lgb), ("model", lgbr)])
_ = model_lgb.fit(X=X_train, y=y_train, model__sample_weight=w_train)