# Randomized search CV for the additive XGB model

- Rerunning the notebook takes roughly ten minutes.
- Results may depend on the random seeds.
- The code is essentially a copy of the Chapter 3 code.

In [2]:
# Rerunning takes ~10 minutes; results might depend on seed
import json

import pandas as pd
import xgboost as xgb
from joblib import load
from sklearn.model_selection import ParameterSampler  # , ParameterGrid
from tqdm import tqdm
from xgboost import XGBRegressor

# File in which the randomized-search results are stored (JSON content)
grid_file = "grid_additive_xgb.txt"

# The train/test splits come from the exercise text, which has to be run first
# so that "data.joblib" exists.
# NOTE(review): joblib files are pickle-based - only load files you created yourself.
X_train, X_test, y_train, y_test = load("data.joblib")

# Wrap the training data in XGBoost's own data structure (required by xgb.cv)
dtrain = xgb.DMatrix(X_train, label=y_train)

# STEP 1: Choose the loss for modelling the expected response
# -> Gamma deviance is both strictly consistent and meaningful
# NOTE(review): the earlier wording of this step said "expected claims frequency";
# confirm that this matches the response stored in y_train.

# STEP 2: Fix a learning rate such that early stopping gives a reasonable number of trees
params = dict(
    objective="reg:gamma",
    learning_rate=0.1,
    max_depth=1,  # depth-1 trees keep the model additive
)

In [2]:
# Progress printer: reports the train/test CV metric every 50 boosting rounds
monitor = xgb.callback.EvaluationMonitor(period=50)

# 5-fold cross-validation with the starting parameters to check whether the
# number of trees picked by early stopping (patience: 20 rounds) is reasonable
cvm = xgb.cv(
    params,
    dtrain,
    num_boost_round=5000,
    nfold=5,
    early_stopping_rounds=20,
    callbacks=[monitor],
)

# A LR of 0.1 provides about 550 trees, which is ok
print("Best boosting round with default params:\n")
cvm.tail(1)

[0]	train-gamma-nloglik:4270.02550	test-gamma-nloglik:4270.02868
[50]	train-gamma-nloglik:35.72487	test-gamma-nloglik:35.72862
[100]	train-gamma-nloglik:9.83816	test-gamma-nloglik:9.84827
[150]	train-gamma-nloglik:9.68193	test-gamma-nloglik:9.71143
[200]	train-gamma-nloglik:9.65195	test-gamma-nloglik:9.69338
[250]	train-gamma-nloglik:9.63782	test-gamma-nloglik:9.68319
[300]	train-gamma-nloglik:9.62945	test-gamma-nloglik:9.67944
[350]	train-gamma-nloglik:9.62374	test-gamma-nloglik:9.67721
[400]	train-gamma-nloglik:9.61931	test-gamma-nloglik:9.67501
[450]	train-gamma-nloglik:9.61568	test-gamma-nloglik:9.67300
[500]	train-gamma-nloglik:9.61266	test-gamma-nloglik:9.67236
[550]	train-gamma-nloglik:9.61010	test-gamma-nloglik:9.67101
[563]	train-gamma-nloglik:9.60950	test-gamma-nloglik:9.67111
Best boosting round with default params:



Unnamed: 0,train-gamma-nloglik-mean,train-gamma-nloglik-std,test-gamma-nloglik-mean,test-gamma-nloglik-std
543,9.610437,0.016802,9.670975,0.080579


In [4]:
# STEP 3: Iterate randomized SearchCV for regularization parameters

# Set to False to skip the ~10 minute search and only reload the results
# that an earlier run stored in grid_file.
RERUN_SEARCH = True

if RERUN_SEARCH:
    # Or ParameterGrid(...) if grid is small enough to check all combinations
    grid = ParameterSampler(
        {
            "objective": ["reg:gamma"],
            "learning_rate": [0.1],
            "max_depth": [1],  # deciding for additivity
            "colsample_bynode": [0.8, 1],
            "subsample": [0.8, 1],
            "reg_lambda": [0, 1, 2, 3],
            "reg_alpha": [0, 1, 2, 3],
            "min_split_loss": [0, 0.001],
        },
        n_iter=50,
        random_state=9,
    )

    # Iterate over grid and save relevant information on disk
    search = []
    for g in tqdm(grid):
        cvm = xgb.cv(
            params=g,
            dtrain=dtrain,
            num_boost_round=5000,
            nfold=5,
            seed=67,
            early_stopping_rounds=20,
        )

        # Select the mean scores of the last kept round by column name instead of
        # by position, so the code does not rely on the column order of xgb.cv.
        # float() keeps the values plain Python floats for json.dump.
        last_round = cvm.iloc[-1]
        cv_score = float(last_round.filter(regex=r"^test-.*-mean$").iloc[0])
        train_score = float(last_round.filter(regex=r"^train-.*-mean$").iloc[0])

        # Keep number of rounds, cv score, train score, and parameters
        search.append((len(cvm), cv_score, train_score, g))

        # Rewrite the file after every combination: an interrupted search
        # keeps everything evaluated so far.
        with open(grid_file, "w") as f:
            json.dump(search, f)

# Load grid and check (A) sort order and (B) if grid ranges were set reasonable
with open(grid_file) as f:
    search = json.load(f)

# One row per evaluated combination, best (lowest) CV score first
search_df = pd.DataFrame.from_records(
    search, columns=["num_boost_round", "cv_score", "train_score", "params"]
).sort_values("cv_score")

# Show the parameter dictionaries in full instead of truncating them
with pd.option_context("display.max_colwidth", None):
    display(search_df.head())

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [11:31<00:00, 13.83s/it]


Unnamed: 0,num_boost_round,cv_score,train_score,params
43,422,9.676076,9.609622,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'objective': 'reg:gamma', 'min_split_loss': 0.001, 'max_depth': 1, 'learning_rate': 0.1, 'colsample_bynode': 0.8}"
4,422,9.676076,9.609622,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 1, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 1, 'learning_rate': 0.1, 'colsample_bynode': 0.8}"
0,422,9.676138,9.60969,"{'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha': 1, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 1, 'learning_rate': 0.1, 'colsample_bynode': 0.8}"
49,422,9.676615,9.60947,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 1, 'learning_rate': 0.1, 'colsample_bynode': 0.8}"
33,565,9.677658,9.608753,"{'subsample': 1, 'reg_lambda': 0, 'reg_alpha': 3, 'objective': 'reg:gamma', 'min_split_loss': 0.001, 'max_depth': 1, 'learning_rate': 0.1, 'colsample_bynode': 0.8}"


In [7]:
# search_df is sorted by CV score, so its first row holds the best combination
best = search_df.iloc[0, :]
best["num_boost_round"], best["params"]

(422,
 {'subsample': 0.8,
  'reg_lambda': 1,
  'reg_alpha': 1,
  'objective': 'reg:gamma',
  'min_split_loss': 0.001,
  'max_depth': 1,
  'learning_rate': 0.1,
  'colsample_bynode': 0.8})

In [6]:
# Refit on the training data with the best parameter combination and
# the number of boosting rounds found for it by cross-validation
model_xgb = XGBRegressor(
    n_estimators=best["num_boost_round"], random_state=59, **best["params"]
)
_ = model_xgb.fit(X_train, y_train)