# Randomized search CV for the partly additive XGB model

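- Rerunning the randomized search takes about ten minutes
- Results may vary with the random seeds (parameter sampling, CV folds, final model fit)
- The code is essentially a copy of the Chapter 3 code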

In [1]:
# Rerunning takes ~10 minutes; results might depend on seed
import json
from pathlib import Path

import pandas as pd
import xgboost as xgb
from joblib import load
from sklearn.model_selection import ParameterSampler  # , ParameterGrid
from tqdm import tqdm
from xgboost import XGBRegressor

# File that receives the randomized-search results (written and read as JSON)
grid_file = "grid_partly_additive_xgb.txt"

# The train/test splits are produced by the exercise text, which must be run first
X_train, X_test, y_train, y_test = load("data.joblib")

# Wrap the training data in XGBoost's native data container
dtrain = xgb.DMatrix(data=X_train, label=y_train)

# STEP 1: Model for the expected response
# -> Gamma deviance is both strictly consistent and meaningful
# NOTE(review): reg:gamma presumes a strictly positive target (e.g. a claim
# amount) -- confirm against the exercise text

# Feature groups for XGBoost's interaction constraints: "DateNum" and "Female"
# each sit in a group of their own, all remaining features share one group.
# Only features listed in the same group are allowed to interact.
interaction_constraints = [
    ["DateNum"],
    ["Female"],
    ["LogInitial", "LogWeeklyPay", "LogDelay", "LogAge"]
    + ["PartTime", "Married", "WeekDay", "Hour"],
]

# STEP 2: Select learning rate to get reasonable number of trees by early stopping
params = dict(
    objective="reg:gamma",
    learning_rate=0.05,
    max_depth=2,
    interaction_constraints=interaction_constraints,
)

In [2]:
# k-fold cross-validation to see if number of trees is reasonable.
# Boosting runs for at most 5000 rounds and stops once the CV score has not
# improved for 20 rounds; progress is printed every 50 rounds.
# seed=67 fixes the fold assignment to the one used by the randomized search
# in step 3, so this baseline score is comparable with the search results.
cvm = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=5000,
    nfold=5,
    seed=67,
    early_stopping_rounds=20,
    callbacks=[xgb.callback.EvaluationMonitor(period=50)],
)

# A LR of 0.05 provides a few hundred trees (about 450), which is ok
print("Best boosting round with default params:\n")
# Last row of the CV history = scores at the final retained boosting round
cvm.tail(1)

[0]	train-gamma-nloglik:4488.80020	test-gamma-nloglik:4488.80385
[50]	train-gamma-nloglik:372.81397	test-gamma-nloglik:372.83600
[100]	train-gamma-nloglik:37.15666	test-gamma-nloglik:37.18060
[150]	train-gamma-nloglik:11.27299	test-gamma-nloglik:11.30717
[200]	train-gamma-nloglik:9.74567	test-gamma-nloglik:9.79096
[250]	train-gamma-nloglik:9.65464	test-gamma-nloglik:9.72069
[300]	train-gamma-nloglik:9.62408	test-gamma-nloglik:9.70658
[350]	train-gamma-nloglik:9.60559	test-gamma-nloglik:9.69668
[400]	train-gamma-nloglik:9.59130	test-gamma-nloglik:9.69075
[450]	train-gamma-nloglik:9.58100	test-gamma-nloglik:9.68994
[487]	train-gamma-nloglik:9.57439	test-gamma-nloglik:9.69069
Best boosting round with default params:



Unnamed: 0,train-gamma-nloglik-mean,train-gamma-nloglik-std,test-gamma-nloglik-mean,test-gamma-nloglik-std
467,9.57787,0.015424,9.689257,0.10268


In [4]:
# STEP 3: Iterate randomized SearchCV for regularization parameters
# Set to False to reuse the results stored in `grid_file` instead of repeating
# the search (about ten minutes). The search still runs when no results file
# exists yet, so the loading code below never hits a missing file.
rerun_search = True

if rerun_search or not Path(grid_file).exists():
    # Or ParameterGrid(...) if grid is small enough to check all combinations
    # Objective, learning rate, depth and interaction constraints are held
    # fixed (single-element lists); only the regularization parameters vary.
    grid = ParameterSampler(
        {
            "objective": ["reg:gamma"],
            "learning_rate": [0.05],
            "max_depth": [2],
            "interaction_constraints": [interaction_constraints],
            "colsample_bynode": [0.8, 1],
            "subsample": [0.8, 1],
            "reg_lambda": [0, 1, 2, 3],
            "reg_alpha": [0, 1, 2, 3],
            "min_split_loss": [0, 0.001],
        },
        n_iter=50,
        random_state=9,
    )

    # Iterate over grid and save relevant information on disk
    search = []
    for g in tqdm(grid):
        cvm = xgb.cv(
            params=g,
            dtrain=dtrain,
            num_boost_round=5000,
            nfold=5,
            seed=67,
            early_stopping_rounds=20,
        )
        # Keep number of rounds, cv score, train score, and parameters.
        # Columns 2 and 0 of the CV history are the test and train score means.
        search.append((len(cvm), *cvm.iloc[-1, [2, 0]], g))

        # Rewritten after every candidate, so an interrupted run keeps the
        # results collected so far
        with open(grid_file, "w") as f:
            json.dump(search, f)

# Load grid and check (A) sort order and (B) if grid ranges were set reasonable
with open(grid_file) as f:
    search = json.load(f)

# One row per sampled parameter combination, best (lowest) CV score first
search_df = pd.DataFrame.from_records(
    search, columns=["num_boost_round", "cv_score", "train_score", "params"]
).sort_values("cv_score")

# Lift the column-width limit so the full parameter dicts are visible
with pd.option_context("display.max_colwidth", None):
    display(search_df.head())

  0%|          | 0/50 [00:00<?, ?it/s]

100%|██████████| 50/50 [11:03<00:00, 13.27s/it]


Unnamed: 0,num_boost_round,cv_score,train_score,params
23,441,9.687053,9.583179,"{'subsample': 1, 'reg_lambda': 3, 'reg_alpha': 2, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 2, 'learning_rate': 0.05, 'interaction_constraints': [['DateNum'], ['Female'], ['LogInitial', 'LogWeeklyPay', 'LogDelay', 'LogAge', 'PartTime', 'Married', 'WeekDay', 'Hour']], 'colsample_bynode': 0.8}"
33,417,9.687826,9.587172,"{'subsample': 1, 'reg_lambda': 0, 'reg_alpha': 3, 'objective': 'reg:gamma', 'min_split_loss': 0.001, 'max_depth': 2, 'learning_rate': 0.05, 'interaction_constraints': [['DateNum'], ['Female'], ['LogInitial', 'LogWeeklyPay', 'LogDelay', 'LogAge', 'PartTime', 'Married', 'WeekDay', 'Hour']], 'colsample_bynode': 0.8}"
12,417,9.687826,9.587172,"{'subsample': 1, 'reg_lambda': 0, 'reg_alpha': 3, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 2, 'learning_rate': 0.05, 'interaction_constraints': [['DateNum'], ['Female'], ['LogInitial', 'LogWeeklyPay', 'LogDelay', 'LogAge', 'PartTime', 'Married', 'WeekDay', 'Hour']], 'colsample_bynode': 0.8}"
21,346,9.688511,9.60128,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 3, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 2, 'learning_rate': 0.05, 'interaction_constraints': [['DateNum'], ['Female'], ['LogInitial', 'LogWeeklyPay', 'LogDelay', 'LogAge', 'PartTime', 'Married', 'WeekDay', 'Hour']], 'colsample_bynode': 0.8}"
49,429,9.689165,9.580553,"{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'objective': 'reg:gamma', 'min_split_loss': 0, 'max_depth': 2, 'learning_rate': 0.05, 'interaction_constraints': [['DateNum'], ['Female'], ['LogInitial', 'LogWeeklyPay', 'LogDelay', 'LogAge', 'PartTime', 'Married', 'WeekDay', 'Hour']], 'colsample_bynode': 0.8}"


In [7]:
# Winning configuration: search_df is sorted ascending by cv_score, so the
# first row holds the lowest cross-validated score
best = search_df.iloc[0]
best["num_boost_round"], best["params"]

(441,
 {'subsample': 1,
  'reg_lambda': 3,
  'reg_alpha': 2,
  'objective': 'reg:gamma',
  'min_split_loss': 0,
  'max_depth': 2,
  'learning_rate': 0.05,
  'interaction_constraints': [['DateNum'],
   ['Female'],
   ['LogInitial',
    'LogWeeklyPay',
    'LogDelay',
    'LogAge',
    'PartTime',
    'Married',
    'WeekDay',
    'Hour']],
  'colsample_bynode': 0.8})

In [6]:
# Refit on the training data with the selected parameters; the number of
# trees is the boosting-round count recorded for that configuration
model_xgb = XGBRegressor(
    n_estimators=best["num_boost_round"],
    random_state=59,
    **best["params"],
)
# Assign the return value so the fitted estimator's repr is not displayed
_ = model_xgb.fit(X_train, y_train)