### Tuning HP: XGBoost and Optuna
* adopting sklearn cross validate
* using rmsle as metric

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

import optuna

In [2]:
# globals

FILE_TRAIN = "train.csv"

In [3]:
# load train dataset
data_orig = pd.read_csv(FILE_TRAIN)

#
# Data preparation, feature engineering
#

# add features (hour, year) extracted form timestamp
data_extended = add_features(data_orig)

# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one (atemp)
del_columns = ["datetime", "casual", "registered", "temp"]

TARGET = "count"
cat_cols = ["season", "holiday", "workingday", "weather", "hour", "year"]
num_cols = list(set(all_columns) - set([TARGET]) - set(del_columns) - set(cat_cols))
features = sorted(cat_cols + num_cols)

# drop ignored columns
data_used = data_extended.drop(del_columns, axis=1)

In [4]:
# Code categorical columns (only season, weather, year)
le_list = train_encoders(data_used)

# coding
data_used = apply_encoders(data_used, le_list)

# reorder columns, move count at the end
data_used = data_used[features + [TARGET]]

# define indexes for cat_cols
# not using now, but can be useful in future
cat_columns_idxs = [i for i, col in enumerate(features) if col in cat_cols]

# finally we have the train dataset
X = data_used[features].values
y = data_used[TARGET].values

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [5]:
# parameters for the HPO session with Optuna
FOLDS = 7
SEED = 4321

N_TRIALS = 60
STUDY_NAME = "gbm11"

# ranges
LR_LOW = 1e-4
LR_HIGH = 1e-2
DEPTH_LOW = 5
DEPTH_HIGH = 10
N_ITER_LIST = [3000, 3500, 4000, 4500, 5000]

In [6]:
#
# Here we define what we do using Optuna
#
def objective(trial):

    # tuning on these parameters
    # names are implementation (diff for xg etc)
    dict_params = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        "learning_rate": trial.suggest_loguniform(
            "learning_rate", low=LR_LOW, high=LR_HIGH
        ),
        "max_depth": trial.suggest_int("max_depth", DEPTH_LOW, DEPTH_HIGH),
        "num_boost_round": trial.suggest_categorical("num_boost_round", N_ITER_LIST),
    }

    # for XGBoost seems I have to pass esplicitely n_estimators
    regr = xgb.XGBRegressor(n_estimators=dict_params["num_boost_round"], **dict_params)

    # using rmsle for scoring
    # greater is better is Flase because it is an error measure
    # then make_scorer sign-flip and therefore we will maximize it to get the best
    scorer = make_scorer(rmsle, greater_is_better=False)

    scores = cross_validate(regr, X, y, cv=FOLDS, scoring=scorer)

    avg_test_score = round(np.mean(scores["test_score"]), 4)

    return avg_test_score

In [7]:
# launch Optuna Study

study = optuna.create_study(study_name=STUDY_NAME, direction="maximize")

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-09 16:03:31,972][0m A new study created in memory with name: gbm11[0m
[32m[I 2022-03-09 16:06:05,743][0m Trial 0 finished with value: -0.7384 and parameters: {'learning_rate': 0.00017483394431969703, 'max_depth': 7, 'num_boost_round': 4000}. Best is trial 0 with value: -0.7384.[0m
[32m[I 2022-03-09 16:13:27,303][0m Trial 1 finished with value: -0.4375 and parameters: {'learning_rate': 0.0006730970747687033, 'max_depth': 10, 'num_boost_round': 5000}. Best is trial 1 with value: -0.4375.[0m
[32m[I 2022-03-09 16:14:58,749][0m Trial 2 finished with value: -0.559 and parameters: {'learning_rate': 0.00043372976049416845, 'max_depth': 5, 'num_boost_round': 4000}. Best is trial 1 with value: -0.4375.[0m
[32m[I 2022-03-09 16:16:05,301][0m Trial 3 finished with value: -0.9 and parameters: {'learning_rate': 0.00017359027407523402, 'max_depth': 5, 'num_boost_round': 3000}. Best is trial 1 with value: -0.4375.[0m
[32m[I 2022-03-09 16:19:48,797][0m Trial 4 finished wi

[32m[I 2022-03-09 18:31:37,123][0m Trial 38 finished with value: -0.4523 and parameters: {'learning_rate': 0.0005647953100866222, 'max_depth': 7, 'num_boost_round': 4000}. Best is trial 31 with value: -0.4307.[0m
[32m[I 2022-03-09 18:36:59,973][0m Trial 39 finished with value: -0.5775 and parameters: {'learning_rate': 0.00020223997390042688, 'max_depth': 9, 'num_boost_round': 5000}. Best is trial 31 with value: -0.4307.[0m
[32m[I 2022-03-09 18:42:54,581][0m Trial 40 finished with value: -0.4425 and parameters: {'learning_rate': 0.001189778478476448, 'max_depth': 10, 'num_boost_round': 4000}. Best is trial 31 with value: -0.4307.[0m
[32m[I 2022-03-09 18:47:20,994][0m Trial 41 finished with value: -0.4336 and parameters: {'learning_rate': 0.0006193339528976119, 'max_depth': 9, 'num_boost_round': 4000}. Best is trial 31 with value: -0.4307.[0m
[32m[I 2022-03-09 18:52:23,789][0m Trial 42 finished with value: -0.4315 and parameters: {'learning_rate': 0.000914031222929688, 'max

In [8]:
print("Best parameters are:")
print(study.best_params)

Best parameters are:
{'learning_rate': 0.0009126590966928525, 'max_depth': 9, 'num_boost_round': 3500}


In [10]:
# visualize trials as an ordered Pandas df
df = study.trials_dataframe()

result_df = df[df["state"] == "COMPLETE"].sort_values(by=["value"], ascending=False)

# best on top
result_df.head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_num_boost_round,state
31,31,-0.4307,2022-03-09 17:59:26.357293,2022-03-09 18:03:22.582281,0 days 00:03:56.224988,0.000913,9,3500,COMPLETE
16,16,-0.4309,2022-03-09 17:03:10.300362,2022-03-09 17:07:05.585218,0 days 00:03:55.284856,0.000886,9,3500,COMPLETE
32,32,-0.4309,2022-03-09 18:03:22.583823,2022-03-09 18:07:18.272377,0 days 00:03:55.688554,0.000966,9,3500,COMPLETE
34,34,-0.431,2022-03-09 18:12:25.867916,2022-03-09 18:16:56.393726,0 days 00:04:30.525810,0.000981,9,4000,COMPLETE
13,13,-0.431,2022-03-09 16:49:06.414481,2022-03-09 16:54:44.789990,0 days 00:05:38.375509,0.000796,9,5000,COMPLETE
52,52,-0.4312,2022-03-09 19:34:25.122212,2022-03-09 19:39:29.304546,0 days 00:05:04.182334,0.000892,9,4500,COMPLETE
57,57,-0.4313,2022-03-09 19:54:03.989067,2022-03-09 19:59:07.758679,0 days 00:05:03.769612,0.000916,9,4500,COMPLETE
51,51,-0.4314,2022-03-09 19:29:20.605063,2022-03-09 19:34:25.120570,0 days 00:05:04.515507,0.000873,9,4500,COMPLETE
42,42,-0.4315,2022-03-09 18:47:20.995977,2022-03-09 18:52:23.789363,0 days 00:05:02.793386,0.000914,9,4500,COMPLETE
23,23,-0.4316,2022-03-09 17:30:29.338657,2022-03-09 17:34:25.317643,0 days 00:03:55.978986,0.000819,9,3500,COMPLETE


### Train the model with best params on train set and save

In [11]:
%%time
model = xgb.XGBRegressor(
    n_estimators=study.best_params["num_boost_round"], **study.best_params
)

model.fit(X, y)

CPU times: user 1min 58s, sys: 284 ms, total: 1min 58s
Wall time: 37.6 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.0009126590966928525,
             max_delta_step=0, max_depth=9, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=3500, n_jobs=4,
             num_boost_round=3500, num_parallel_tree=1, predictor='auto',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [12]:
model.save_model("xgboost.txt")