### Tuning HP: XGBoost and Optuna
* using scikit-learn's `cross_validate` for cross-validation
* using RMSLE as the metric

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 

import optuna

In [2]:
# globals

# path of the training CSV, passed to pd.read_csv when the dataset is loaded
FILE_TRAIN = "train.csv"

In [3]:
# read the raw training set from disk
data_orig = pd.read_csv(FILE_TRAIN)

#
# Data preparation, feature engineering
#

# derive the extra features (hour, year) from the timestamp
data_extended = add_features(data_orig)

all_columns = data_extended.columns

# name of the column to predict
TARGET = "count"

# columns left out of the training set;
# temp is dropped because it is strongly correlated (0.98) with atemp, which is kept
del_columns = ['datetime', 'casual', 'registered', 'temp']

# columns treated as categorical
cat_cols = ['season', 'holiday', 'workingday', 'weather', 'hour', 'year']

# numerical columns: whatever is neither the target, nor excluded, nor categorical
num_cols = list(set(all_columns) - {TARGET, *del_columns, *cat_cols})

# alphabetical order, so the feature layout does not depend on set ordering
features = sorted(num_cols + cat_cols)

# remove the excluded columns
data_used = data_extended.drop(columns=del_columns)

In [4]:
# fit the encoders for the categorical columns that need coding
# (only season, weather, year)
le_list = train_encoders(data_used)

# replace the raw values with the encoded ones
data_used = apply_encoders(data_used, le_list)

# column layout: features first, target (count) last
data_used = data_used.loc[:, features + [TARGET]]

# positions of the categorical columns inside `features`
# (not used at the moment, kept because it can be useful later)
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]

# the train dataset as arrays: feature matrix and target vector
y = data_used[TARGET].values
X = data_used[features].values

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [5]:
# settings for the HPO session with Optuna

# number of cross-validation folds, and seed value
FOLDS = 7
SEED = 4321

# number of trials to run and name given to the study
N_TRIALS = 60
STUDY_NAME = "gbm11"

# search space
# learning-rate bounds
LR_LOW = 1e-4
LR_HIGH = 1e-2
# tree-depth bounds
DEPTH_LOW = 5
DEPTH_HIGH = 10
# candidate numbers of boosting iterations: 3000 to 5000 in steps of 500
N_ITER_LIST = list(range(3000, 5001, 500))

In [8]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    """Optuna objective: cross-validated, sign-flipped RMSLE of an XGBoost regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``learning_rate``, ``max_depth`` and
        ``num_boost_round``.

    Returns
    -------
    float
        Mean of the per-fold test scores, rounded to 4 decimals. The scorer
        negates ``rmsle`` (``greater_is_better=False``), so higher is better
        and the study has to maximize this value.
    """
    # Sampled hyper-parameters. The trial names are kept unchanged because
    # study.best_params is read with these keys once the study has finished.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    learning_rate = trial.suggest_float("learning_rate", LR_LOW, LR_HIGH, log=True)
    max_depth = trial.suggest_int("max_depth", DEPTH_LOW, DEPTH_HIGH)
    num_boost_round = trial.suggest_categorical("num_boost_round", N_ITER_LIST)

    # num_boost_round is not a parameter of the scikit-learn wrapper: the number
    # of trees is set through n_estimators only, so it is not forwarded as an
    # extra keyword any more.
    regr = xgb.XGBRegressor(
        verbosity=0,
        objective="reg:squarederror",
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=num_boost_round,
        # explicit seed, so every trial is evaluated with the same model seed
        random_state=SEED,
    )

    # rmsle is an error measure, hence greater_is_better=False:
    # make_scorer flips its sign and the best trial is the one with the highest score
    scorer = make_scorer(rmsle, greater_is_better=False)

    scores = cross_validate(regr, X, y, cv=FOLDS, scoring=scorer)

    avg_test_score = round(np.mean(scores['test_score']), 4)

    return avg_test_score

In [7]:
# launch Optuna Study

# Seed the sampler so that a re-run proposes the same sequence of trials.
# TPESampler is the sampler Optuna uses for a single-objective study when none
# is given, so only the seeding changes here.
study = optuna.create_study(
    study_name=STUDY_NAME,
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

# direction is "maximize" because the objective returns the sign-flipped rmsle
study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-09 15:44:16,912][0m A new study created in memory with name: gbm11[0m
[32m[I 2022-03-09 15:46:53,155][0m Trial 0 finished with value: -0.5475 and parameters: {'learning_rate': 0.00029979319507730237, 'max_depth': 7, 'num_boost_round': 4000}. Best is trial 0 with value: -0.5475.[0m
[32m[I 2022-03-09 15:50:35,766][0m Trial 1 finished with value: -1.0371 and parameters: {'learning_rate': 0.00011875324061696935, 'max_depth': 9, 'num_boost_round': 3500}. Best is trial 0 with value: -0.5475.[0m
[32m[I 2022-03-09 15:53:15,324][0m Trial 2 finished with value: -0.5084 and parameters: {'learning_rate': 0.005280182147906995, 'max_depth': 8, 'num_boost_round': 3500}. Best is trial 2 with value: -0.5084.[0m
[32m[I 2022-03-09 15:56:27,442][0m Trial 3 finished with value: -0.8908 and parameters: {'learning_rate': 0.00010266352067178083, 'max_depth': 7, 'num_boost_round': 5000}. Best is trial 2 with value: -0.5084.[0m
[32m[I 2022-03-09 15:58:57,673][0m Trial 4 finished 

In [None]:
# show the hyper-parameters of the best trial, on the line after the caption
print('Best parameters are:', study.best_params, sep='\n')

In [None]:
# all the trials of the study as a Pandas DataFrame
df = study.trials_dataframe()

# keep the completed trials only, ordered by score (highest value first)
result_df = df.loc[df['state'].eq('COMPLETE')].sort_values('value', ascending=False)

# best on top
result_df.head()

### Train the model with best params on train set and save

In [None]:
%%time
# Copy the best parameters and take num_boost_round out of them: it is the
# trial name for the number of trees, which the scikit-learn wrapper expects
# as n_estimators and not as an extra keyword.
best_params = dict(study.best_params)
n_estimators = best_params.pop('num_boost_round')

# same objective and seed as the models evaluated during the tuning
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=n_estimators,
    random_state=SEED,
    **best_params,
)

# final fit on the whole train set
model.fit(X, y)

In [None]:
# Persist the trained model to disk.
# NOTE(review): XGBoost appears to pick the serialization format from the file
# extension (.json / .ubj); with ".txt" the file is presumably not plain text,
# so confirm the format actually written before relying on this file.
model.save_model("xgboost.txt")