### Hyperparameter tuning: CatBoost and Optuna
* using scikit-learn's `cross_validate` for cross-validation
* using RMSLE as the metric

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 

import optuna

In [2]:
# globals: path of the CSV file holding the training data

FILE_TRAIN = "train.csv"

In [3]:
# Read the raw training data from disk.
data_orig = pd.read_csv(FILE_TRAIN)

#
# Feature engineering and column selection
#

# add_features (see utils.py) adds the hour and year extracted from the timestamp
data_extended = add_features(data_orig)

all_columns = data_extended.columns

# Regression target.
TARGET = "count"

# Columns treated as categorical.
cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'year']

# Columns left out of the model. 'temp' is excluded because it is strongly
# correlated with 'atemp' (0.98), so only one of the two is kept.
del_columns = ['datetime', 'casual', 'registered', 'temp']

# Numerical columns: whatever is neither the target, nor ignored, nor categorical.
num_cols = list(set(all_columns).difference([TARGET], del_columns, cat_cols))

# Sorted so that the feature order does not depend on set iteration order.
features = sorted(cat_cols + num_cols)

# Working frame without the ignored columns.
data_used = data_extended.drop(columns=del_columns)

In [4]:
# Fit the encoders for the coded categorical columns (season, weather, year)
# and apply them to the working frame.
le_list = train_encoders(data_used)
data_used = apply_encoders(data_used, le_list)

# Positions of the categorical columns inside the (sorted) feature list.
# NOTE(review): cat_columns_idxs is not referenced by the training cells
# visible in this notebook - confirm whether it was meant to be passed to
# CatBoost as cat_features.
cat_columns_idxs = [pos for pos in range(len(features)) if features[pos] in cat_cols]

# Final training arrays: feature matrix and target vector.
X, y = data_used[features].values, data_used[TARGET].values

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [5]:
# Run-wide settings
STUDY_NAME = "gbm1"   # name given to the Optuna study
N_TRIALS = 5          # number of trials requested from Optuna
FOLDS = 5             # number of cross-validation folds
SEED = 4321           # random seed

In [6]:
#
# Optuna objective: one call evaluates one CatBoost configuration
#
def objective(trial):
    """Score one hyper-parameter configuration proposed by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the number of boosting iterations,
        the learning rate and the tree depth.

    Returns
    -------
    float
        Mean of the cross-validation test scores over FOLDS folds, rounded
        to 4 decimals. The scorer negates rmsle, so the value is the
        negated RMSLE and higher is better.
    """
    # Tuned here: iterations, learning_rate and depth.
    # The Optuna parameter names are later unpacked straight into
    # CatBoostRegressor via study.best_params, so they must be names CatBoost
    # accepts ("num_boost_round" is a CatBoost alias of "iterations").
    dict_params = {
        'iterations': trial.suggest_categorical("num_boost_round", [3000, 4000, 5000]),
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform and samples from the same log-uniform range.
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "loss_function": "RMSE",
        'depth': trial.suggest_int("depth", 5, 10),
        "verbose": 0,
    }

    regr = cat.CatBoostRegressor(**dict_params)

    # greater_is_better=False makes the scorer return the negated rmsle,
    # which is why the study is run with direction="maximize".
    scorer = make_scorer(rmsle, greater_is_better=False)

    scores = cross_validate(regr, X, y, cv=FOLDS, scoring=scorer)

    avg_test_score = round(np.mean(scores['test_score']), 4)

    return avg_test_score

In [7]:
# launch Optuna Study
# The sampler is seeded with SEED so the sequence of suggested
# hyper-parameters is repeatable across runs. TPESampler is Optuna's default
# sampler, so only the seed differs from the default behaviour.
study = optuna.create_study(
    study_name=STUDY_NAME,
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-09 14:14:25,157][0m A new study created in memory with name: gbm1[0m
[32m[I 2022-03-09 14:14:49,508][0m Trial 0 finished with value: -0.6352 and parameters: {'num_boost_round': 3000, 'learning_rate': 0.004836407679131681, 'depth': 6}. Best is trial 0 with value: -0.6352.[0m
KeyboardInterrupt: 

In [8]:
# hyper-parameters of the study's best trial
study.best_params

{'num_boost_round': 3000, 'learning_rate': 0.004836407679131681, 'depth': 6}

In [9]:
# Collect all trials in a pandas DataFrame.
df = study.trials_dataframe()

# Keep only the finished trials, ranked by objective value (highest first).
result_df = df.loc[df['state'].eq('COMPLETE')].sort_values('value', ascending=False)

# Show the top rows: the best trial comes first.
result_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_depth,params_learning_rate,params_num_boost_round,state
0,0,-0.6352,2022-03-09 14:14:25.165506,2022-03-09 14:14:49.508050,0 days 00:00:24.342544,6,0.004836,3000,COMPLETE


### Train the model on the entire training set and save it

In [None]:
%%time
model = cat.CatBoostRegressor(**study.best_params)

model.fit(X, y, silent=True)

In [None]:
# Persist the trained model in CatBoost's binary format.
model_file = "catboost.cbm"

model.save_model(fname=model_file, format="cbm")