### CatBoost and Optuna

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import catboost as cat

from sklearn.model_selection import KFold

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

import optuna

In [2]:
# globals: file names of the input CSV datasets (the loading itself happens in a later cell)

FILE_TRAIN = "train.csv"
# NOTE(review): FILE_TEST is not referenced by any cell visible in this notebook
FILE_TEST = "test.csv"

In [3]:
# read the raw training data from the CSV file
data_orig = pd.read_csv(FILE_TRAIN)

# derive the extra features (hour, year) with the helper from utils.py
data_extended = data_orig.pipe(add_features)

In [4]:
# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year

In [5]:
all_columns = data_extended.columns

# columns that are dropped before modelling
# atemp and temp are strongly correlated (0.98), so only atemp is kept
del_columns = ["datetime", "casual", "registered", "temp"]

TARGET = "count"

# NOTE(review): "windspeed" is listed here but is not in the list of
# categoricals given in the comment cell above -- confirm it is intended
cat_cols = ["season", "holiday", "workingday", "weather", "windspeed", "hour", "year"]

# numerical columns = whatever is neither target, ignored, nor categorical
num_cols = list(set(all_columns).difference([TARGET], del_columns, cat_cols))

# sorted so that the feature order is stable across runs
features = sorted(cat_cols + num_cols)

# short summary of how the columns have been partitioned
summary = [
    ("All columns:", len(all_columns)),
    ("Ignored columns:", len(del_columns)),
    ("Target:", len([TARGET])),
    ("Categorical columns:", len(cat_cols)),
    ("Numerical columns:", len(num_cols)),
    ("All the features", len(features)),
]
for label, n_cols in summary:
    print(label, n_cols)

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [6]:
# keep only the target and the feature columns: remove everything in del_columns
data_used = data_extended.drop(columns=del_columns)

In [7]:
# encode the categorical columns:
# first fit the encoders on the data, then apply them
# (train_encoders / apply_encoders are helpers imported from utils.py)
le_list = train_encoders(data_used)
data_used = apply_encoders(data_used, le_list)

# positions, inside `features`, of the categorical columns
# (CatBoost wants indexes rather than names)
# NOTE(review): cat_columns_idxs is not used by any later cell visible here
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [8]:
# settings shared by cross-validation and the Optuna study
FOLDS = 5
SEED = 4321
N_TRIALS = 20
STUDY_NAME = "gbm1"

# feature matrix and target vector as NumPy arrays
X = data_used[features].to_numpy()
y = data_used[TARGET].to_numpy()

In [9]:
# CatBoost dataset wrapper consumed by cat.cv inside the Optuna objective
# NOTE(review): no cat_features are passed, so every column is handed to
# CatBoost as part of a plain array -- confirm this is intended

dtrain = cat.Pool(data=X, label=y)

In [10]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    """Score one hyper-parameter configuration with cross-validated CatBoost.

    The tuned hyper-parameters are the number of boosting iterations, the
    learning rate and the tree depth.  The function reads the notebook-level
    names `dtrain`, `FOLDS` and `SEED`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Last value of the `test-RMSE-mean` column returned by `cat.cv`,
        rounded to 4 decimals (the study minimises it).
    """
    dict_params = {
        # the trial parameter is named "num_boost_round" because
        # study.best_params is later unpacked straight into CatBoostRegressor
        "iterations": trial.suggest_categorical(
            "num_boost_round", [1000, 2000, 3000, 4000, 5000]
        ),
        # sampled on a log scale; suggest_float(log=True) replaces the
        # deprecated suggest_loguniform and samples the same distribution
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "loss_function": "RMSE",
        "depth": trial.suggest_int("depth", 4, 10),
        "verbose": 0,
    }

    history = cat.cv(
        params=dict_params,
        dtrain=dtrain,
        nfold=FOLDS,
        seed=SEED,
        logging_level="Silent",
    )

    # take the last row of the cross-validation history
    rmse = round(history["test-RMSE-mean"].values[-1], 4)

    return rmse

In [11]:
# launch Optuna Study
# the sampler is seeded explicitly so that re-running the notebook samples
# the same sequence of trials (TPE is also the default sampler)
study = optuna.create_study(
    study_name=STUDY_NAME,
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-07 19:59:57,634][0m A new study created in memory with name: gbm1[0m
[32m[I 2022-03-07 20:00:08,447][0m Trial 0 finished with value: 62.2979 and parameters: {'num_boost_round': 1000, 'learning_rate': 0.0028818196420147815, 'depth': 8}. Best is trial 0 with value: 62.2979.[0m
[32m[I 2022-03-07 20:00:58,729][0m Trial 1 finished with value: 50.4127 and parameters: {'num_boost_round': 3000, 'learning_rate': 0.0012954249062606673, 'depth': 10}. Best is trial 1 with value: 50.4127.[0m
[32m[I 2022-03-07 20:01:05,213][0m Trial 2 finished with value: 65.3869 and parameters: {'num_boost_round': 1000, 'learning_rate': 0.008175166890944857, 'depth': 4}. Best is trial 1 with value: 50.4127.[0m
[32m[I 2022-03-07 20:01:14,483][0m Trial 3 finished with value: 47.645 and parameters: {'num_boost_round': 1000, 'learning_rate': 0.008786405077997652, 'depth': 7}. Best is trial 3 with value: 47.645.[0m
[32m[I 2022-03-07 20:01:40,797][0m Trial 4 finished with value: 55.8223 an

In [12]:
# display the hyper-parameters of the best trial of the study
study.best_params

{'num_boost_round': 4000, 'learning_rate': 0.005853153235859197, 'depth': 10}

In [20]:
# all the trials of the study as a Pandas DataFrame
df = study.trials_dataframe()

# keep the completed trials only, ordered by objective value (lowest first)
is_complete = df["state"] == "COMPLETE"
result_df = df.loc[is_complete].sort_values("value")

# best on top
result_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_depth,params_learning_rate,params_num_boost_round,state
19,19,40.5446,2022-03-07 20:11:03.646640,2022-03-07 20:12:12.708702,0 days 00:01:09.062062,10,0.005853,4000,COMPLETE
17,17,40.908,2022-03-07 20:09:20.753132,2022-03-07 20:10:13.914972,0 days 00:00:53.161840,9,0.006598,4000,COMPLETE
14,14,41.6883,2022-03-07 20:06:17.446093,2022-03-07 20:07:10.383730,0 days 00:00:52.937637,9,0.004262,4000,COMPLETE
5,5,42.7924,2022-03-07 20:01:40.797935,2022-03-07 20:02:23.841957,0 days 00:00:43.044022,8,0.003982,4000,COMPLETE
13,13,42.89,2022-03-07 20:05:34.561573,2022-03-07 20:06:17.445085,0 days 00:00:42.883512,8,0.003816,4000,COMPLETE


### Estimate RMSLE
* RMSLE is the metric used in the Kaggle competition

In [13]:
# cross-validated estimate of RMSLE, using the best hyper-parameters of the study

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

avg_rmsle = 0.0

#
# every fold yields a different pair of index arrays:
# one selects the training rows, the other the validation rows
#
for fold_no, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    print("Processing fold:", fold_no)

    # X and y were built from data_used[features] / data_used[TARGET],
    # so indexing them directly selects the same rows as slicing the DataFrame
    x_train, y_train = X[train_idx], y[train_idx]
    x_valid, y_valid = X[valid_idx], y[valid_idx]

    model = cat.CatBoostRegressor(**study.best_params)
    model.fit(x_train, y_train, silent=True)

    raw_preds = model.predict(x_valid)

    # negative predictions are replaced by zero (to avoid nan for rmsle)
    y_valid_preds = np.where(raw_preds >= 0, raw_preds, 0)

    # each fold contributes 1/FOLDS of its score to the average
    avg_rmsle += rmsle(y_valid_preds, y_valid) / float(FOLDS)

print()
print("Avg. RMSLE:", round(avg_rmsle, 4))
print()

Processing fold: 1
Processing fold: 2
Processing fold: 3
Processing fold: 4
Processing fold: 5

Avg. RMSLE: 0.4018



### Train the model on the entire train set and save it

In [23]:
%%time
model = cat.CatBoostRegressor(**study.best_params)

model.fit(x_train, y_train, silent=True)

CPU times: user 47.5 s, sys: 4.78 s, total: 52.3 s
Wall time: 15.6 s


<catboost.core.CatBoostRegressor at 0x7f9c8865dbe0>

In [22]:
# file the fitted model is written to
model_file = "catboost.cbm"

# persist the model on disk, using the "cbm" format
model.save_model(model_file, format="cbm")