### XGBoost and Optuna

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb

from sklearn.model_selection import KFold

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders 

import optuna

In [2]:
# globals: file names of the train and test CSV files
# (nothing is loaded in this cell)
FILE_TRAIN, FILE_TEST = "train.csv", "test.csv"

In [3]:
# read the raw training data from the CSV file named by FILE_TRAIN
data_orig = pd.read_csv(filepath_or_buffer=FILE_TRAIN)

# add the engineered features (see add_features in utils.py);
# the raw frame is kept untouched in data_orig
data_extended = add_features(data_orig)

In [4]:
# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year

In [5]:
# every column available after feature engineering
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one
del_columns = ['datetime', 'casual', 'registered', 'temp']

TARGET = "count"

# NOTE(review): 'windspeed' is listed here, but the note in the previous cell
# names only holiday, hour, season, weather, workingday and year as
# categorical. The final `features` list is the same either way (it is the
# sorted union of cat_cols and num_cols); only the two printed counts and
# cat_columns_idxs would change. Confirm whether this is intended.
cat_cols = ['season', 'holiday','workingday', 'weather', 'windspeed', 'hour', 'year']

# numerical columns: everything that is not target, ignored or categorical.
# A comprehension over all_columns (instead of set arithmetic) keeps the
# DataFrame column order, so num_cols is identical on every run rather than
# depending on set iteration order.
excluded_cols = {TARGET, *del_columns, *cat_cols}
num_cols = [col for col in all_columns if col not in excluded_cols]

# sorted, so the column order of the feature matrix is stable
features = sorted(cat_cols + num_cols)

print('All columns:', len(all_columns))
print('Ignored columns:', len(del_columns))
print('Target:', len([TARGET]))
print('Categorical columns:', len(cat_cols))
print('Numerical columns:', len(num_cols))
print('All the features', len(features))

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [6]:
# keep everything except the ignored columns listed in del_columns
data_used = data_extended.drop(columns=del_columns)

In [7]:
# fit the encoders on the data (see train_encoders in utils.py)
le_list = train_encoders(data_used)

# replace the values with their encoded version, using the fitted encoders
data_used = apply_encoders(data_used, le_list)

# positions of the categorical columns inside the feature list
# (CatBoost wants indexes rather than names)
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [8]:
# general settings for cross-validation and for the Optuna study
FOLDS = 7    # number of cross-validation folds
SEED = 4321  # seed passed to the cross-validation splits
# train for longer, to see whether RMSLE can be reduced from 0.7
N_TRIALS = 60
STUDY_NAME = "gbm2"

# target vector and feature matrix as NumPy arrays
# (columns of X follow the order of `features`)
y = data_used[TARGET].values
X = data_used[features].values

In [9]:
# for XGBoost: wrap features and target in the DMatrix that xgb.cv
# receives as `dtrain` inside the Optuna objective
dtrain = xgb.DMatrix(data=X, label=y)

In [10]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    """Optuna objective: cross-validated RMSE of XGBoost for one trial.

    Samples learning_rate, max_depth and num_boost_round from ``trial``,
    runs ``xgb.cv`` on the notebook-level ``dtrain`` with ``FOLDS`` folds and
    seed ``SEED``, and scores the trial on the last boosting round.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean test RMSE across folds after the last boosting round,
        rounded to 4 decimals (lower is better).
    """
    # booster parameters; only learning_rate and max_depth are tuned here
    dict_params = {
        "verbosity": 0,
        "objective": "reg:squarederror",
        # suggest_float(..., log=True) samples on a log scale and replaces
        # the deprecated suggest_loguniform; the parameter name is unchanged
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "eval_metric": "rmse",
        "max_depth": trial.suggest_int("max_depth", 4, 10),
    }

    # the number of boosting rounds is an argument of xgb.cv, not a booster
    # parameter, so it is passed only as a keyword and kept out of dict_params
    n_rounds = trial.suggest_categorical("num_boost_round", [1000, 2000, 3000, 4000, 5000])

    history = xgb.cv(
        params=dict_params,
        dtrain=dtrain,
        nfold=FOLDS,
        seed=SEED,
        num_boost_round=n_rounds,
    )

    # take the last row: the score after all the boosting rounds
    rmse = round(float(history["test-rmse-mean"].values[-1]), 4)

    return rmse

In [None]:
# launch Optuna Study

# seed the sampler so that the same sequence of trials is proposed on every
# run; TPE is Optuna's default sampler, so only the seeding changes
sampler = optuna.samplers.TPESampler(seed=SEED)

study = optuna.create_study(study_name=STUDY_NAME, direction="minimize", sampler=sampler)

# objective returns the cross-validated RMSE, hence direction="minimize"
study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-08 17:09:17,851][0m A new study created in memory with name: gbm2[0m
[32m[I 2022-03-08 17:11:13,476][0m Trial 0 finished with value: 50.3394 and parameters: {'learning_rate': 0.001338134232361574, 'max_depth': 10, 'num_boost_round': 2000}. Best is trial 0 with value: 50.3394.[0m
[32m[I 2022-03-08 17:13:29,968][0m Trial 1 finished with value: 43.3153 and parameters: {'learning_rate': 0.0035008245434434105, 'max_depth': 6, 'num_boost_round': 5000}. Best is trial 1 with value: 43.3153.[0m
[32m[I 2022-03-08 17:16:44,695][0m Trial 2 finished with value: 74.0092 and parameters: {'learning_rate': 0.00041053116341917425, 'max_depth': 9, 'num_boost_round': 4000}. Best is trial 1 with value: 43.3153.[0m
[32m[I 2022-03-08 17:18:27,477][0m Trial 3 finished with value: 81.1703 and parameters: {'learning_rate': 0.0005458897213907421, 'max_depth': 7, 'num_boost_round': 3000}. Best is trial 1 with value: 43.3153.[0m
[32m[I 2022-03-08 17:19:01,876][0m Trial 4 finished wi

In [None]:
# show the hyper-parameters of the best trial of the study;
# the cells below read 'num_boost_round' and the other entries from here
study.best_params

In [None]:
# all the trials of the study as a pandas DataFrame
df = study.trials_dataframe()

# keep only the completed trials, ordered by objective value (ascending)
completed_mask = df['state'] == 'COMPLETE'
result_df = df.loc[completed_mask].sort_values(by='value')

# best on top
result_df.head()

### Estimate RMSLE
* RMSLE is the metric used in the Kaggle competition

In [None]:
# estimate RMSLE with K-fold cross-validation over the whole dataset

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# best_params stores the number of trees under 'num_boost_round' (the name
# used in the Optuna objective). XGBRegressor calls it n_estimators, so it
# is passed explicitly under that name and removed from the other
# parameters instead of being forwarded as an unknown keyword as well.
best_params = dict(study.best_params)
n_estimators = best_params.pop('num_boost_round')

# RMSLE obtained on the validation part of each fold
fold_scores = []

#
# at each iteration you get a different set of indexes
# from which you get different samples for train and validation dataset
#
for i, (train_idx, valid_idx) in enumerate(kf.split(data_used)):
    print("Processing fold:", i + 1)

    # here we split the DataFrame, using the indexes for the fold
    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]

    x_train = data_train[features].values
    y_train = data_train[TARGET].values
    x_valid = data_valid[features].values
    y_valid = data_valid[TARGET].values

    model = xgb.XGBRegressor(n_estimators=n_estimators, **best_params)

    model.fit(x_train, y_train)

    # clip to zero (to avoid nan for rmsle)
    y_valid_preds = np.clip(model.predict(x_valid), 0, None)

    fold_scores.append(rmsle(y_valid_preds, y_valid))

# every fold has the same weight in the average
avg_rmsle = float(np.mean(fold_scores))

print()
print('Avg. RMSLE:', round(avg_rmsle, 4))
print()

### Train the model on the entire training set and save it

In [None]:
%%time
# 'num_boost_round' is the study's name for the number of trees; XGBRegressor
# expects it as n_estimators, so it is left out of the forwarded parameters
final_params = {name: value for name, value in study.best_params.items()
                if name != 'num_boost_round'}

model = xgb.XGBRegressor(n_estimators=study.best_params['num_boost_round'],
                         **final_params)

# fit on the ENTIRE training set (X, y). Using x_train / y_train here would
# only reuse the training part of the last cross-validation fold, left over
# from the RMSLE estimation loop.
model.fit(X, y)

In [None]:
# persist the fitted model to disk
# NOTE(review): XGBoost chooses the serialization format from the file
# extension (.json / .ubj); with ".txt" the file is presumably not a
# human-readable text dump. Confirm against the installed XGBoost version
# before renaming, since other code may load this exact path.
model.save_model("xgboost.txt")