### LightGBM and Optuna
* hyper-parameters are now evaluated with cross-validation

In [2]:
import pandas as pd
import numpy as np

# the GBM libraries
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# project helpers, see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

# silence every warning raised from here on, to keep the cell outputs readable
import warnings
warnings.filterwarnings('ignore')

import optuna

In [3]:
# globals: path of the CSV file with the training data (read with pd.read_csv)

FILE_TRAIN = "train.csv"

In [4]:
# read the raw training data from disk
data_orig = pd.read_csv(FILE_TRAIN)

#
# Data preparation, feature engineering
#

# enrich the frame with the features (hour, year) extracted from the timestamp
data_extended = add_features(data_orig)
all_columns = data_extended.columns

# name of the column to predict
TARGET = "count"

# columns left out of the model
# atemp and temp are strongly correlated (0.98), so only one of them is kept
del_columns = ['datetime', 'casual', 'registered', 'temp']

# treated as categorical: holiday, hour, season, weather, workingday, year
cat_cols = ['season', 'holiday','workingday', 'weather', 'hour', 'year']

# numerical = whatever is neither the target, nor ignored, nor categorical
num_cols = list(set(all_columns) - {TARGET, *del_columns, *cat_cols})

# sorted, so the feature order does not depend on the set iteration order
features = sorted([*cat_cols, *num_cols])

# remove the ignored columns
data_used = data_extended.drop(columns=del_columns)

In [6]:
# fit the encoders for the categorical columns (only season, weather, year) ...
le_list = train_encoders(data_used)

# ... and use them to encode the dataset
data_used = apply_encoders(data_used, le_list)

# positions of the categorical columns inside `features`
# (CatBoost wants indexes)
cat_columns_idxs = [pos for pos, name in enumerate(features) if name in cat_cols]

# the final train dataset: feature matrix and target vector
X = data_used.loc[:, features].values
y = data_used[TARGET].values

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [10]:
# general settings for the hyper-parameter search
FOLDS = 5            # number of cross-validation folds (passed as `cv` to cross_validate)
SEED = 4321          # random seed
N_TRIALS = 5         # number of Optuna trials (passed as `n_trials` to study.optimize)
STUDY_NAME = "gbm3"  # name given to the Optuna study

In [13]:
#
# Here we define what we do using Optuna
#
def objective(trial):
    """Score one Optuna trial: cross-validated rmsle of a LightGBM regressor.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample num_iterations, learning_rate and max_depth;
        num_leaves is derived from max_depth.

    Returns
    -------
    float
        Mean of the test scores over the FOLDS cross-validation folds, rounded
        to 4 decimals. The scorer is built with greater_is_better=False, so
        the score is the negated rmsle: the study has to maximize it.
    """
    # tuning on num_iterations, learning_rate and max_depth for the example
    dict_params = {
        "num_iterations": trial.suggest_categorical("num_iterations", [3000, 4000, 5000]),
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform (same log-uniform sampling)
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        "metrics" : ["rmse"],
        "verbose" : -1,
    }
    max_depth = trial.suggest_int("max_depth", 4, 10)
    # num_leaves is pinned to 2**max_depth (low == high): it goes through the
    # trial only to be recorded among the trial parameters
    num_leaves = trial.suggest_int("num_leaves", 2**(max_depth), 2**(max_depth))

    dict_params['max_depth'] = max_depth
    dict_params['num_leaves'] = num_leaves

    regr = lgb.LGBMRegressor(**dict_params)

    # using rmsle for scoring (negated by make_scorer, since lower is better)
    scorer = make_scorer(rmsle, greater_is_better=False)

    scores = cross_validate(regr, X, y, cv=FOLDS, scoring=scorer)

    avg_test_score = round(np.mean(scores['test_score']), 4)

    return avg_test_score

In [14]:
# launch Optuna Study
# TPE is Optuna's default sampler; it is created explicitly only to seed it,
# so that the sequence of suggested hyper-parameters is reproducible
sampler = optuna.samplers.TPESampler(seed=SEED)

# direction is "maximize" because the objective returns the negated rmsle
study = optuna.create_study(study_name=STUDY_NAME, direction="maximize", sampler=sampler)

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-09 11:44:08,014][0m A new study created in memory with name: gbm3[0m
[32m[I 2022-03-09 11:44:53,441][0m Trial 0 finished with value: -1.3121 and parameters: {'num_iterations': 3000, 'learning_rate': 0.00017125621912421937, 'max_depth': 8, 'num_leaves': 256}. Best is trial 0 with value: -1.3121.[0m
[32m[I 2022-03-09 11:45:18,176][0m Trial 1 finished with value: -1.3638 and parameters: {'num_iterations': 4000, 'learning_rate': 0.00010698031788920725, 'max_depth': 6, 'num_leaves': 64}. Best is trial 0 with value: -1.3121.[0m
[32m[I 2022-03-09 11:45:41,954][0m Trial 2 finished with value: -0.5258 and parameters: {'num_iterations': 5000, 'learning_rate': 0.0034648547200920796, 'max_depth': 6, 'num_leaves': 64}. Best is trial 2 with value: -0.5258.[0m
[32m[I 2022-03-09 11:47:29,543][0m Trial 3 finished with value: -1.2259 and parameters: {'num_iterations': 5000, 'learning_rate': 0.000141646290605655, 'max_depth': 9, 'num_leaves': 512}. Best is trial 2 with value:

In [15]:
# hyper-parameters of the best trial of the study
study.best_params

{'num_iterations': 5000,
 'learning_rate': 0.0034648547200920796,
 'max_depth': 6,
 'num_leaves': 64}

In [17]:
# every trial of the study, as a Pandas df
df = study.trials_dataframe()

# keep the completed trials only, ranked by descending objective value
result_df = df.loc[df['state'].eq('COMPLETE')].sort_values(by='value', ascending=False)

# best on top
result_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_num_iterations,params_num_leaves,state
2,2,-0.5258,2022-03-09 11:45:18.178690,2022-03-09 11:45:41.954595,0 days 00:00:23.775905,0.003465,6,5000,64,COMPLETE
3,3,-1.2259,2022-03-09 11:45:41.957671,2022-03-09 11:47:29.543232,0 days 00:01:47.585561,0.000142,9,5000,512,COMPLETE
4,4,-1.3099,2022-03-09 11:47:29.546341,2022-03-09 11:47:49.915313,0 days 00:00:20.368972,0.000116,5,5000,32,COMPLETE
0,0,-1.3121,2022-03-09 11:44:08.017368,2022-03-09 11:44:53.441192,0 days 00:00:45.423824,0.000171,8,3000,256,COMPLETE
1,1,-1.3638,2022-03-09 11:44:53.444319,2022-03-09 11:45:18.175618,0 days 00:00:24.731299,0.000107,6,4000,64,COMPLETE


### Train the model on the entire train set and save it

In [19]:
%%time

# maybe I should add saving of the best model (see num_iteration in the cell below)
# build the regressor from the best hyper-parameters found by the study
model = lgb.LGBMRegressor(**study.best_params)

# fit on the whole train set (no hold-out here)
model.fit(X, y)

CPU times: user 18.6 s, sys: 67.9 ms, total: 18.7 s
Wall time: 4.71 s


LGBMRegressor(learning_rate=0.0034648547200920796, max_depth=6,
              num_iterations=5000, num_leaves=64)

In [None]:
# file the trained booster is written to
model_file = "lgboost.txt"

# save the booster, passing the num_iterations value of the best trial
model.booster_.save_model(
    model_file,
    num_iteration=study.best_params['num_iterations'],
)