### Lgbm and Optuna
* changed with cross validation

In [2]:
import pandas as pd
import numpy as np

# the GBM used
import xgboost as xgb
import catboost as cat
import lightgbm as lgb

from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

import warnings

warnings.filterwarnings("ignore")

import optuna

In [3]:
# globals and load train dataset

FILE_TRAIN = "train.csv"

In [4]:
# load train dataset
data_orig = pd.read_csv(FILE_TRAIN)

#
# Data preparation, feature engineering
#

# add features (hour, year) extracted form timestamp
data_extended = add_features(data_orig)

# ok, we will treat as categorical: holiday, hour, season, weather, workingday, year
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one
del_columns = ["datetime", "casual", "registered", "temp"]

TARGET = "count"
cat_cols = ["season", "holiday", "workingday", "weather", "hour", "year"]
num_cols = list(set(all_columns) - set([TARGET]) - set(del_columns) - set(cat_cols))
features = sorted(cat_cols + num_cols)

# drop ignored columns
data_used = data_extended.drop(del_columns, axis=1)

In [5]:
# Code categorical columns (only season, weather, year)
le_list = train_encoders(data_used)

# coding
data_used = apply_encoders(data_used, le_list)

# define indexes for cat_cols
# cat boost want indexes
cat_columns_idxs = [i for i, col in enumerate(features) if col in cat_cols]

# finally we have the train dataset
X = data_used[features].values
y = data_used[TARGET].values

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [6]:
# general
FOLDS = 7
SEED = 4321
N_TRIALS = 60
STUDY_NAME = "gbm3"

In [7]:
#
# Here we define what we do using Optuna
#
def objective(trial):

    # tuning on max_depth, n_estimators for the example
    dict_params = {
        "num_iterations": trial.suggest_categorical(
            "num_iterations", [3000, 4000, 5000]
        ),
        "learning_rate": trial.suggest_loguniform("learning_rate", low=1e-4, high=1e-2),
        "metrics": ["rmse"],
        "verbose": -1,
    }
    max_depth = trial.suggest_int("max_depth", 4, 10)
    num_leaves = trial.suggest_int("num_leaves", 2 ** (max_depth), 2 ** (max_depth))

    dict_params["max_depth"] = max_depth
    dict_params["num_leaves"] = num_leaves

    regr = lgb.LGBMRegressor(**dict_params)

    # using rmsle for scoring
    scorer = make_scorer(rmsle, greater_is_better=False)

    scores = cross_validate(regr, X, y, cv=FOLDS, scoring=scorer)

    avg_test_score = round(np.mean(scores["test_score"]), 4)

    return avg_test_score

In [8]:
# launch Optuna Study
study = optuna.create_study(study_name=STUDY_NAME, direction="maximize")

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-09 20:42:40,270][0m A new study created in memory with name: gbm3[0m
[32m[I 2022-03-09 20:44:55,393][0m Trial 0 finished with value: -1.2306 and parameters: {'num_iterations': 3000, 'learning_rate': 0.00022491643627371854, 'max_depth': 10, 'num_leaves': 1024}. Best is trial 0 with value: -1.2306.[0m
[32m[I 2022-03-09 20:45:29,737][0m Trial 1 finished with value: -0.5385 and parameters: {'num_iterations': 3000, 'learning_rate': 0.008243270372124339, 'max_depth': 8, 'num_leaves': 256}. Best is trial 1 with value: -0.5385.[0m
[32m[I 2022-03-09 20:46:04,867][0m Trial 2 finished with value: -1.1615 and parameters: {'num_iterations': 4000, 'learning_rate': 0.00023139298087528442, 'max_depth': 6, 'num_leaves': 64}. Best is trial 1 with value: -0.5385.[0m
[32m[I 2022-03-09 20:49:03,362][0m Trial 3 finished with value: -1.1785 and parameters: {'num_iterations': 4000, 'learning_rate': 0.00020015343211910778, 'max_depth': 10, 'num_leaves': 1024}. Best is trial 1 with 

[32m[I 2022-03-09 21:36:21,682][0m Trial 35 finished with value: -0.5614 and parameters: {'num_iterations': 4000, 'learning_rate': 0.0007705635481575268, 'max_depth': 10, 'num_leaves': 1024}. Best is trial 32 with value: -0.4551.[0m
[32m[I 2022-03-09 21:38:11,335][0m Trial 36 finished with value: -0.4668 and parameters: {'num_iterations': 4000, 'learning_rate': 0.0010684052370007874, 'max_depth': 9, 'num_leaves': 512}. Best is trial 32 with value: -0.4551.[0m
[32m[I 2022-03-09 21:41:08,568][0m Trial 37 finished with value: -1.0091 and parameters: {'num_iterations': 4000, 'learning_rate': 0.00031186471147232167, 'max_depth': 10, 'num_leaves': 1024}. Best is trial 32 with value: -0.4551.[0m
[32m[I 2022-03-09 21:41:27,120][0m Trial 38 finished with value: -0.6645 and parameters: {'num_iterations': 5000, 'learning_rate': 0.0021862618043088557, 'max_depth': 4, 'num_leaves': 16}. Best is trial 32 with value: -0.4551.[0m
[32m[I 2022-03-09 21:43:20,597][0m Trial 39 finished with 

In [9]:
study.best_params

{'num_iterations': 4000,
 'learning_rate': 0.001214554022260763,
 'max_depth': 9,
 'num_leaves': 512}

In [10]:
# visualize trials as an ordered Pandas df
df = study.trials_dataframe()

result_df = df[df["state"] == "COMPLETE"].sort_values(by=["value"], ascending=False)

# best on top
result_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_num_iterations,params_num_leaves,state
32,32,-0.4551,2022-03-09 21:29:32.818017,2022-03-09 21:31:18.642596,0 days 00:01:45.824579,0.001215,9,4000,512,COMPLETE
19,19,-0.4583,2022-03-09 21:03:47.002722,2022-03-09 21:05:31.029041,0 days 00:01:44.026319,0.001277,9,4000,512,COMPLETE
24,24,-0.4587,2022-03-09 21:13:42.090333,2022-03-09 21:16:07.003726,0 days 00:02:24.913393,0.001393,10,4000,1024,COMPLETE
18,18,-0.4588,2022-03-09 21:02:26.153530,2022-03-09 21:03:47.000883,0 days 00:01:20.847353,0.001512,9,3000,512,COMPLETE
54,54,-0.4609,2022-03-09 21:59:11.131690,2022-03-09 22:00:23.502942,0 days 00:01:12.371252,0.001274,8,4000,256,COMPLETE


### train the model on entire train set and save

In [11]:
%%time

# maybe I shoud add save best model (see nu_iteration in cell below)
model = lgb.LGBMRegressor(**study.best_params)

model.fit(X, y)

CPU times: user 1min 1s, sys: 152 ms, total: 1min 2s
Wall time: 15.7 s


LGBMRegressor(learning_rate=0.001214554022260763, max_depth=9,
              num_iterations=4000, num_leaves=512)

In [12]:
model_file = "lgboost.txt"

model.booster_.save_model(model_file, num_iteration=study.best_params["num_iterations"])

<lightgbm.basic.Booster at 0x7f46b363e2b0>