### Lgbm and Optuna

In [1]:
import pandas as pd
import numpy as np

# the GBM used
import lightgbm as lgb

from sklearn.model_selection import KFold

# to encode categoricals
from sklearn.preprocessing import LabelEncoder

# see utils.py
from utils import add_features, rmsle, train_encoders, apply_encoders

import warnings

warnings.filterwarnings("ignore")

import optuna

In [2]:
# globals and load train dataset

FILE_TRAIN = "train.csv"
FILE_TEST = "test.csv"

In [3]:
# load train dataset
data_orig = pd.read_csv(FILE_TRAIN)

#
# add features
#
data_extended = add_features(data_orig)

In [4]:
# ok, we will treat as categorical: holiday, hour, season, weather, windspeed, workingday, year

In [5]:
all_columns = data_extended.columns

# cols to be ignored
# atemp and temp are strongly correlated (0.98) we're taking only one
del_columns = ["datetime", "casual", "registered", "temp"]

TARGET = "count"
cat_cols = ["season", "holiday", "workingday", "weather", "windspeed", "hour", "year"]
num_cols = list(set(all_columns) - set([TARGET]) - set(del_columns) - set(cat_cols))
features = sorted(cat_cols + num_cols)

print("All columns:", len(all_columns))
print("Ignored columns:", len(del_columns))
print("Target:", len([TARGET]))
print("Categorical columns:", len(cat_cols))
print("Numerical columns:", len(num_cols))
print("All the features", len(features))

All columns: 14
Ignored columns: 4
Target: 1
Categorical columns: 7
Numerical columns: 2
All the features 9


In [6]:
# drop ignored columns
data_used = data_extended.drop(del_columns, axis=1)

In [7]:
# let's code categorical
le_list = train_encoders(data_used)

# coding
data_used = apply_encoders(data_used, le_list)

# define indexes for cat_cols
# cat boost want indexes
cat_columns_idxs = [i for i, col in enumerate(features) if col in cat_cols]

train for coding: season 
train for coding: weather 
train for coding: year 

Coding: season 
Coding: weather 
Coding: year 


In [11]:
# general
FOLDS = 5
SEED = 4321
N_TRIALS = 20
STUDY_NAME = "gbm3"

X = data_used[features].values
y = data_used[TARGET].values

In [12]:
# for LightGBM

# params is needed to remove a lot of annoying logs
dtrain = lgb.Dataset(X, y, params={"verbose": -1})

In [20]:
#
# Here we define what we do using Optuna
#
def objective(trial):

    # tuning on max_depth, n_estimators for the example
    dict_params = {
        "num_iterations": trial.suggest_categorical(
            "num_iterations", [1000, 2000, 3000, 4000, 5000]
        ),
        "learning_rate": trial.suggest_loguniform("learning_rate", low=1e-4, high=1e-2),
        "metrics": ["rmse"],
        "verbose": -1,
    }
    max_depth = trial.suggest_int("max_depth", 4, 10)
    num_leaves = trial.suggest_int("num_leaves", 2 ** (max_depth), 2 ** (max_depth))

    dict_params["max_depth"] = max_depth
    dict_params["num_leaves"] = num_leaves

    history = lgb.cv(
        params=dict_params, train_set=dtrain, nfold=FOLDS, seed=SEED, stratified=False
    )

    # take the last
    rmse = round(history["rmse-mean"][-1], 4)

    return rmse

In [21]:
# launch Optuna Study
study = optuna.create_study(study_name=STUDY_NAME, direction="minimize")

study.optimize(objective, n_trials=N_TRIALS)

[32m[I 2022-03-07 19:21:22,711][0m A new study created in memory with name: gbm3[0m
[32m[I 2022-03-07 19:21:46,734][0m Trial 0 finished with value: 66.9929 and parameters: {'num_iterations': 2000, 'learning_rate': 0.0008184268759992632, 'max_depth': 8, 'num_leaves': 256}. Best is trial 0 with value: 66.9929.[0m
[32m[I 2022-03-07 19:22:03,547][0m Trial 1 finished with value: 141.8963 and parameters: {'num_iterations': 3000, 'learning_rate': 0.0001314164451232725, 'max_depth': 6, 'num_leaves': 64}. Best is trial 0 with value: 66.9929.[0m
[32m[I 2022-03-07 19:22:20,518][0m Trial 2 finished with value: 145.4384 and parameters: {'num_iterations': 3000, 'learning_rate': 0.00011645906663238375, 'max_depth': 6, 'num_leaves': 64}. Best is trial 0 with value: 66.9929.[0m
[32m[I 2022-03-07 19:22:30,931][0m Trial 3 finished with value: 66.4997 and parameters: {'num_iterations': 4000, 'learning_rate': 0.0012405601530012212, 'max_depth': 4, 'num_leaves': 16}. Best is trial 3 with value

In [28]:
study.best_params

{'num_iterations': 2000,
 'learning_rate': 0.009794385997891245,
 'max_depth': 10,
 'num_leaves': 1024}

In [33]:
# visualize trials as an ordered Pandas df
df = study.trials_dataframe()

result_df = df[df["state"] == "COMPLETE"].sort_values(by=["value"], ascending=True)

# best on top
result_df.head()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_num_iterations,params_num_leaves,state
13,13,42.6527,2022-03-07 19:25:57.860036,2022-03-07 19:26:23.081479,0 days 00:00:25.221443,0.009794,10,2000,1024,COMPLETE
16,16,42.6534,2022-03-07 19:27:34.965119,2022-03-07 19:28:06.110567,0 days 00:00:31.145448,0.009899,9,4000,512,COMPLETE
12,12,42.6554,2022-03-07 19:25:28.163052,2022-03-07 19:25:57.858496,0 days 00:00:29.695444,0.00659,10,2000,1024,COMPLETE
19,19,42.674,2022-03-07 19:29:30.833477,2022-03-07 19:29:49.851714,0 days 00:00:19.018237,0.009617,7,4000,128,COMPLETE
14,14,42.7261,2022-03-07 19:26:23.083017,2022-03-07 19:27:18.109263,0 days 00:00:55.026246,0.008396,10,5000,1024,COMPLETE


### estimate RMSLE
* it is the metric used in Kaggle competition

In [23]:
# make an estimation of rmsle on entire dataset

kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

avg_rmsle = 0.0

#
# at each iteration you get a different set of indexes
# from which you get different samples for train and validation dataset
#
for i, (train_idx, valid_idx) in enumerate(kf.split(data_used)):
    print("Processing fold:", i + 1)

    # here we split the DataFrame, using the indexes for the fold
    data_train = data_used.iloc[train_idx]
    data_valid = data_used.iloc[valid_idx]

    x_train = data_train[features].values
    y_train = data_train[TARGET].values
    x_valid = data_valid[features].values
    y_valid = data_valid[TARGET].values

    model = lgb.LGBMRegressor(**study.best_params)

    model.fit(x_train, y_train, verbose=-1)

    y_valid_preds = model.predict(x_valid)

    # clip to zero (to avoid nan for rmsle)
    y_valid_preds = np.where(y_valid_preds >= 0, y_valid_preds, 0)

    avg_rmsle += rmsle(y_valid_preds, y_valid) / float(FOLDS)

print()
print("Avg. RMSLE:", round(avg_rmsle, 4))
print()

Processing fold: 1
Processing fold: 2
Processing fold: 3
Processing fold: 4
Processing fold: 5

Avg. RMSLE: 0.4053



### train the model on entire train set and save

In [31]:
# maybe I shoud add save best model (see nu_iteration in cell below)
model = lgb.LGBMRegressor(**study.best_params)

model.fit(x_train, y_train)

LGBMRegressor(learning_rate=0.009794385997891245, max_depth=10,
              num_iterations=2000, num_leaves=1024)

In [32]:
model_file = "lgboost.txt"

model.booster_.save_model(model_file, num_iteration=study.best_params["num_iterations"])

<lightgbm.basic.Booster at 0x7ff9a682b040>