# LightGBM


add main project directory to path to access utils package


In [1]:
import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
sys.path.append(parent_dir)

import modules


In [2]:
import numpy as np
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

from utils.machine_learning import Rounder

In [3]:
train_data = pd.read_pickle("../../data/train_processed.pkl")
train_data

Unnamed: 0,PassengerNum,Age,HomePlanet,Destination,CabinDeck,CabinSide,CryoSleep,VIP,RoomService,FoodCourt,...,YesShoppingMall,YesSpa,YesVRDeck,YesTotalSpending,LogRoomService,LogFoodCourt,LogShoppingMall,LogSpa,LogVRDeck,LogTotalSpending
0,01,39.0,Europa,TRAPPIST-1e,B,P,False,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,01,24.0,Earth,TRAPPIST-1e,F,S,False,False,109.0,9.0,...,True,True,True,True,4.700480,2.302585,3.258097,6.309918,3.806662,6.602588
2,01,58.0,Europa,TRAPPIST-1e,A,S,False,True,43.0,3576.0,...,False,True,True,True,3.784190,8.182280,0.000000,8.812248,3.912023,9.248021
3,02,33.0,Europa,TRAPPIST-1e,A,S,False,False,0.0,1283.0,...,True,True,True,True,0.000000,7.157735,5.918894,8.110728,5.267858,8.551981
4,01,16.0,Earth,TRAPPIST-1e,F,S,False,False,303.0,70.0,...,True,True,True,True,5.717028,4.262680,5.023881,6.338594,1.098612,6.995766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,01,41.0,Europa,55 Cancri e,A,P,False,True,0.0,6819.0,...,False,True,True,True,0.000000,8.827615,0.000000,7.404888,4.317488,9.052165
8689,01,18.0,Earth,PSO J318.5-22,G,S,True,False,0.0,0.0,...,False,False,False,False,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8690,01,26.0,Earth,TRAPPIST-1e,G,S,False,False,0.0,0.0,...,True,True,False,True,0.000000,0.000000,7.535297,0.693147,0.000000,7.535830
8691,01,32.0,Europa,55 Cancri e,E,S,False,False,0.0,1049.0,...,False,True,True,True,0.000000,6.956545,0.000000,5.869297,8.082093,8.442039


In [4]:
df = train_data.copy()
X = df.drop(
    columns=[
        "Transported",
        "TotalSpending",
        "LogRoomService",
        "LogFoodCourt",
        "LogShoppingMall",
        "LogSpa",
        "LogVRDeck",
        "LogTotalSpending",
        "LogTotalSpending",
    ]
)
y = df["Transported"]
numerical_columns = list(X.select_dtypes(include="number").drop(columns="CabinBin"))
categorical_columns = list(X.select_dtypes(include=["object"]))
set_config(transform_output="pandas")

cat_pipeline = Pipeline([("one_hot", OneHotEncoder(sparse_output=False))])
num_pipeline = Pipeline(
    [
        ("imputer", IterativeImputer(random_state=0)),
        ("scaler", StandardScaler()),
    ]
)
ord_pipeline = Pipeline(
    [
        ("oe", OrdinalEncoder()),
        ("imputer", IterativeImputer(random_state=0)),
        ("rounder", Rounder(decimals=0)),
    ]
)
feature_preprocessing = ColumnTransformer(
    [
        ("cat", cat_pipeline, categorical_columns),
        ("num", num_pipeline, numerical_columns),
        ("ord", ord_pipeline, ["CabinBin"]),
    ],
    verbose_feature_names_out=False,
)

In [5]:
X_processed = feature_preprocessing.fit_transform(X)

In [6]:
model = LGBMClassifier()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_processed, y, scoring="accuracy", cv=cv, n_jobs=-1)
print("Accuracy: %.3f (%.3f)" % (np.mean(n_scores), np.std(n_scores)))

[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Number of positive: 3940, number of negative: 3883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.296834 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1511
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503643 -> initscore=0.014573
[LightGBM] [Info] Start training from score 0.014573
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.295748 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1514
[LightGBM] [Info] Number of data points in the train set: 7823, number of used features: 66
[LightGBM] [Info] [binary:

With no hyperparameter tuning we're almost at the best version of our autoML.

Let's do some hyperparameter tuning


In [7]:
def objective(trial, data=X_processed, target=y):
    data = data.copy()

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
    }

    model = LGBMClassifier(**param)
    return cross_val_score(
        model, data, target, scoring="accuracy", n_jobs=-1, cv=5
    ).mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

trial = study.best_trial

print("Accuracy: {}".format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[I 2024-10-17 23:12:51,754] A new study created in memory with name: no-name-d5f5c645-f1bd-4ea1-80e2-c2e15e05fc0d
[I 2024-10-17 23:12:52,835] Trial 0 finished with value: 0.7839681416842742 and parameters: {'lambda_l1': 5.982141070874863e-06, 'lambda_l2': 0.9080277394315109, 'num_leaves': 70, 'feature_fraction': 0.4427644919082101, 'bagging_fraction': 0.7009305985832924, 'bagging_freq': 2, 'min_child_samples': 95, 'learning_rate': 0.4628930131688978, 'max_depth': 17}. Best is trial 0 with value: 0.7839681416842742.
[I 2024-10-17 23:12:53,446] Trial 1 finished with value: 0.7867289442565499 and parameters: {'lambda_l1': 0.016928647280392558, 'lambda_l2': 7.802010453884022, 'num_leaves': 221, 'feature_fraction': 0.508415881215528, 'bagging_fraction': 0.481675594612943, 'bagging_freq': 5, 'min_child_samples': 71, 'learning_rate': 0.672750632823794, 'max_depth': 5}. Best is trial 1 with value: 0.7867289442565499.
[I 2024-10-17 23:12:54,264] Trial 2 finished with value: 0.7877626984279287 a

Accuracy: 0.8074335408297164
Best hyperparameters: {'lambda_l1': 9.491148511715021, 'lambda_l2': 1.604404937631013e-08, 'num_leaves': 118, 'feature_fraction': 0.9191598295606791, 'bagging_fraction': 0.730228902545114, 'bagging_freq': 2, 'min_child_samples': 22, 'learning_rate': 0.14228155266790352, 'max_depth': 19}


In [8]:
study.best_params

{'lambda_l1': 9.491148511715021,
 'lambda_l2': 1.604404937631013e-08,
 'num_leaves': 118,
 'feature_fraction': 0.9191598295606791,
 'bagging_fraction': 0.730228902545114,
 'bagging_freq': 2,
 'min_child_samples': 22,
 'learning_rate': 0.14228155266790352,
 'max_depth': 19}

In [12]:
model = LGBMClassifier(**study.best_params)

In [13]:
lgbm_pipeline = Pipeline(
    steps=[("preprocessor", feature_preprocessing), ("model", model)]
)

In [14]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
cross_val_score(
    lgbm_pipeline, X, y, scoring="accuracy", n_jobs=-1, cv=5, error_score="raise"
).mean()

[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005106 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1514
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM

0.8055926087436995

In [15]:
lgbm_pipeline.fit(X, y)

[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000836 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495


In [17]:
os.makedirs("../../ML_models_trained", exist_ok=True)

In [18]:
import pickle

model = lgbm_pipeline
model.fit(X, y)

with open(
    "../../ML_models_trained/light_gbm_model.pkl",
    "wb",
) as file:
    pickle.dump(model, file)

[LightGBM] [Info] Number of positive: 4378, number of negative: 4315
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1517
[LightGBM] [Info] Number of data points in the train set: 8693, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503624 -> initscore=0.014495
[LightGBM] [Info] Start training from score 0.014495
