In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import re

from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score, accuracy_score

from skopt import BayesSearchCV

from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
!pip install scikit-learn-intelex

In [2]:
df = pd.read_csv('data.csv')

df = df.rename(columns = lambda x: re.sub('[^A-Za-z0-9_]+', '', str(x)))

In [3]:
X = df.drop('readmitted', axis=1)
y = df.readmitted

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
lgbc = lgb.LGBMClassifier()

lgbc.fit(X_train, y_train)

LGBMClassifier()

In [6]:
print('LightGBM roc_auc_score: {0:0.4f}'.format(roc_auc_score(y_test, lgbc.predict_proba(X_test)[:, 1])))

LightGBM roc_auc_score: 0.7287


In [7]:
lgbc = lgb.LGBMClassifier(max_depth=8, num_leaves=256, min_data_in_leaf=100, n_estimators=100)

lgbc.fit(X_train, y_train)



LGBMClassifier(max_depth=8, min_data_in_leaf=100, num_leaves=256)

In [19]:
print('LightGBM roc_auc_score: {0:0.4f}'.format(roc_auc_score(y_test, lgbc.predict_proba(X_test)[:, 1])))
print('LightGBM accuracy_score: {0:0.4f}'.format(accuracy_score(y_test, lgbc.predict(X_test))))

LightGBM roc_auc_score: 0.7294
LightGBM accuracy_score: 0.6677


In [9]:
param_grid = {
    "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    "num_leaves": list(np.int64(np.linspace(20, 3000, 100))),
    "min_data_in_leaf": [100, 500, 1000, 5000],
    "n_estimators": [100, 500, 1000, 5000]
}

In [11]:
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(lgb.LGBMClassifier(), param_distributions=param_grid, n_iter=5, cv=3, verbose=1)

In [12]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


RandomizedSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=5,
                   param_distributions={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9,
                                                      10, 11, 12],
                                        'min_data_in_leaf': [100, 500, 1000,
                                                             5000],
                                        'n_estimators': [100, 500, 1000, 5000],
                                        'num_leaves': [20, 50, 80, 110, 140,
                                                       170, 200, 230, 260, 290,
                                                       321, 351, 381, 411, 441,
                                                       471, 501, 531, 561, 591,
                                                       622, 652, 682, 712, 742,
                                                       772, 802, 832, 862, 892, ...]},
                   verbose=1)

In [13]:
rs.best_estimator_

LGBMClassifier(max_depth=10, min_data_in_leaf=1000, num_leaves=2608)

In [20]:
print('LightGBM roc_auc_score: {0:0.4f}'.format(roc_auc_score(y_test, rs.best_estimator_.predict_proba(X_test)[:, 1])))
print('LightGBM accuracy_score: {0:0.4f}'.format(accuracy_score(y_test, lgbc.predict(X_test))))

LightGBM roc_auc_score: 0.7295
LightGBM accuracy_score: 0.6677


In [15]:
opt = BayesSearchCV(
    lgb.LGBMClassifier(),
    {  
         "max_depth": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        "num_leaves": list(np.int64(np.linspace(20, 3000, 100))),
        "min_data_in_leaf": [100, 500, 1000, 5000],
        "n_estimators": [100, 500, 1000, 5000]
    },
    n_iter=10,
    cv=3,
    n_jobs=-1
)

In [16]:
opt.fit(X_train, y_train)



BayesSearchCV(cv=3, estimator=LGBMClassifier(), n_iter=10, n_jobs=-1,
              search_spaces={'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                             'min_data_in_leaf': [100, 500, 1000, 5000],
                             'n_estimators': [100, 500, 1000, 5000],
                             'num_leaves': [20, 50, 80, 110, 140, 170, 200, 230,
                                            260, 290, 321, 351, 381, 411, 441,
                                            471, 501, 531, 561, 591, 622, 652,
                                            682, 712, 742, 772, 802, 832, 862,
                                            892, ...]})

In [17]:
opt.best_params_

OrderedDict([('max_depth', 7),
             ('min_data_in_leaf', 1000),
             ('n_estimators', 100),
             ('num_leaves', 1675)])

In [21]:
print('LightGBM roc_auc_score: {0:0.4f}'.format(roc_auc_score(y_test, opt.best_estimator_.predict_proba(X_test)[:, 1])))
print('LightGBM accuracy_score: {0:0.4f}'.format(accuracy_score(y_test, lgbc.predict(X_test))))

LightGBM roc_auc_score: 0.7283
LightGBM accuracy_score: 0.6677
