In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [3]:
X_train = pd.read_csv("../outputs/imputed_X_train_SMOTE.csv")
y_train = pd.read_csv("../outputs/imputed_y_train_SMOTE.csv")
X_test = pd.read_csv("../outputs/imputed_X_test_scaled.csv")
y_test = pd.read_csv("../outputs/imputed_y_test.csv")

In [4]:
X_train.shape, X_test.shape

((981, 39), (330, 39))

In [5]:
numerical_col = ["Age"]
categorical_col = list(X_train.columns)
categorical_col.remove("Age")
X_train[categorical_col] = X_train[categorical_col].astype('category')

In [6]:
categorical_features = X_train.columns[X_train.dtypes == 'category']
cat_col_numbers = [X_train.columns.get_loc(col) for col in X_train.select_dtypes(include="category")]
categorical_features

Index(['GenderCategory', 'RaceCategory', 'EthnicityCategory', 'Region',
       'Glipizide_Final', 'Glimepiride_Final', 'Glyburide_Final',
       'Metformin_Final', 'Pioglitazone_Final', 'Rosiglitazone_Final',
       'Beta_Blockers_Final', 'ACE_Inhibitors_Final', 'ARB_Final',
       'Diuretics_Final', 'PPI_Final', 'Levothyroxine_Final', 'CCB_Final',
       'Vasodilators_Final', 'Statins_Final', 'Anti_Platelets_Final',
       'Anti_Coagulants_Final', 'Steroids_Final', 'Heart_Disease_Final',
       'Hypothyroid_Final', 'Anemia_Final', 'Kidney_Disease_Final',
       'GERD_Final', 'Neuropathy_Final', 'Eye_Disorder_Final',
       'Atherosclerosis_Final', 'Alzheimer_Final', 'FootUlcer_Final',
       'Abnormal_Glucose_Final', 'DMScreen_Final', 'A1C_Final',
       'GlucoseTest_Final', 'DM_Drugs', 'AnyLab'],
      dtype='object')

In [7]:
lbl = LabelEncoder()
for col in categorical_col:
    X_train[col] = lbl.fit_transform(X_train[col])
    X_test[col] = lbl.transform(X_test[col])

# Modelling

In [8]:
clf = LogisticRegression(penalty="l1", solver = "liblinear")
clf.fit(X_train,y_train.values.ravel())
clf_coeff = pd.DataFrame(clf.coef_, columns=X_train.columns)
clf_coeff = clf_coeff.T.reset_index()
clf_coeff[clf_coeff[0] == 0].index

Int64Index([4, 5, 18, 24, 25, 28, 32, 34, 36], dtype='int64')

In [10]:
n_splits = 5

model_params = {
    "cat_1" : {
        "objective" : "CrossEntropy",
        "eval_metric" : 'AUC',
        "learning_rate" : 0.001, 
        "max_depth" : 7,
        # "random_state" : 56,
        "subsample" : 0.35,
        # 'early_stopping_rounds': 500,
        # 'ignored_features' : [4, 5, 18, 24, 25, 28, 32, 34, 36],
        "n_estimators": 10,
        "cat_features" : cat_col_numbers,
        "verbose" : 0        
    },


    "hgb_1" : {
        "learning_rate":0.015,
        "n_iter_no_change":100,
        "l2_regularization" : 0.02,
        "random_state" : 42,
        "categorical_features" : cat_col_numbers
        
    },

    "lgbm_1" :
    {
        "objective" : "binary",
        "metric" : 'AUC',
        "learning_rate" : 0.018, 
        "max_depth" : 15,
        "random_state" : 56,
        "reg_alpha" : 0.01246,
        "reg_lambda": 0.023483,
        "subsample" : 0.35,
        "colsample_bytree" : 0.8,
        "verbose" : 0,
        "num_leaves" : 16,
        "categorical_features" : cat_col_numbers,
        "device" : "gpu",
        "verbose" : -1,
    }
}

hgb_1 = HistGradientBoostingClassifier(**model_params["hgb_1"])
lgbm_1 = LGBMClassifier(**model_params["lgbm_1"])
cat_1 = CatBoostClassifier(**model_params["cat_1"])

In [11]:
oofs_dict = {model + "_oof" : np.zeros(len(X_train)) for model in model_params.keys()}

In [12]:
def eval(y_true, y_pred, y_pred_proba):
    acc_score = accuracy_score(y_true, y_pred)
    conf_mat = confusion_matrix(y_true,y_pred)

    prec_score = precision_score(y_true, y_pred) 
    rec_score = recall_score(y_true, y_pred)
    F1_score = f1_score(y_true, y_pred)

    roc_auc = roc_auc_score(y_true, y_pred)
    roc_auc_predict_proba = roc_auc_score(y_true, y_pred_proba)

    print(f'Accuracy: {acc_score}')

    print("Precision_score :", prec_score)
    print("recall_score :", rec_score)
    print("f1_score:", F1_score)

    print(f'ROC AUC score: {roc_auc}')
    print(f'ROC AUC PROBA: {roc_auc_predict_proba}')

    print(conf_mat)

    print("==" * 25)

In [13]:
kf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
model_auc = {model + "_auc" : []  for model in model_params.keys()}


for num_fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train.values)):
    
    print("="*20, "Fold : ", num_fold, "="*20)
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index].values.ravel()
    y_val_fold = y_train.iloc[val_index].values.ravel()
    

    cat_1.fit(X_train_fold,y_train_fold, eval_set=(X_val_fold,y_val_fold))
    oofs_dict["cat_1_oof"][val_index] = cat_1.predict(X_val_fold)
    y_pred = cat_1.predict(X_val_fold)
    y_pred_proba = cat_1.predict_proba(X_val_fold)[:, 1]
    # print('Fold', num_fold, 'CAT_1: ')
    model_auc["cat_1_auc"].append(roc_auc_score(y_val_fold, y_pred))
    eval(y_val_fold,y_pred, y_pred_proba)


    hgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["hgb_1_oof"][val_index] = hgb_1.predict(X_val_fold)
    y_pred = hgb_1.predict(X_val_fold)
    y_pred_proba = hgb_1.predict_proba(X_val_fold)[:, 1]
    model_auc["hgb_1_auc"].append(roc_auc_score(y_val_fold, y_pred))
    # print('Fold', num_fold, 'HGB_1: ')
    eval(y_val_fold,y_pred, y_pred_proba)
    
    
    lgbm_1.fit(X_train_fold, y_train_fold)
    oofs_dict["lgbm_1_oof"][val_index] = lgbm_1.predict(X_val_fold)
    y_pred = lgbm_1.predict(X_val_fold)
    y_pred_proba = lgbm_1.predict_proba(X_val_fold)[:, 1]
    # print('Fold', num_fold, 'LGBM_1: ')
    model_auc["lgbm_1_auc"].append(roc_auc_score(y_val_fold, y_pred))
    eval(y_val_fold,y_pred, y_pred_proba)


Accuracy: 0.8121827411167513
Precision_score : 0.7857142857142857
recall_score : 0.908256880733945
f1_score: 0.8425531914893617
ROC AUC score: 0.8007193494578816
ROC AUC PROBA: 0.855660967472894
[[61 27]
 [10 99]]
Accuracy: 0.817258883248731
Precision_score : 0.7829457364341085
recall_score : 0.926605504587156
f1_score: 0.8487394957983192
ROC AUC score: 0.8042118432026689
ROC AUC PROBA: 0.8876146788990826
[[ 60  28]
 [  8 101]]
Accuracy: 0.8477157360406091
Precision_score : 0.8062015503875969
recall_score : 0.9541284403669725
f1_score: 0.8739495798319327
ROC AUC score: 0.8350187656380317
ROC AUC PROBA: 0.8990825688073395
[[ 63  25]
 [  5 104]]
Accuracy: 0.7755102040816326
Precision_score : 0.751937984496124
recall_score : 0.8899082568807339
f1_score: 0.8151260504201681
ROC AUC score: 0.7610460824633556
ROC AUC PROBA: 0.8795739744806496
[[55 32]
 [12 97]]
Accuracy: 0.8316326530612245
Precision_score : 0.8114754098360656
recall_score : 0.908256880733945
f1_score: 0.8571428571428572
ROC A

In [14]:
sum(model_auc["cat_1_auc"])/5, sum(model_auc["hgb_1_auc"])/5, sum(model_auc["lgbm_1_auc"])/5

(0.8006901105327238, 0.8344688581480737, 0.8238423063280702)

# HGB is better classifier than LGBM

### Training HGB on full train data and testing it before Hyperparameter tuning:

In [15]:
roc_list = []
kf = StratifiedKFold(n_splits=5, random_state=2023, shuffle=True)
for num_fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train.values)):

    
    # print("="*20, "Fold : ", num_fold, "="*20)
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index].values.ravel()
    y_val_fold = y_train.iloc[val_index].values.ravel()

    
    hgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["hgb_1_oof"][val_index] = hgb_1.predict(X_val_fold)
    y_pred = hgb_1.predict(X_val_fold)
    y_pred_proba = hgb_1.predict_proba(X_val_fold)[:, 1]
    print('Fold', num_fold, 'HGB_1: ')
    roc = roc_auc_score(y_val_fold,y_pred)
    print("ROC: ", roc)
    roc_list.append(roc)

print("Average ROC: ",sum(roc_list)/len(roc_list))
    # eval(y_val_fold,y_pred, y_pred_proba)

Fold 0 HGB_1: 
ROC:  0.8326209341117599
Fold 1 HGB_1: 
ROC:  0.8690287883581146
Fold 2 HGB_1: 
ROC:  0.7943688706105663
Fold 3 HGB_1: 
ROC:  0.8047031530106507
Fold 4 HGB_1: 
ROC:  0.7886217441737846
Average ROC:  0.8178686980529752


In [17]:
import optuna

def objective(trial,data=X_train,target=y_train.values.ravel()):
    train_x, val_x, train_y, val_y = train_test_split(data, target, test_size=0.33,random_state=42, stratify=target)
    params = {
        'l2_regularization': trial.suggest_float('l2_regularization',1e-2,10, log = True),
        'early_stopping': trial.suggest_categorical('early_stopping', [False, True]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001,0.1, log = True),
        # 'max_iter': trial.suggest_categorical('max_iter', [750, 1000, 1250, 1500]),
        'max_depth': trial.suggest_int('max_depth', 2,8),
        'max_bins': trial.suggest_int('max_bins', 5,255),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3,8),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 20,80),
        "categorical_features" : cat_col_numbers
    }

    model = HistGradientBoostingClassifier(**params)
    model.fit(train_x, train_y)
    predictions = model.predict(val_x)
    auc = roc_auc_score(val_y, predictions)
    print(auc)
    return auc

In [242]:
%%time
study = optuna.create_study(direction='maximize')
optuna.logging.set_verbosity(optuna.logging.ERROR)
study.optimize(objective, n_trials=999)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

0.8113636363636365
0.8
0.8454545454545455
0.8113636363636365
0.7886363636363636
0.7931818181818182
0.8750000000000001
0.7886363636363636
0.7886363636363636
0.7681818181818181
0.8636363636363635
0.85
0.8750000000000001
0.8727272727272727
0.85
0.8659090909090909
0.8318181818181818
0.8704545454545454
0.8909090909090909
0.7909090909090909
0.8727272727272727
0.8750000000000001
0.8363636363636363
0.8840909090909091
0.7954545454545455
0.825
0.825
0.8704545454545454
0.8272727272727273
0.8272727272727273
0.8159090909090909
0.8954545454545454
0.9159090909090909
0.8659090909090909
0.8568181818181818
0.825
0.85
0.8659090909090909
0.8704545454545454
0.8045454545454546
0.825
0.8727272727272727
0.8545454545454545
0.859090909090909
0.8840909090909091
0.8159090909090909
0.8931818181818181
0.8409090909090909
0.8386363636363636
0.8840909090909091
0.9159090909090909
0.8954545454545454
0.9068181818181817
0.8954545454545454
0.8977272727272727
0.9068181818181817
0.8977272727272727
0.8977272727272727
0.893181

In [245]:
study.best_trial.params

{'l2_regularization': 0.0034244177726349837,
 'early_stopping': False,
 'learning_rate': 0.07915435645346343,
 'max_depth': 8,
 'max_bins': 255,
 'min_samples_leaf': 8,
 'max_leaf_nodes': 53}

In [244]:
print(study.best_trial.value)

0.9590909090909091


In [248]:
hgb_1 = HistGradientBoostingClassifier(**study.best_trial.params)
hgb_1.fit(X_train,y_train.values.ravel())
y_pred = hgb_1.predict(X_test)
y_pred_proba = hgb_1.predict_proba(X_test)[:,1]
eval(y_test, y_pred, y_pred_proba)

Accuracy: 0.8393939393939394
Precision_score : 0.8966789667896679
recall_score : 0.9067164179104478
f1_score: 0.901669758812616
ROC AUC score: 0.7275517573423207
ROC AUC PROBA: 0.7173206547905633
[[ 34  28]
 [ 25 243]]
