In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

In [3]:
X_train = pd.read_csv("../outputs/imputed_X_train_SMOTE.csv")
y_train = pd.read_csv("../outputs/imputed_y_train_SMOTE.csv")
X_test = pd.read_csv("../outputs/imputed_X_test_scaled.csv")
y_test = pd.read_csv("../outputs/imputed_y_test.csv")

In [4]:
X_train.shape, X_test.shape

((981, 39), (330, 39))

In [5]:
numerical_col = ["Age"]
categorical_col = list(X_train.columns)
categorical_col.remove("Age")
X_train[categorical_col] = X_train[categorical_col].astype('category')

In [6]:
categorical_features = X_train.columns[X_train.dtypes == 'category']
categorical_features

Index(['GenderCategory', 'RaceCategory', 'EthnicityCategory', 'Region',
       'Glipizide_Final', 'Glimepiride_Final', 'Glyburide_Final',
       'Metformin_Final', 'Pioglitazone_Final', 'Rosiglitazone_Final',
       'Beta_Blockers_Final', 'ACE_Inhibitors_Final', 'ARB_Final',
       'Diuretics_Final', 'PPI_Final', 'Levothyroxine_Final', 'CCB_Final',
       'Vasodilators_Final', 'Statins_Final', 'Anti_Platelets_Final',
       'Anti_Coagulants_Final', 'Steroids_Final', 'Heart_Disease_Final',
       'Hypothyroid_Final', 'Anemia_Final', 'Kidney_Disease_Final',
       'GERD_Final', 'Neuropathy_Final', 'Eye_Disorder_Final',
       'Atherosclerosis_Final', 'Alzheimer_Final', 'FootUlcer_Final',
       'Abnormal_Glucose_Final', 'DMScreen_Final', 'A1C_Final',
       'GlucoseTest_Final', 'DM_Drugs', 'AnyLab'],
      dtype='object')

# Modelling

In [7]:
n_estimators = 1000
n_splits = 5

model_params ={
    "xgb_1" :  {
        'n_estimators': n_estimators,
        'reg_lambda': 0.023834, 
        'reg_alpha': 0.0411302070759748, 
        'colsample_bytree': 0.7,
        'subsample': 0.25, 
        'learning_rate': 0.02, 
        'max_depth': 12,
        'random_state': 2055, 
        'min_child_weight': 55,
        'eval_metric': 'logloss',
        'grow_policy': 'lossguide',
        'verbosity': 0, 
        "enable_categorical" : True,
    },
    
    "hgb_1" : {
        "max_iter":n_estimators,
        "learning_rate":0.015,
        "loss":"log_loss",
        "n_iter_no_change":100,
        "l2_regularization" : 0.02,
        "random_state" : 42,
        "categorical_features" : categorical_features
        
    },
    "lgbm_1" :{
        "objective" : "mae",
        "metric" : 'binary_logloss',
        "learning_rate" : 0.018, 
        "max_depth" : 15,
        "random_state" : 56,
        "reg_alpha" : 0.01246,
        "reg_lambda": 0.023483,
        "subsample" : 0.35,
        "colsample_bytree" : 0.8,
        "verbose" : 0,
        "num_leaves" : 16,
        "categorical_features" : categorical_features
    }

}

hgb_1 = HistGradientBoostingClassifier(**model_params["hgb_1"])
xgb_1 = XGBClassifier(**model_params["xgb_1"])
lgbm_1 = LGBMClassifier(**model_params["lgbm_1"])

In [8]:
oofs_dict = {model + "_oof" : np.zeros(len(X_train)) for model in model_params.keys()}
oofs_dict;

In [9]:
kf = StratifiedKFold(n_splits=n_splits, random_state=2023, shuffle=True)

for num_fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train.values)):
    
    print("="*20, "Fold : ", num_fold, "="*20)
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index]
    y_val_fold = y_train.iloc[val_index]

    hgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["hgb_1_oof"][val_index] = hgb_1.predict(X_val_fold)
    # predictions_dict["hgb_1_preds"] += hgb_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'HGB_1 MAE: ',classification_report(y_val_fold, oofs_dict["hgb_1_oof"][val_index]))
    
    xgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["xgb_1_oof"][val_index] = xgb_1.predict(X_val_fold)
    # predictions_dict["xgb_1_preds"] += xgb_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'XGB_1 MAE: ',classification_report(y_val_fold, oofs_dict["xgb_1_oof"][val_index]))
    
    lgbm_1.fit(X_train_fold, y_train_fold)
    oofs_dict["lgbm_1_oof"][val_index] = lgbm_1.predict(X_val_fold)
    # predictions_dict["lgbm_1_preds"] += lgbm_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'LGBM_1 MAE: ',classification_report(y_val_fold, oofs_dict["lgbm_1_oof"][val_index]))



ValueError: could not convert string to float: 'Male'

In [16]:
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    metrics='binary_logloss',
    stratified=True,
    seed=42
)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### DummyClassifier

In [26]:
from xgboost import XGBClassifier

# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


accuracy_dt = accuracy_score(y_test, y_pred)
precision_dt = precision_score(y_test, y_pred)
recall_dt = recall_score(y_test, y_pred)
f1_score_dt = f1_score(y_test, y_pred)

# Calculate AUC for Decision Tree
y_probs_dt = model.predict_proba(X_test)[:, 1]
roc_auc_dt = roc_auc_score(y_test, y_probs_dt)

print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_dt:.2f}")
print(f"Precision: {precision_dt:.2f}")
print(f"Recall: {recall_dt:.2f}")
print(f"F1 Score: {f1_score_dt:.2f}")
print(f"AUC: {roc_auc_dt:.2f}")

XGBoost Metrics:
Accuracy: 0.89
Precision: 0.93
Recall: 0.94
F1 Score: 0.93
AUC: 0.80


In [None]:
###  DummyClassifier