In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
# from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [3]:
X_train = pd.read_csv("../outputs/imputed_X_train_SMOTE.csv")
y_train = pd.read_csv("../outputs/imputed_y_train_SMOTE.csv")
X_test = pd.read_csv("../outputs/imputed_X_test_scaled.csv")
y_test = pd.read_csv("../outputs/imputed_y_test.csv")

In [4]:
X_train.shape, X_test.shape

((981, 39), (330, 39))

In [5]:
numerical_col = ["Age"]
categorical_col = list(X_train.columns)
categorical_col.remove("Age")
X_train[categorical_col] = X_train[categorical_col].astype('category')

In [12]:
categorical_features = X_train.columns[X_train.dtypes == 'category']
cat_col_numbers = [X_train.columns.get_loc(col) for col in X_train.select_dtypes(include="category")]
categorical_features


Index([], dtype='object')

In [7]:
lbl = LabelEncoder()
for col in categorical_col:
    X_train[col] = lbl.fit_transform(X_train[col])
    X_test[col] = lbl.transform(X_test[col])

# Modelling

In [13]:
n_estimators = 1000
n_splits = 5

model_params = {
    "hgb_1" : {
        "max_iter":n_estimators,
        "learning_rate":0.015,
        "loss":"log_loss",
        "n_iter_no_change":100,
        "l2_regularization" : 0.02,
        "random_state" : 42,
        "categorical_features" : cat_col_numbers
        
    },
    "lgbm_1" :{
        "objective" : "mae",
        "metric" : 'binary_logloss',
        "learning_rate" : 0.018, 
        "max_depth" : 15,
        "random_state" : 56,
        "reg_alpha" : 0.01246,
        "reg_lambda": 0.023483,
        "subsample" : 0.35,
        "colsample_bytree" : 0.8,
        "verbose" : 0,
        "num_leaves" : 16,
        "categorical_features" : cat_col_numbers
    }

}

hgb_1 = HistGradientBoostingClassifier(**model_params["hgb_1"])
xgb_1 = XGBClassifier(**model_params["xgb_1"])
lgbm_1 = LGBMClassifier(**model_params["lgbm_1"])

In [10]:
oofs_dict = {model + "_oof" : np.zeros(len(X_train)) for model in model_params.keys()}
oofs_dict;

In [29]:
kf = StratifiedKFold(n_splits=n_splits, random_state=2023, shuffle=True)

for num_fold, (train_index, val_index) in enumerate(kf.split(X_train,y_train.values)):

    X_train
    
    print("="*20, "Fold : ", num_fold, "="*20)
    X_train_fold = X_train.iloc[train_index]
    X_val_fold = X_train.iloc[val_index]
    y_train_fold = y_train.iloc[train_index].values.ravel()
    y_val_fold = y_train.iloc[val_index].values.ravel()

    hgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["hgb_1_oof"][val_index] = hgb_1.predict(X_val_fold)
    # predictions_dict["hgb_1_preds"] += hgb_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'HGB_1: ',roc_auc_score(y_val_fold, hgb_1.predict_proba(X_val_fold)[:, 1]))
    
    xgb_1.fit(X_train_fold, y_train_fold)
    oofs_dict["xgb_1_oof"][val_index] = xgb_1.predict(X_val_fold)
    # predictions_dict["xgb_1_preds"] += xgb_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'XGB_1: ',roc_auc_score(y_val_fold, xgb_1.predict_proba(X_val_fold)[:, 1]))
    
    lgbm_1.fit(X_train_fold, y_train_fold)
    oofs_dict["lgbm_1_oof"][val_index] = lgbm_1.predict(X_val_fold)
    # predictions_dict["lgbm_1_preds"] += lgbm_1.predict(X_test)/ n_splits
    print('Fold', num_fold, 'LGBM_1: ',roc_auc_score(y_val_fold, lgbm_1.predict_proba(X_val_fold)[:, 1]))

Fold 0 HGB_1:  0.9424520433694745
Fold 0 XGB_1:  0.5
Fold 0 LGBM_1:  0.90121976647206
Fold 1 HGB_1:  0.9657281451017611
Fold 1 XGB_1:  0.5
Fold 1 LGBM_1:  0.9022461246440999
Fold 2 HGB_1:  0.9345143941790572
Fold 2 XGB_1:  0.5
Fold 2 LGBM_1:  0.8690287883581146
Fold 3 HGB_1:  0.9297690604239165
Fold 3 XGB_1:  0.5
Fold 3 LGBM_1:  0.8751449963091849
Fold 4 HGB_1:  0.9000316355583676
Fold 4 XGB_1:  0.5
Fold 4 LGBM_1:  0.8501529051987767


In [23]:
print(classification_report(y_train, oofs_dict["xgb_1_oof"]))

              precision    recall  f1-score   support

         0.0       0.44      0.80      0.57       436
         1.0       0.55      0.20      0.29       545

    accuracy                           0.47       981
   macro avg       0.50      0.50      0.43       981
weighted avg       0.50      0.47      0.42       981



In [16]:
cv_results = lgb.cv(
    params,
    train_data,
    num_boost_round=1000,
    nfold=5,
    metrics='binary_logloss',
    stratified=True,
    seed=42
)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### DummyClassifier

In [26]:
from xgboost import XGBClassifier

# fit model no training data
model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)


accuracy_dt = accuracy_score(y_test, y_pred)
precision_dt = precision_score(y_test, y_pred)
recall_dt = recall_score(y_test, y_pred)
f1_score_dt = f1_score(y_test, y_pred)

# Calculate AUC for Decision Tree
y_probs_dt = model.predict_proba(X_test)[:, 1]
roc_auc_dt = roc_auc_score(y_test, y_probs_dt)

print("XGBoost Metrics:")
print(f"Accuracy: {accuracy_dt:.2f}")
print(f"Precision: {precision_dt:.2f}")
print(f"Recall: {recall_dt:.2f}")
print(f"F1 Score: {f1_score_dt:.2f}")
print(f"AUC: {roc_auc_dt:.2f}")

XGBoost Metrics:
Accuracy: 0.89
Precision: 0.93
Recall: 0.94
F1 Score: 0.93
AUC: 0.80


In [None]:
###  DummyClassifier