### **Library Imports**

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, \
                            roc_auc_score, \
                            precision_recall_fscore_support, \
                            classification_report, \
                            confusion_matrix

In [2]:
class CFG(object):
    def __init__(self,
                 seed: int = 42,
                 n_splits: int = 5,
                 show_info: bool = False,
                 ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.data_read_path = "../input/cardiovascular-disease-dataset/cardio_train.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

### **Utilities and Helpers**

In [3]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")
    

def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    auc = roc_auc_score(y_pred, y_true)
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

In [4]:
class Pipelines(object):
    def __init__(self, preprocessor, seed: int):
        self.model = Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                ("classifier", GradientBoostingClassifier(random_state=seed)),
            ]
        )

### **Normal**

In [5]:
df = pd.read_csv(cfg.data_read_path, delimiter=";")
df = df.drop(columns=["id"])
df.age /= 365.25

y = df.cardio.copy().values

df = df.drop(columns=["cardio"])

df["bmi"] = df.weight / (df.height / 100)**2
df["smoke_alco"] = [df.smoke[i] and df.alco[i] for i in range(df.shape[0])]
df["smoke_alco_active"] = [df.smoke[i] and df.alco[i] and df.active[i] for i in range(df.shape[0])]
df["ap_mean"] = (df.ap_hi + df.ap_lo) / 2

X = df.copy().values

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Simple_Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

In [6]:
best_auc = 0.0
best_acc = 0.0
best_pre = 0.0
best_rec = 0.0
best_f1  = 0.0

fold = 1

breaker()
for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
    X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
    my_pipeline = Pipelines(preprocessor, cfg.seed)
    my_pipeline.model.fit(X_train, y_train)

    y_pred = my_pipeline.model.predict(X_valid)
    acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
    print(f"Fold {fold}\n")
    print_scores(acc, auc, pre, rec, f1)
    print("")
    print(confusion_matrix(y_valid, y_pred))
    breaker()

    if acc > best_acc:
        best_acc = acc
        best_acc_fold = fold
    
    if auc > best_auc:
        best_auc = auc
        best_auc_fold = fold

    if pre[1] > best_pre:
        best_pre = pre[1]
        best_pre_fold = fold
    
    if rec[1] > best_rec:
        best_rec = rec[1]
        best_rec_fold = fold
    
    if f1[1] > best_f1:
        best_f1 = f1[1]
        best_f1_fold = fold
    
    fold += 1

print(f"Best ACC Fold : {best_acc_fold}")
print(f"Best AUC Fold : {best_auc_fold}")
print(f"Best PRE Fold : {best_pre_fold}")
print(f"Best REC Fold : {best_rec_fold}")
print(f"Best F1 Fold  : {best_f1_fold}")

breaker()


**************************************************

Fold 1

Accuracy  : 0.74093
ROC-AUC   : 0.74245
Precision : [0.78019462 0.70179692]
Recall    : [0.72278934 0.76211863]
F-Score   : [0.75039571 0.73071498]

[[5452 1536]
 [2091 4921]]

**************************************************

Fold 2

Accuracy  : 0.73786
ROC-AUC   : 0.74050
Precision : [0.78883774 0.68757094]
Recall    : [0.71350507 0.76750079]
F-Score   : [0.74928269 0.72534052]

[[5484 1468]
 [2202 4846]]

**************************************************

Fold 3

Accuracy  : 0.73186
ROC-AUC   : 0.73326
Precision : [0.76935229 0.69475629]
Recall    : [0.71379081 0.75273287]
F-Score   : [0.74053083 0.72258351]

[[5357 1606]
 [2148 4889]]

**************************************************

Fold 4

Accuracy  : 0.73657
ROC-AUC   : 0.73798
Precision : [0.78007307 0.69160372]
Recall    : [0.72335158 0.75260828]
F-Score   : [0.75064233 0.72081756]

[[5551 1565]
 [2123 4761]]

**************************************************


### **Grid Search**

In [7]:
param_grid = {
    "n_estimators" : (50, 100, 150, 200),
    "learning_rate" : (1, 1e-1, 1e-2, 1e-3),
}

sc_X = StandardScaler()
si_mean = SimpleImputer(missing_values=np.nan, strategy="mean")

X = si_mean.fit_transform(X)
X = sc_X.fit_transform(X)

my_pipeline = Pipelines(preprocessor, cfg.seed)

breaker()
gscv_model = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=cfg.seed),
    param_grid=param_grid,
    cv=cfg.n_splits,
    verbose=3,
    return_train_score=True,
    ).fit(X, y)
breaker()


**************************************************

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END learning_rate=1, n_estimators=50;, score=(train=0.746, test=0.729) total time=   4.1s
[CV 2/5] END learning_rate=1, n_estimators=50;, score=(train=0.743, test=0.739) total time=   4.2s
[CV 3/5] END learning_rate=1, n_estimators=50;, score=(train=0.745, test=0.728) total time=   4.1s
[CV 4/5] END learning_rate=1, n_estimators=50;, score=(train=0.744, test=0.732) total time=   4.2s
[CV 5/5] END learning_rate=1, n_estimators=50;, score=(train=0.745, test=0.730) total time=   4.2s
[CV 1/5] END learning_rate=1, n_estimators=100;, score=(train=0.753, test=0.725) total time=   8.4s
[CV 2/5] END learning_rate=1, n_estimators=100;, score=(train=0.749, test=0.737) total time=   8.4s
[CV 3/5] END learning_rate=1, n_estimators=100;, score=(train=0.751, test=0.724) total time=   8.1s
[CV 4/5] END learning_rate=1, n_estimators=100;, score=(train=0.751, test=0.730) total time

In [8]:
breaker()
print(f"Best Estimator : {gscv_model.best_estimator_}")
print(f"Best Score     : {gscv_model.best_score_}")
print(f"Best Fold      : {gscv_model.best_index_}")
breaker()
print("Best Parameters\n")
for k, v in gscv_model.best_params_.items():
    # Left Justify the text making it 14 characte3rs long
    print(f"{k:<14} : {v}")
breaker()


**************************************************

Best Estimator : GradientBoostingClassifier(n_estimators=200, random_state=42)
Best Score     : 0.7369571428571429
Best Fold      : 7

**************************************************

Best Parameters

learning_rate  : 0.1
n_estimators   : 200

**************************************************



### **Best Grid Seached Model**

In [9]:
df = pd.read_csv(cfg.data_read_path, delimiter=";")
df = df.drop(columns=["id"])
df.age /= 365.25

y = df.cardio.copy().values

df = df.drop(columns=["cardio"])

df["bmi"] = df.weight / (df.height / 100)**2
df["smoke_alco"] = [df.smoke[i] and df.alco[i] for i in range(df.shape[0])]
df["smoke_alco_active"] = [df.smoke[i] and df.alco[i] and df.active[i] for i in range(df.shape[0])]
df["ap_mean"] = (df.ap_hi + df.ap_lo) / 2

X = df.iloc[:, :-1].copy().values

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Simple_Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", GradientBoostingClassifier(random_state=cfg.seed, 
                                                  n_estimators=gscv_model.best_params_["n_estimators"], 
                                                  learning_rate=gscv_model.best_params_["learning_rate"])),
    ]
)

breaker()
model.fit(X, y)

with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp: pickle.dump(model, fp)


**************************************************

