### **Library Imports**

In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV

from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier 

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

### **Utilities and Constants**

In [2]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")


def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    auc = roc_auc_score(y_pred, y_true)
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

### **Configuration**

In [3]:
class CFG(object):
    def __init__(self,
                 seed: int = 42,
                 n_splits: int = 5,
                 show_info: bool = False,
                 ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.data_read_path = "../input/cardiovascular-disease-dataset/cardio_train.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

### **Model**

In [4]:
class Pipelines(object):
    def __init__(self, model_name: str, preprocessor, seed: int):
        self.model_name = model_name

        if self.model_name == "lgr":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", LogisticRegression(random_state=seed)),
                ]
            )
        
        elif self.model_name == "knc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", KNeighborsClassifier()),
                ]
            )

        
        elif self.model_name == "dtc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", DecisionTreeClassifier(random_state=seed)),
                ]
            )

        elif self.model_name == "etc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreeClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "rfc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", RandomForestClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gbc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GradientBoostingClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "abc":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", AdaBoostClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "etcs":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", ExtraTreesClassifier(random_state=seed)),
                ]
            )
        
        elif self.model_name == "gnb":
            self.model = Pipeline(
                steps=[
                    ("preprocessor", preprocessor),
                    ("classifier", GaussianNB()),
                ]
            )

### **Train**

In [5]:
names = ["lgr", "knc", "gnb", "dtc", "etc", "abc", "gbc", "etcs", "rfc"]

df = pd.read_csv(cfg.data_read_path, delimiter=";")
df = df.drop(columns=["id"])

if cfg.show_info:
    breaker()
    for val in set(df.cardio):
        print(f"Class {val} count : {df[df.cardio == val].shape[0]}")

df.age /= 365.25
y = df.cardio.copy().values

df = df.drop(columns=["cardio"])

df["bmi"] = df.weight / (df.height / 100)**2
df["smoke_alco"] = [df.smoke[i] and df.alco[i] for i in range(df.shape[0])]
df["smoke_alco_active"] = [df.smoke[i] and df.alco[i] and df.active[i] for i in range(df.shape[0])]
df["ap_mean"] = (df.ap_hi + df.ap_lo) / 2

breaker()
df.head(5)


**************************************************

Class 0 count : 35021
Class 1 count : 34979

**************************************************



Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,bmi,smoke_alco,smoke_alco_active,ap_mean
0,50.35729,2,168,62.0,110,80,1,1,0,0,1,21.96712,0,0,95.0
1,55.381246,1,156,85.0,140,90,3,1,0,0,1,34.927679,0,0,115.0
2,51.627652,1,165,64.0,130,70,3,1,0,0,0,23.507805,0,0,100.0
3,48.249144,2,169,82.0,150,100,1,1,0,0,1,28.710479,0,0,125.0
4,47.841205,1,156,56.0,100,60,1,1,0,0,0,23.011177,0,0,80.0


In [6]:
X = df.iloc[:, :-1].copy().values

features = [i for i in range(X.shape[1])]

feature_transformer = Pipeline(
    steps=[
        ("Simple_Imputer", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Standard_Scaler", StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("features", feature_transformer, features),
    ]
)

best_auc = 0.0
for name in names:
    fold = 1
    if cfg.show_info: breaker()
    for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
        X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
        my_pipeline = Pipelines(name, preprocessor, cfg.seed)
        my_pipeline.model.fit(X_train, y_train)

        y_pred = my_pipeline.model.predict(X_valid)
        if cfg.show_info:
            acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
            print(f"{my_pipeline.model_name}, {fold}\n")
            print_scores(acc, auc, pre, rec, f1)
            print("")
        else:
            _, auc, _, _, _ = get_scores(y_valid, y_pred)

        if auc > best_auc:
            best_auc = auc
            model_fold_name = f"{name}_{fold}"
            
            with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp:
                pickle.dump(my_pipeline.model, fp)
        fold += 1
    

if cfg.show_info: 
    breaker()
    print(f"Best Model : {model_fold_name.split('_')[0]}, Best Fold : {model_fold_name.split('_')[1]}")

breaker()


**************************************************

lgr, 1

Accuracy  : 0.72379
ROC-AUC   : 0.72554
Precision : [0.7674585  0.68026241]
Recall    : [0.70519395 0.74589523]
F-Score   : [0.73500994 0.71156858]

lgr, 2

Accuracy  : 0.71893
ROC-AUC   : 0.72053
Precision : [0.75963751 0.67877412]
Recall    : [0.69993373 0.74113091]
F-Score   : [0.72856453 0.70858328]

lgr, 3

Accuracy  : 0.72079
ROC-AUC   : 0.72229
Precision : [0.76044808 0.68154043]
Recall    : [0.70262739 0.74195545]
F-Score   : [0.7303952  0.71046589]

lgr, 4

Accuracy  : 0.71950
ROC-AUC   : 0.72045
Precision : [0.75871276 0.67896572]
Recall    : [0.70955447 0.73134095]
F-Score   : [0.7333107  0.70418079]

lgr, 5

Accuracy  : 0.71986
ROC-AUC   : 0.72219
Precision : [0.77120823 0.66847671]
Recall    : [0.69948187 0.74490446]
F-Score   : [0.73359598 0.70462419]


**************************************************

knc, 1

Accuracy  : 0.64857
ROC-AUC   : 0.64872
Precision : [0.66327991 0.63391329]
Recall    : [0.64357123 0