# Heart Disease Classification in Production

<img src="https://www.torrancememorial.org/app/files/public/22094243-8409-4a71-b75b-78a218ef7eb3/Content%20Hub/health%20and%20wellness/cutting%20edge%20of%20heart%20disease.png">

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from loguru import logger
import xgboost as xgb
from sklearn.pipeline import Pipeline

In [None]:
# Let pandas render up to 30 columns before it starts eliding wide frames.
pd.options.display.max_columns = 30

# **Functions**

In [None]:
def load_data(file_path):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    file_path
        Forwarded unchanged as the first argument of ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, using the ``read_csv`` defaults.
    """
    return pd.read_csv(file_path)


def clean_data(df):
    """Return a cleaned copy of ``df``.

    Cleaning steps, in order:

    1. drop rows that are exact duplicates of an earlier row,
    2. drop rows that contain at least one missing value,
    3. renumber the index from 0 (the old index is discarded).

    The input frame is not modified, so re-running a cell that calls this
    function always starts from the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame without duplicate rows or missing values.
    """
    deduplicated = df.drop_duplicates()
    complete_rows = deduplicated.dropna()
    return complete_rows.reset_index(drop=True)


def split_data(df, test_size=None, random_state=380, stratify=False):
    """Split ``df`` into standardized train/test feature arrays and label arrays.

    The last column of ``df`` is used as the target; every other column is a
    feature. The scaler is fitted on the training features only and then
    applied to the test features, so no test-set statistics leak into training.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data with the target in the last column.
    test_size : float, int or None, default None
        Forwarded to ``train_test_split``; ``None`` keeps that function's
        default split size.
    random_state : int or None, default 380
        Seed forwarded to ``train_test_split``.
    stratify : bool, default False
        When true, the split is stratified on the target values.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature arrays
        standardized.
    """
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Fit on the training split only; the test split reuses those statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test


def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    The estimator is fitted in place (no copy is made); the return value of
    ``fit`` itself is ignored and ``model`` is returned.
    """
    model.fit(X_train, y_train)
    return model


def _final_estimator_name(fitted_model):
    """Return the class name of the estimator at the end of ``fitted_model``.

    Unwraps, when present, a search object's ``best_estimator_`` and then a
    pipeline's last ``steps`` entry; anything else is named directly.
    """
    inner = getattr(fitted_model, "best_estimator_", fitted_model)
    steps = getattr(inner, "steps", None)
    if steps:
        # Each step is a (name, estimator) pair; the last one makes predictions.
        inner = steps[-1][-1]
    return type(inner).__name__


def model_evaluation(pre_train_model ,X_test, y_test, return_df=True, pipline=False):
    """Score a fitted model on the test split.

    Parameters
    ----------
    pre_train_model
        Fitted object exposing ``predict``.
    X_test, y_test
        Held-out features and true labels.
    return_df : bool, default True
        If true, return a one-row DataFrame with a leading ``model`` column;
        otherwise return the plain metrics dict.
    pipline : bool, default False
        If true, the ``model`` column names the final estimator inside a
        search object and/or pipeline instead of the wrapper itself.

    Returns
    -------
    pandas.DataFrame or dict
        Accuracy, F1, precision and recall, computed with the scikit-learn
        defaults for each metric.
    """
    y_pred = pre_train_model.predict(X_test)

    metrics_dict = {
        "accuracy": metrics.accuracy_score(y_test, y_pred),
        "f1": metrics.f1_score(y_test, y_pred),
        "precision": metrics.precision_score(y_test, y_pred),
        "recall": metrics.recall_score(y_test, y_pred),
    }

    if not return_df:
        return metrics_dict

    if pipline:
        model_name = _final_estimator_name(pre_train_model)
    else:
        model_name = type(pre_train_model).__name__

    metrics_df = pd.DataFrame([metrics_dict])
    metrics_df.insert(0, "model", model_name)
    return metrics_df


In [None]:
# df

In [None]:
def model_comaparison(model, file_path='/content/drive/MyDrive/Colab Notebooks/Filoger/tamrin_quera/ML/Classification/heart.csv'):
    """Run the full baseline workflow for one model and return its test metrics.

    The CSV is loaded, cleaned and split on every call, then ``model`` is
    fitted on the training split and scored on the test split.

    Parameters
    ----------
    model
        Unfitted estimator; it is fitted in place by ``train_model``.
    file_path : str, optional
        Location of the heart-disease CSV. Defaults to the notebook's
        original Colab/Drive path so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One-row metrics frame produced by ``model_evaluation``.
    """
    df = clean_data(load_data(file_path))
    X_train, X_test, y_train, y_test = split_data(df)
    fitted_model = train_model(model, X_train, y_train)
    return model_evaluation(fitted_model, X_test, y_test, return_df=True)

In [None]:
# Baseline candidates with default hyperparameters.
# Estimators that use randomness are seeded so a fresh "Run All" gives the
# same comparison table; 42 matches the seed used in the tuning pipelines.
models = [
    KNeighborsClassifier(),
    AdaBoostClassifier(random_state=42),
    SVC(),
    DecisionTreeClassifier(random_state=42),
    xgb.XGBClassifier(random_state=42),
    RandomForestClassifier(random_state=42)
]

In [23]:
# Stack the one-row metric frames of every baseline model and order them by model name.
m_df = pd.concat(
    [model_comaparison(candidate) for candidate in models],
    ignore_index=True,
)
m_df = m_df.sort_values(by="model")
m_df.style.background_gradient(cmap='coolwarm')

Unnamed: 0,model,accuracy,f1,precision,recall
1,AdaBoostClassifier,0.789474,0.829787,0.795918,0.866667
3,DecisionTreeClassifier,0.697368,0.741573,0.75,0.733333
0,KNeighborsClassifier,0.815789,0.854167,0.803922,0.911111
5,RandomForestClassifier,0.789474,0.829787,0.795918,0.866667
2,SVC,0.828947,0.865979,0.807692,0.933333
4,XGBClassifier,0.802632,0.83871,0.8125,0.866667


# Hyperparameter Tuning

In [27]:
# Data for the tuning section: load and clean the CSV, then split it.
# Note: split_data already standardizes the features, and the pipelines in
# this section standardize them again.
df = clean_data(
    load_data('/content/drive/MyDrive/Colab Notebooks/Filoger/tamrin_quera/ML/Classification/heart.csv')
)
X_train, X_test, y_train, y_test = split_data(df)

In [28]:
# One two-step pipeline per candidate model: standardize ('scl'), then classify.
# The classifier step names (LR, DT, RF, KNN, SVM, XGB) are the prefixes used
# by the parameter-grid keys.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('LR', LogisticRegression(random_state=42)),
])

pipe_dt = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('DT', DecisionTreeClassifier(random_state=42)),
])

pipe_rf = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('RF', RandomForestClassifier(random_state=42)),
])

pipe_knn = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('KNN', KNeighborsClassifier()),
])

pipe_svm = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('SVM', SVC(random_state=42)),
])

pipe_xgb = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('XGB', xgb.XGBClassifier(random_state=42)),
])

In [29]:
# Candidate values shared by several of the grids below.
param_range = list(range(1, 7))
param_range_fl = [1.0, 0.5, 0.1]
n_estimators = [50, 100, 150]
learning_rates = [0.1, 0.2, 0.3]

# Each grid is a one-element list of dicts whose keys follow the
# "<pipeline step name>__<parameter>" convention.

# Logistic regression: penalty type and regularization strength.
lr_param_grid = [
    {
        'LR__penalty': ['l1', 'l2'],
        'LR__C': param_range_fl,
        'LR__solver': ['liblinear'],
    }
]

# Decision tree: split criterion and size limits.
dt_param_grid = [
    {
        'DT__criterion': ['gini', 'entropy'],
        'DT__min_samples_leaf': param_range,
        'DT__max_depth': param_range,
        # Starts at 2: the leading 1 of param_range is skipped.
        'DT__min_samples_split': param_range[1:],
    }
]

# Random forest: same size limits as the single tree.
rf_param_grid = [
    {
        'RF__min_samples_leaf': param_range,
        'RF__max_depth': param_range,
        'RF__min_samples_split': param_range[1:],
    }
]

# k-nearest neighbours: neighbourhood size, vote weighting and distance metric.
knn_param_grid = [
    {
        'KNN__n_neighbors': param_range,
        'KNN__weights': ['uniform', 'distance'],
        'KNN__metric': ['euclidean', 'manhattan'],
    }
]

# Support vector machine: kernel and C.
svm_param_grid = [
    {
        'SVM__kernel': ['linear', 'rbf'],
        'SVM__C': param_range,
    }
]

# XGBoost: the largest grid (3 * 6 * 2 * 3 * 3 = 324 combinations).
xgb_param_grid = [
    {
        'XGB__learning_rate': learning_rates,
        'XGB__max_depth': param_range,
        'XGB__min_child_weight': param_range[:2],
        'XGB__subsample': param_range_fl,
        'XGB__n_estimators': n_estimators,
    }
]

In [39]:
# One exhaustive search per pipeline, all configured identically:
# candidates are ranked by F1 using cv=3.
lr_grid_search = GridSearchCV(pipe_lr, lr_param_grid, scoring='f1', cv=3)

dt_grid_search = GridSearchCV(pipe_dt, dt_param_grid, scoring='f1', cv=3)

rf_grid_search = GridSearchCV(pipe_rf, rf_param_grid, scoring='f1', cv=3)

knn_grid_search = GridSearchCV(pipe_knn, knn_param_grid, scoring='f1', cv=3)

svm_grid_search = GridSearchCV(pipe_svm, svm_param_grid, scoring='f1', cv=3)

xgb_grid_search = GridSearchCV(pipe_xgb, xgb_param_grid, scoring='f1', cv=3)

In [40]:
# Keep this order: later cells label the results by position in the list.
grids = [
    lr_grid_search, dt_grid_search, rf_grid_search,
    knn_grid_search, svm_grid_search, xgb_grid_search,
]

# Fit every search on the training split.
for grid_search in grids:
    grid_search.fit(X_train, y_train)

In [37]:
# Display names, keyed by each search's position in `grids`.
grid_dict = {0: 'Logistic Regression', 1: 'Decision Trees',
             2: 'Random Forest', 3: 'K-Nearest Neighbors',
             4: 'Support Vector Machines', 5: 'XGBoost'}
for i, model in enumerate(grids):
    # The searches were built with scoring='f1', so model.score() would report
    # F1 here. Compute accuracy explicitly so the printed label is truthful.
    print('{} Test Accuracy: {}'.format(
        grid_dict[i], metrics.accuracy_score(y_test, model.predict(X_test))))
    print('{} Best Params: {}'.format(grid_dict[i], model.best_params_))
    print('-'*100)

Logistic Regression Test Accuracy: 0.8289473684210527
Logistic Regression Best Params: {'LR__C': 0.1, 'LR__penalty': 'l2', 'LR__solver': 'liblinear'}
----------------------------------------------------------------------------------------------------
Decision Trees Test Accuracy: 0.7631578947368421
Decision Trees Best Params: {'DT__criterion': 'gini', 'DT__max_depth': 5, 'DT__min_samples_leaf': 5, 'DT__min_samples_split': 2}
----------------------------------------------------------------------------------------------------
Random Forest Test Accuracy: 0.8157894736842105
Random Forest Best Params: {'RF__max_depth': 2, 'RF__min_samples_leaf': 4, 'RF__min_samples_split': 2}
----------------------------------------------------------------------------------------------------
K-Nearest Neighbors Test Accuracy: 0.7894736842105263
K-Nearest Neighbors Best Params: {'KNN__metric': 'manhattan', 'KNN__n_neighbors': 4, 'KNN__weights': 'distance'}
---------------------------------------------------

In [38]:
# One metrics row per tuned search; pipline=True labels each row with the
# classifier at the end of the best pipeline rather than with "GridSearchCV".
pd.concat(
    [
        model_evaluation(model, X_test, y_test, return_df=True, pipline=True)
        for model in grids
    ],
    ignore_index=True,
).style.background_gradient(cmap='coolwarm')

Unnamed: 0,model,accuracy,f1,precision,recall
0,LogisticRegression,0.828947,0.868687,0.796296,0.955556
1,DecisionTreeClassifier,0.763158,0.804348,0.787234,0.822222
2,RandomForestClassifier,0.815789,0.857143,0.792453,0.933333
3,KNeighborsClassifier,0.789474,0.833333,0.784314,0.888889
4,SVC,0.868421,0.893617,0.857143,0.933333
5,XGBClassifier,0.815789,0.854167,0.803922,0.911111


In [None]:
# type(grids[0].best_estimator_.steps[-1][-1]).__name__