# Final Model Selection
In this notebook I compare the three models developed in the previous notebooks:
1. Logistic Regression
2. Random Forest
3. XGBoost

I will train each model with the best hyperparameters selected in the previous notebooks and choose the one with the best overall performance.

# Data Preparation

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle as pkl

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

from sklearn.metrics import roc_auc_score

In [3]:
# Load the cleaned heart-disease data set and normalise the column names to lower case.
df = pd.read_csv('../datasets/heart_disease/heart_2020_cleaned.csv')
df.columns = df.columns.str.lower()

# Group the columns by dtype. The target ('heartdisease') is an object column too,
# so it is removed from the categorical feature list.
column_dtypes = df.dtypes
numerical = column_dtypes[column_dtypes == 'float'].index.tolist()
categorical = column_dtypes[column_dtypes == 'object'].index.tolist()
categorical.remove('heartdisease')

# Lower-case every string value: all categorical features plus the target.
for text_column in [*categorical, 'heartdisease']:
    df[text_column] = df[text_column].str.lower()


# 20% of the rows are held out for testing; the remaining 80% is split 75/25,
# which gives 60% train / 20% validation / 20% test overall.
df_full_train, df_test = train_test_split(df, test_size=0.20, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)


def _binary_target(frame):
    """Return a 0/1 integer array that is 1 where 'heartdisease' equals 'yes'."""
    return (frame['heartdisease'] == 'yes').astype('int').values


# Targets must be extracted before the column is dropped from the feature frames.
y_train, y_val, y_test = (_binary_target(part) for part in (df_train, df_val, df_test))

# Remove the target from the feature frames and renumber their rows from zero.
df_train = df_train.drop(columns='heartdisease').reset_index(drop=True)
df_val = df_val.drop(columns='heartdisease').reset_index(drop=True)
df_test = df_test.drop(columns='heartdisease').reset_index(drop=True)

# df_full_train keeps the target column; only its index is reset.
df_full_train = df_full_train.reset_index(drop=True)


# Fit the vectorizer on the training records only, then reuse the fitted mapping
# for the validation and test records.
dv = DictVectorizer(sparse=False)

train_records = df_train.to_dict(orient='records')
val_records = df_val.to_dict(orient='records')
test_records = df_test.to_dict(orient='records')

X_train = dv.fit_transform(train_records)
X_val = dv.transform(val_records)
X_test = dv.transform(test_records)

## 1. Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

In [8]:
# Logistic regression with the regularisation strength (C) picked in the earlier
# tuning notebook; random_state is fixed for reproducibility.
lr = LogisticRegression(C=0.1, max_iter=10000, random_state=1)
lr.fit(X_train, y_train)

# Column 1 of predict_proba holds the predicted probability of the positive class.
# NOTE(review): the comparison is scored on the test split; X_val / y_val are not
# used anywhere in this notebook - confirm that this is intended.
y_pred = lr.predict_proba(X_test)[:, 1]

roc_auc_score(y_true=y_test, y_score=y_pred)

0.8431243906754589

## 2. Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Random-forest hyper-parameters selected in the earlier tuning notebook.
best_estimator = 300          # number of trees
best_depth = 15               # maximum depth of each tree
best_min_samples_leaf = 4     # minimum number of samples per leaf

rf_settings = {
    'n_estimators': best_estimator,
    'max_depth': best_depth,
    'min_samples_leaf': best_min_samples_leaf,
    'random_state': 1,
}

rf = RandomForestClassifier(**rf_settings)
rf.fit(X_train, y_train)

# Score the test split with the positive-class probability (column 1).
y_pred = rf.predict_proba(X_test)[:, 1]

roc_auc_score(y_true=y_test, y_score=y_pred)

0.8406213565558842

## 3. XGBoost

In [9]:
import xgboost as xgb

In [10]:
# Column names produced by the fitted DictVectorizer, in the same order as the
# columns of X_train / X_test.
feature_names = list(dv.get_feature_names_out())

# Wrap the train and test matrices (with their labels) in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)

In [11]:
# XGBoost hyper-parameters selected in the earlier tuning notebook.
best_eta = 0.01               # learning rate ('eta')
best_max_depth = 6            # maximum tree depth
best_min_child_weights = 1    # passed to XGBoost as 'min_child_weight'

In [12]:
# Booster configuration: the tuned values first, then the fixed task settings.
xgb_params = dict(
    eta=best_eta,
    max_depth=best_max_depth,
    min_child_weight=best_min_child_weights,

    objective='binary:logistic',
    nthread=8,
    eval_metric='auc',

    seed=1,
    verbosity=1,
)

# Train for a fixed 1000 boosting rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=1000)

In [13]:
# With the 'binary:logistic' objective the booster returns positive-class probabilities.
y_pred = model.predict(data=dtest)

In [14]:
# ROC AUC of the XGBoost predictions on the test split.
roc_auc_score(y_true=y_test, y_score=y_pred)

0.8456598354472225

# 4. Save Models

In [15]:
# Pickle the fitted DictVectorizer to 'dict_vectorizer.bin' in the working directory.
with open('dict_vectorizer.bin', mode='wb') as out_file:
    pkl.dump(dv, out_file)

In [16]:
# Pickle the fitted logistic regression model to 'logistic_regression.bin'.
with open('logistic_regression.bin', mode='wb') as out_file:
    pkl.dump(lr, out_file)

In [17]:
# Pickle the fitted random forest model. The file name keeps its original
# spelling so that anything loading 'random_forrest.bin' keeps working.
with open('random_forrest.bin', mode='wb') as out_file:
    pkl.dump(rf, out_file)

In [18]:
# Pickle the trained XGBoost booster to 'xgboost.bin'.
# NOTE(review): XGBoost's model I/O guide recommends Booster.save_model for
# long-term storage, because a pickle is tied to the library version that wrote
# it - consider switching if this file must outlive the current environment.
with open('xgboost.bin', mode='wb') as out_file:
    pkl.dump(model, out_file)

## Conclusion
Based on the results, all three models achieved almost the same AUC on the test set (roughly 0.84–0.85). I therefore decided to use the logistic regression model, because it is simpler to explain and makes the dependencies between variables easier to understand.