# Extra: Model Comparison
## Minh Nguyen

In [397]:
# Data Cleaning and Plotting Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Data Analysis Libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import LeaveOneOut, KFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, log_loss, roc_auc_score

# Sets parameters for the entire notebook for Seaborn/Matplotlib plots
# rcParams['figure.figsize'] = 10, 7
# rcParams.update({'font.size': 12})
# sns.set_style('darkgrid')

In [398]:
df = pd.read_csv('heart_disease.csv')
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,death
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [399]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  death                     299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [400]:
df["death"].value_counts()

0    203
1     96
Name: death, dtype: int64

## Use Logistic Regression model

In [401]:
X=df.drop('death', axis=1)
y=df['death']

mm_scaler = MinMaxScaler()

X_mm = pd.DataFrame(mm_scaler.fit_transform(X), columns=X.columns)

train_x, test_x, train_y, test_y = train_test_split(X_mm, y, test_size=0.2, stratify=y)

train_x.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time
289,0.909091,1.0,0.040061,0.0,0.363636,0.0,0.442357,0.044944,0.885714,0.0,0.0,0.896797
144,0.581818,1.0,0.117377,0.0,0.166667,1.0,0.379319,0.134831,0.742857,1.0,1.0,0.380783
18,0.545455,1.0,0.013014,0.0,0.166667,1.0,0.25688,0.05618,0.771429,0.0,0.0,0.039146
201,0.090909,0.0,0.036361,1.0,0.69697,1.0,0.426597,0.05618,0.657143,1.0,0.0,0.647687
46,0.2,0.0,0.173131,0.0,0.166667,1.0,0.298097,0.044944,0.485714,1.0,0.0,0.120996


In [402]:
lr_model = LogisticRegression(max_iter=1000)

lr_model.fit(train_x, train_y)
pred = lr_model.predict(test_x)

In [403]:
print("Accuracy: ", accuracy_score(test_y, pred))
print("Confusion Matrix: \n", confusion_matrix(test_y, pred))

Accuracy:  0.7833333333333333
Confusion Matrix: 
 [[38  3]
 [10  9]]


- As we can see, the model is pretty good.

## Apply LOOCV

In [404]:
loo = LeaveOneOut()
# loo.get_n_splits(train_x)
loo.get_n_splits(X_mm)

299

In [405]:
accuracy = []
neg_log_loss = []
roc_auc = []

for train_index, test_index in loo.split(X_mm):
    # print("train:", train_index, "validation:", test_index)
    X_train, X_test = X_mm.values[train_index], X_mm.values[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # lr_model1 = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)
    # y_pred_prob = lr_model.predict_proba(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    # neg_log_loss.append(log_loss(y_test, y_pred_prob))
    # roc_auc.append(roc_auc_score(y_test, y_pred_prob))

# calculate the mean of the metrics
accuracy_mean = np.mean(accuracy)
# neg_log_loss_mean = np.mean(neg_log_loss)
# roc_auc_mean = np.mean(roc_auc)

print(f"Accuracy: {accuracy_mean}")
# print(f"Neg Log Loss: {neg_log_loss_mean}")
# print(f"ROC AUC: {roc_auc_mean}")

Accuracy: 0.8294314381270903


## Apply 10 and 3 K-Fold

### k = 10

In [406]:
# kf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=None)
kf = KFold(n_splits=10, shuffle=True)

kf_accuracies_10 = []

for train_index, test_index in kf.split(X_mm):
    # print("Train:", train_index, "Validation:",test_index)
    X_train, X_test = X_mm.values[train_index], X_mm.values[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)
    kf_accuracies_10.append(accuracy_score(y_test, y_pred))
    
mean_accuracy_10 = np.mean(kf_accuracies_10)

print(f"Accuracy: {mean_accuracy_10}")

Accuracy: 0.8193103448275864


In [407]:
kf = KFold(n_splits=3, shuffle=True)

kf_accuracies_3 = []

for train_index, test_index in kf.split(X_mm):
    # print("Train:", train_index, "Validation:",test_index)
    X_train, X_test = X_mm.values[train_index], X_mm.values[test_index] 
    y_train, y_test = y[train_index], y[test_index]
    
    lr_model.fit(X_train, y_train)
    y_pred = lr_model.predict(X_test)
    kf_accuracies_3.append(accuracy_score(y_test, y_pred))
    
mean_accuracy_3 = np.mean(kf_accuracies_3)

print(f"Accuracy: {mean_accuracy_3}")

Accuracy: 0.8227946127946127
