In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv('Heart_Disease_Dataset.csv')
df = pd.DataFrame(data)
df.head()

Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope,target
0,40,1,2,140,289,0,0,172,0,0.0,1,0
1,49,0,3,160,180,0,0,156,0,1.0,2,1
2,37,1,2,130,283,0,1,98,0,0.0,1,0
3,48,0,4,138,214,0,0,108,1,1.5,2,1
4,54,1,3,150,195,0,0,122,0,0.0,1,0


In [3]:
from sklearn.model_selection import train_test_split
y = df['target']
X = df.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_test.head()


Unnamed: 0,age,sex,chest pain type,resting bp s,cholesterol,fasting blood sugar,resting ecg,max heart rate,exercise angina,oldpeak,ST slope
693,45,1,4,104,208,0,2,148,1,3.0,2
302,55,1,2,140,0,0,1,150,0,0.2,1
153,55,1,4,120,270,0,0,140,0,0.0,1
721,63,1,4,130,254,0,2,147,0,1.4,2
941,60,1,4,130,253,0,0,144,1,1.4,1


In [4]:
from sklearn.preprocessing import StandardScaler
numerical_cols = ['age', 'resting bp s', 'cholesterol', 'max heart rate', 'oldpeak'] 
scalar = StandardScaler()
X_train[numerical_cols] = scalar.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scalar.transform(X_test[numerical_cols])


In [5]:
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

print(list(X_test.columns))

['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope']


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
logreg_model = LogisticRegression(class_weight='balanced')

logreg_model.fit(X_train_np, y_train_np)
logreg_prediction = logreg_model.predict(X_test_np)

print(confusion_matrix(y_test_np, logreg_prediction))
print(classification_report(y_test_np, logreg_prediction))

[[145  25]
 [ 31 156]]
              precision    recall  f1-score   support

           0       0.82      0.85      0.84       170
           1       0.86      0.83      0.85       187

    accuracy                           0.84       357
   macro avg       0.84      0.84      0.84       357
weighted avg       0.84      0.84      0.84       357



In [7]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train_np, y_train_np)
knn_prediction = knn_model.predict(X_test_np)

print(confusion_matrix(y_test_np, knn_prediction))
print(classification_report(y_test_np, knn_prediction))

[[139  31]
 [ 19 168]]
              precision    recall  f1-score   support

           0       0.88      0.82      0.85       170
           1       0.84      0.90      0.87       187

    accuracy                           0.86       357
   macro avg       0.86      0.86      0.86       357
weighted avg       0.86      0.86      0.86       357



In [8]:
# some hyperparameter tuning for KNN
from sklearn.model_selection import GridSearchCV
import math

parameter_grid = {
    'n_neighbors' : range(int(math.sqrt(len(X_train_np)) - 5), int(math.sqrt(len(X_train_np)) + 5)),
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2]
}

grid_search = GridSearchCV(knn_model, parameter_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_np, y_train_np)

# best n is 1 : figured that out anyways using the classification report but this confirms


In [9]:
print("Best parameters: ", grid_search.best_params_)
best_model = grid_search.best_estimator_


Best parameters:  {'n_neighbors': 30, 'p': 1, 'weights': 'distance'}


In [10]:
improved_knn = best_model
improved_knn.fit(X_train_np, y_train_np )
improved_knn_prediction = improved_knn.predict(X_test_np)

print(confusion_matrix(y_test_np, improved_knn_prediction))
print(classification_report(y_test_np, improved_knn_prediction))



[[152  18]
 [ 18 169]]
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       170
           1       0.90      0.90      0.90       187

    accuracy                           0.90       357
   macro avg       0.90      0.90      0.90       357
weighted avg       0.90      0.90      0.90       357



In [14]:
from sklearn.svm import SVC

hard_svm =SVC(kernel="linear", C=1000)
hard_svm.fit(X_train_np, y_train_np)
hard_svm_prediction = hard_svm.predict(X_test_np)

print(confusion_matrix(y_test_np, hard_svm_prediction))
print(classification_report(y_test_np, hard_svm_prediction))

# SVM should be perfect for this problem given it is a binary classification problem 

[[141  29]
 [ 27 160]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       170
           1       0.85      0.86      0.85       187

    accuracy                           0.84       357
   macro avg       0.84      0.84      0.84       357
weighted avg       0.84      0.84      0.84       357



In [15]:
from sklearn.svm import SVC

soft_svm = SVC(kernel="linear", C=0.01)
soft_svm.fit(X_train_np, y_train_np)

soft_svm_prediction = soft_svm.predict(X_test_np)

print(confusion_matrix(y_test_np, soft_svm_prediction))
print(classification_report(y_test_np, soft_svm_prediction))


[[141  29]
 [ 30 157]]
              precision    recall  f1-score   support

           0       0.82      0.83      0.83       170
           1       0.84      0.84      0.84       187

    accuracy                           0.83       357
   macro avg       0.83      0.83      0.83       357
weighted avg       0.83      0.83      0.83       357



In [16]:
from sklearn.tree import DecisionTreeClassifier
# use gini entropy for loss
d_tree = DecisionTreeClassifier(criterion="gini")
d_tree.fit(X_train_np, y_train_np)

d_tree_prediction = d_tree.predict(X_test_np)

print(confusion_matrix(y_test_np, d_tree_prediction))
print(classification_report(y_test_np, d_tree_prediction))

[[145  25]
 [ 34 153]]
              precision    recall  f1-score   support

           0       0.81      0.85      0.83       170
           1       0.86      0.82      0.84       187

    accuracy                           0.83       357
   macro avg       0.83      0.84      0.83       357
weighted avg       0.84      0.83      0.83       357



In [17]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

rf_model.fit(X_train, y_train)
rf_prediction = rf_model.predict(X_test_np)


print(confusion_matrix(y_test_np, rf_prediction))
print(classification_report(y_test_np, rf_prediction))

[[155  15]
 [  8 179]]
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       170
           1       0.92      0.96      0.94       187

    accuracy                           0.94       357
   macro avg       0.94      0.93      0.94       357
weighted avg       0.94      0.94      0.94       357





In [18]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_prediction = nb_model.predict(X_test_np)

print(confusion_matrix(y_test_np, nb_prediction))
print(classification_report(y_test_np, nb_prediction))

[[146  24]
 [ 32 155]]
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       170
           1       0.87      0.83      0.85       187

    accuracy                           0.84       357
   macro avg       0.84      0.84      0.84       357
weighted avg       0.84      0.84      0.84       357





In [19]:
from mlxtend.classifier import StackingCVClassifier
stacked_model = StackingCVClassifier(classifiers=[logreg_model, improved_knn, hard_svm, soft_svm, d_tree, rf_model, nb_model],
                                     meta_classifier=LogisticRegression(class_weight='balanced'), cv=3)

stacked_model.fit(X_train_np, y_train_np)

stacked_model_prediction = stacked_model.predict(X_test_np)
print(stacked_model_prediction)

print(confusion_matrix(y_test_np, stacked_model_prediction))
print(classification_report(y_test_np, stacked_model_prediction))

# expected to be perfect

[1 0 0 1 1 1 0 0 1 1 1 0 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0
 0 0 1 0 1 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 0
 1 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 0 1 0
 0 1 0 1 1 0 1 1 0 1 1 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 1 0 1 0 1 1 0 0 0 1
 1 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 0 0 1 0 1 1 1 1 1 1 0 1
 0 1 0 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 0 1 0 0 1 0
 1 0 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 1 1 0 1 0 0 0
 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 1 1 1 1 1 1 0 1 0 0 0 0 1 1 0 0
 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 1 0 0 0
 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 1 1 0 0]
[[156  14]
 [ 14 173]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       170
           1       0.93      0.93      0.93       187

    accuracy                           0.92       357
   macro avg       0.92      0.92 

In [22]:
from sklearn.metrics import roc_curve, accuracy_score

false_positive, true_positive, thresholds = roc_curve(y_test_np, stacked_model_prediction)
print("false positive: ", false_positive, "true_positive: ", true_positive, "thresholds: ", thresholds)
print(list(X_test.columns))

print("Accuracy: ", accuracy_score(y_test, stacked_model_prediction)*100, "%")

false positive:  [0.         0.08235294 1.        ] true_positive:  [0.         0.92513369 1.        ] thresholds:  [inf  1.  0.]
['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope']
Accuracy:  0.9215686274509803 %


In [21]:
from joblib import dump
dump(stacked_model, "check_my_heart_model.joblib")


['check_my_heart_model.joblib']