In [33]:
# Import necessary libraries
import kagglehub
import os 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import recall_score

In [12]:
# Download the data
# Download the latest version 
path = kagglehub.dataset_download('johnsmith88/heart-disease-dataset')
print('Path to dataset files: ', path)

# List all files in the dataset directory
files = os.listdir(path)
print('Dataset files: ', files)

# Load the CSV file
csv_file = os.path.join(path, 'heart.csv')
df = pd.read_csv(csv_file)

Path to dataset files:  /Users/joowanlim/.cache/kagglehub/datasets/johnsmith88/heart-disease-dataset/versions/2
Dataset files:  ['heart.csv']


In [13]:
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [16]:
X, y = df.drop('target', axis=1), df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

### Scale-Insensitive

In [19]:
forest = RandomForestClassifier()
forest.fit(X_train, y_train)

In [20]:
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

In [21]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

### Scale-Sensitive

In [24]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [25]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

In [28]:
log = LogisticRegression()
log.fit(X_train_scaled, y_train)

In [29]:
svc = SVC()
svc.fit(X_train_scaled, y_train)

In [32]:
forest_accuracy = forest.score(X_test, y_test)
nb_clf_accuracy = nb_clf.score(X_test, y_test)
gb_clf_accuracy = gb_clf.score(X_test, y_test)
knn_accuracy = knn.score(X_test_scaled, y_test)
log_accuracy = svc.score(X_test_scaled, y_test)
svc_accuracy = svc.score(X_test_scaled, y_test)

print(f"Random Forest Classifier Accuracy: {forest_accuracy:.4f}")
print(f"Naive Bayes Classifier Accuracy: {nb_clf_accuracy:.4f}")
print(f"Gradient Boosting Classifier Accuracy: {gb_clf_accuracy:.4f}")
print(f"K-Neirghest Neigbhors Classifier Accuracy: {knn_accuracy:.4f}")
print(f"Logistic Regression Classifier Accuracy: {log_accuracy:.4f}")
print(f"Support Vector Classifier Accuracy: {svc_accuracy:.4f}")

Random Forest Classifier Accuracy: 0.9854
Naive Bayes Classifier Accuracy: 0.8146
Gradient Boosting Classifier Accuracy: 0.9610
K-Neirghest Neigbhors Classifier Accuracy: 0.8463
Logistic Regression Classifier Accuracy: 0.9244
Support Vector Classifier Accuracy: 0.9244


In [38]:
forest_y_preds = forest.predict(X_test)
nb_clf_y_preds = nb_clf.predict(X_test)
gb_clf_y_preds = gb_clf.predict(X_test)
knn_y_preds = knn.predict(X_test_scaled)
log_y_preds = log.predict(X_test_scaled)
svc_y_preds = svc.predict(X_test_scaled)

print(f"Random Forest Classifier Recall: {recall_score(y_test, forest_y_preds):.4f}")
print(f"Naive Bayes Classifier Recall: {recall_score(y_test, nb_clf_y_preds):.4f}")
print(f"Gradient Boosting Classifier Recall: {recall_score(y_test, gb_clf_y_preds):.4f}")
print(f"K-Neirghest Neigbhors Classifier Recall: {recall_score(y_test, knn_y_preds):.4f}")
print(f"Logistic Regression Classifier Recall: {recall_score(y_test, log_y_preds):.4f}")
print(f"Support Vector Classifier Recall: {recall_score(y_test, svc_y_preds):.4f}")

Random Forest Classifier Recall: 0.9712
Naive Bayes Classifier Recall: 0.8606
Gradient Boosting Classifier Recall: 0.9663
K-Neirghest Neigbhors Classifier Recall: 0.8702
Logistic Regression Classifier Recall: 0.8365
Support Vector Classifier Recall: 0.9423
