In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import LabelEncoder

# Load the heart-disease survey data from the CSV file next to the notebook.
dataset = pd.read_csv('heart_2020_cleaned.csv')

# number of data in the dataset
print("Number of data in the dataset: ", len(dataset.index))

# Preview the first rows. Jupyter only renders the LAST expression of a cell,
# so the preview has to come after the print call to be displayed at all.
dataset.head()

In [None]:
# Remove exact duplicate rows (when present) and report the remaining row count.
duplicate_rows_present = dataset.duplicated().any()
if not duplicate_rows_present:
    print("No duplicates")
else:
    dataset.drop_duplicates(inplace=True)
    print("Number of data after removing duplicates: ", dataset.shape[0])

In [None]:
# Remove rows containing missing values (when present) and report what was found.
if dataset.isnull().values.any():
    print(dataset.isnull().sum())    # per-column count of missing values
    # dropna() returns a NEW frame and leaves the original untouched, so the
    # result must be bound back to `dataset` for the rows to actually go away.
    dataset = dataset.dropna()
    print("Number of data after removing null values: ", dataset.shape[0])
else:
    print("No null values")

In [None]:
# Class balance of the target column: number of rows per HeartDisease value.
no_of_target_data = dataset.HeartDisease.value_counts()
print(no_of_target_data)

In [None]:
# BMI, sleep time, MentalHealth and PhysicalHealth are deliberately not encoded:
# they are already numeric (integers / floats).
#
# Every remaining column is replaced, in place, by integer codes. LabelEncoder
# numbers the distinct values of a column in sorted order, and the single
# encoder instance is simply re-fitted for each column in turn.
label_encoder = LabelEncoder()

categorical_columns = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'Sex',
    'AgeCategory',
    'Race',
    'Diabetic',
    'PhysicalActivity',
    'GenHealth',
    'Asthma',
    'KidneyDisease',
    'SkinCancer',
]

for column_name in categorical_columns:
    dataset[column_name] = label_encoder.fit_transform(dataset[column_name])

In [None]:
# Show the first five rows so the integer-coded columns can be inspected.
print("Dataset after encoding the labels")
dataset.head(n=5)

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Correlation matrix of all (now numeric) columns, drawn as an annotated heatmap
correlation_matrix = dataset.corr()
corr_fig, corr_ax = plt.subplots(figsize=(13, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='Reds', ax=corr_ax)

In [None]:
# Separate the target column from the predictors; both are plain NumPy arrays.
y = dataset["HeartDisease"].values                      # label
x = dataset.drop(columns=["HeartDisease"]).values       # features: every other column

In [None]:
# data scaling (currently disabled)
# robust_scaler = RobustScaler()

# x_train = robust_scaler.fit_transform(x_train)
# x_test = robust_scaler.transform(x_test)   # reuse the scaler fitted on the training data; do not refit on test data

Handling the Imbalanced Dataset with SMOTE and Training Classifiers (Random Forest and Others)

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter

counter_before = Counter(y)
print("Before oversampling: No. of data = ", counter_before)

smote = SMOTE(random_state=42)
x1, y1 = smote.fit_resample(x, y)

counter_after = Counter(y1)
print("After oversampling: No. of data = ", counter_after)

In [None]:
# splitting the oversampled dataset
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.20, random_state=42)

print("Total number of data: ", x1.shape[0])
print("Total number of train data: ", x1_train.shape[0])
print("Total number of test data: ", x1_test.shape[0])

Decision Tree

In [None]:
from sklearn import tree
# Decision tree baseline: fit on the training split, predict the test split.
# random_state is fixed because the tree breaks ties between equally good
# splits at random; without a seed the scores change from run to run.
d_tree1 = tree.DecisionTreeClassifier(random_state=42)
d_tree1.fit(x1_train, y1_train)
d_tree_pred = d_tree1.predict(x1_test)

accuracy = accuracy_score(y1_test, d_tree_pred)
print("Accuracy: ", accuracy*100, "%")

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, roc_curve

# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, d_tree_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, d_tree_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, d_tree_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, d_tree_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
d_tree_scores = d_tree1.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, d_tree_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, d_tree_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest: fit on the training split, predict the test split.
# random_state is fixed because bootstrap sampling and per-split feature
# selection are random; without a seed the scores change from run to run.
classifier1 = RandomForestClassifier(random_state=42)
classifier1.fit(x1_train, y1_train)

pred2 = classifier1.predict(x1_test)

accuracy1 = accuracy_score(y1_test, pred2)
print("Accuracy: ", accuracy1*100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, pred2, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, pred2)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, pred2)
print("Precision: ", precision)
recall = recall_score(y1_test, pred2)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
rf_scores = classifier1.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, rf_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, rf_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

XGBoost

In [None]:
import xgboost as xgb
# XGBoost with a binary logistic objective and a fixed seed:
# fit on the training split, then predict and score the test split.
xgb_classifier = xgb.XGBClassifier(random_state=42, objective="binary:logistic")
xgb_classifier.fit(X=x1_train, y=y1_train)

xgb_pred = xgb_classifier.predict(x1_test)

accuracy2 = accuracy_score(y_true=y1_test, y_pred=xgb_pred)
print("Accuracy: ", accuracy2 * 100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, xgb_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, xgb_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, xgb_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, xgb_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
xgb_scores = xgb_classifier.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, xgb_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, xgb_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting: fit on the training split, predict the test split.
# random_state is fixed so the fitted ensemble, and therefore the reported
# scores, are reproducible across notebook runs.
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(x1_train, y1_train)
gb_pred = gb_classifier.predict(x1_test)

accuracy3 = accuracy_score(y1_test, gb_pred)
print("Accuracy: ", accuracy3*100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, gb_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, gb_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, gb_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, gb_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
gb_scores = gb_classifier.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, gb_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, gb_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: fit on the training split, predict the test split.
# random_state is fixed so the fitted ensemble, and therefore the reported
# scores, are reproducible across notebook runs.
ada_classifier = AdaBoostClassifier(random_state=42)
ada_classifier.fit(x1_train, y1_train)
ada_pred = ada_classifier.predict(x1_test)

accuracy4 = accuracy_score(y1_test, ada_pred)
print("Accuracy: ", accuracy4*100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, ada_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, ada_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, ada_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, ada_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
ada_scores = ada_classifier.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, ada_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, ada_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression (lbfgs solver, iteration cap raised to 1000):
# fit on the training split, then predict and score the test split.
lr_classifier = LogisticRegression(max_iter=1000, solver='lbfgs').fit(x1_train, y1_train)
lr_pred = lr_classifier.predict(x1_test)

accuracy5 = accuracy_score(y_true=y1_test, y_pred=lr_pred)
print("Accuracy: ", accuracy5 * 100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, lr_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, lr_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, lr_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, lr_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
lr_scores = lr_classifier.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, lr_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, lr_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with default settings:
# fit on the training split, then predict and score the test split.
nn_classifier = KNeighborsClassifier().fit(x1_train, y1_train)
nn_pred = nn_classifier.predict(x1_test)

# NOTE(review): `accuracy5` is also assigned by the Logistic Regression and SVM
# cells, so whichever of the three ran last wins. Give each model its own name
# if the individual scores are compared later.
accuracy5 = accuracy_score(y_true=y1_test, y_pred=nn_pred)
print("Accuracy: ", accuracy5 * 100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, nn_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, nn_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, nn_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, nn_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point, so use the predicted
# probability of the positive class (column 1 of predict_proba) instead.
nn_scores = nn_classifier.predict_proba(x1_test)[:, 1]
roc_auc = roc_auc_score(y1_test, nn_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, nn_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()

SVM

In [None]:
from sklearn import svm

# Support vector classifier with default settings:
# fit on the training split, then predict and score the test split.
svm_classifier = svm.SVC().fit(x1_train, y1_train)
svm_pred = svm_classifier.predict(x1_test)

# NOTE(review): `accuracy5` is also assigned by the Logistic Regression and
# Nearest Neighbor cells, so whichever of the three ran last wins. Give each
# model its own name if the individual scores are compared later.
accuracy5 = accuracy_score(y_true=y1_test, y_pred=svm_pred)
print("Accuracy: ", accuracy5 * 100, "%")

In [None]:
# Confusion Matrix
# confusion_matrix puts the ACTUAL classes on the rows and the PREDICTED
# classes on the columns, in the order given by `labels`. The target was
# label-encoded in sorted order, so 0 = no heart disease, 1 = heart disease.
class_names = ['Not Heart Disease', 'Heart Disease']
conf_matrix = confusion_matrix(y1_test, svm_pred, labels=[0, 1])

plt.figure(figsize= (4,3))
sns.heatmap(conf_matrix,
            annot=True,
            fmt='g',
            xticklabels=class_names,
            yticklabels=class_names)
plt.ylabel('Actual',fontsize=8)
plt.xlabel('Prediction',fontsize=8)
plt.title('Confusion Matrix',fontsize=12)
plt.show()

# Accuracy
accuracy = accuracy_score(y1_test, svm_pred)
print("Accuracy: ", accuracy*100, "%")

# precision_score and recall_score (computed for the positive class, label 1)
precision = precision_score(y1_test, svm_pred)
print("Precision: ", precision)
recall = recall_score(y1_test, svm_pred)
print("Recall: ", recall)

# roc_auc score
# ROC analysis needs a continuous score per sample. Hard 0/1 predictions
# collapse the curve to a single operating point. The SVC was created without
# probability=True, so predict_proba is unavailable; decision_function gives
# the signed distance to the separating surface, which ranks samples equally
# well (larger value = more confident in the positive class).
svm_scores = svm_classifier.decision_function(x1_test)
roc_auc = roc_auc_score(y1_test, svm_scores)
print("AUC (Area under the curve) score: ", roc_auc)

# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y1_test, svm_scores)
plt.figure(figsize=(4, 3))
plt.plot(fpr, tpr, label='ROC curve (area = {:.2f})'.format(roc_auc))
plt.plot([0, 1], [0, 1], linestyle='--')    # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()