In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier


In [2]:
# read data
df = pd.read_csv('../Data/data_cleaned.csv')
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,7,0,2,1,3,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,0,0,1,3,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,0,2,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,0,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,0,0,1,3,8.0,0,0,0


## splitting data into train and Test sets

In [3]:
# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(df.drop('HeartDisease', axis=1), df['HeartDisease'], test_size=0.2, random_state=42)
# show the shape of the train and test data
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
print(y_test.head())




(255836, 17)
(63959, 17)
(255836,)
(63959,)
271884    0
270361    0
219060    0
24010     0
181930    0
Name: HeartDisease, dtype: int64


## OverSampling 

In [4]:
# create a classifier
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
# fit the classifier to the training data
clf.fit(X_train, y_train)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score before oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix before oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report before oversampling: \n', classification_report(y_test, y_pred))


Accuracy score before oversampling:  0.9125689895089042
Confusion matrix before oversampling: 
 [[58367     0]
 [ 5592     0]]
Classification report before oversampling: 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95     58367
           1       0.00      0.00      0.00      5592

    accuracy                           0.91     63959
   macro avg       0.46      0.50      0.48     63959
weighted avg       0.83      0.91      0.87     63959



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## -----------------Random Forrest-----------------

In [5]:
# oversample the data
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
# show the shape of the resampled data
print(X_resampled.shape)
print(y_resampled.shape)
# use the resampled data to train the classifier
clf.fit(X_resampled, y_resampled)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


(468110, 17)
(468110,)
Accuracy score after oversampling:  0.7250113353867321
Confusion matrix after oversampling: 
 [[42175 16192]
 [ 1396  4196]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.97      0.72      0.83     58367
           1       0.21      0.75      0.32      5592

    accuracy                           0.73     63959
   macro avg       0.59      0.74      0.58     63959
weighted avg       0.90      0.73      0.78     63959



# Models

## -----------------Logistic Regression-----------------

In [6]:
logreg = LogisticRegression()
# fit the model to the training data
logreg.fit(X_resampled, y_resampled)
# predict the test data
y_pred = logreg.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy score after oversampling:  0.7477602839318939
Confusion matrix after oversampling: 
 [[43484 14883]
 [ 1250  4342]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.97      0.75      0.84     58367
           1       0.23      0.78      0.35      5592

    accuracy                           0.75     63959
   macro avg       0.60      0.76      0.60     63959
weighted avg       0.91      0.75      0.80     63959



## -----------------Naive Bayes-----------------


In [7]:
# make a naive bayes classifier
gnb = GaussianNB()
# fit the model to the training data
gnb.fit(X_resampled, y_resampled)
# predict the test data
y_pred = gnb.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


Accuracy score after oversampling:  0.7956816085304649
Confusion matrix after oversampling: 
 [[47572 10795]
 [ 2273  3319]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.95      0.82      0.88     58367
           1       0.24      0.59      0.34      5592

    accuracy                           0.80     63959
   macro avg       0.59      0.70      0.61     63959
weighted avg       0.89      0.80      0.83     63959



## -----------------SVM-----------------


In [8]:
# # create a classifier
# clf = svm.SVC()
# # fit the classifier to the training data
# clf.fit(X_resampled, y_resampled)
# # predict the test data
# y_pred = clf.predict(X_test)
# # show the accuracy score
# print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# # show the confusion matrix
# print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# # show the classification report
# print('Classification report after oversampling: \n', classification_report(y_test, y_pred))

## -----------------KNN-----------------


In [9]:
# create a classifier
clf = KNeighborsClassifier(n_neighbors=3)
# fit the classifier to the training data
clf.fit(X_resampled, y_resampled)
# predict the test data
y_pred = clf.predict(X_test)
# show the accuracy score
print('Accuracy score after oversampling: ', accuracy_score(y_test, y_pred))
# show the confusion matrix
print('Confusion matrix after oversampling: \n', confusion_matrix(y_test, y_pred))
# show the classification report
print('Classification report after oversampling: \n', classification_report(y_test, y_pred))


Accuracy score after oversampling:  0.8358010600540972
Confusion matrix after oversampling: 
 [[51628  6739]
 [ 3763  1829]]
Classification report after oversampling: 
               precision    recall  f1-score   support

           0       0.93      0.88      0.91     58367
           1       0.21      0.33      0.26      5592

    accuracy                           0.84     63959
   macro avg       0.57      0.61      0.58     63959
weighted avg       0.87      0.84      0.85     63959

