In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kde, gaussian_kde
from matplotlib import gridspec
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first five rows.
csv_path = "/content/heart_2020_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [None]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,319795.0,319795.0,319795.0,319795.0
mean,28.325399,3.37171,3.898366,7.097075
std,6.3561,7.95085,7.955235,1.436007
min,12.02,0.0,0.0,1.0
25%,24.03,0.0,0.0,6.0
50%,27.34,0.0,0.0,7.0
75%,31.42,2.0,3.0,8.0
max,94.85,30.0,30.0,24.0


In [None]:
# Tally the null entries column by column and print the per-column totals.
missing_values_count = df.isna().sum(axis=0)
print(missing_values_count)

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64


In [None]:
# Number of rows that exactly repeat an earlier row of the raw frame.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

18078

In [None]:
# Build the working frame by keeping only the first occurrence of each distinct row;
# the raw `df` is left untouched.
dataset = df.drop_duplicates(keep="first")

In [None]:
# Sanity check: count duplicate rows left in the de-duplicated frame (expected to be 0).
dataset.duplicated().sum()

0

In [None]:
# Discard the 'Race' feature from the working frame.
dataset = dataset.drop(columns=["Race"])

In [None]:
# Preview the first rows to confirm the 'Race' column is no longer present.
dataset.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,No,Yes,Very good,8.0,No,No,No


In [None]:
# Categorical columns that will be converted to integer codes.
# Most of these are two-level (Yes/No, Female/Male) columns, but note that
# 'Diabetic' and 'GenHealth' carry more than two levels (see the unique-value
# listing later in the notebook), so this list is not strictly "binary".
cols = [
    'HeartDisease',
    'Smoking',
    'AlcoholDrinking',
    'Stroke',
    'DiffWalking',
    'Sex',
    'PhysicalActivity',
    'Diabetic',
    'Asthma',
    'GenHealth',
    'KidneyDisease',
    'SkinCancer',
]

In [None]:
# Convert the categorical columns listed in `cols` to integer codes.
#
# LabelEncoder numbers labels in sorted (alphabetical) order. That is harmless for
# the two-level columns, but for the ordinal GenHealth scale it produces
# Excellent=0 < Fair=1 < Good=2 < Poor=3 < Very good=4, which scrambles the ranking
# seen by the distance- and coefficient-based models trained later. GenHealth is
# therefore mapped explicitly from worst (0) to best (4) instead.
# NOTE(review): 'Diabetic' has four levels and is still label-encoded as before;
# confirm whether a one-hot or hand-written mapping would suit it better.
gen_health_rank = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Very good': 3, 'Excellent': 4}

label_cols = [col for col in cols if col != 'GenHealth']
dataset[label_cols] = dataset[label_cols].apply(LabelEncoder().fit_transform)

# Only map while the column still holds text labels, so re-running the cell is a no-op.
if not pd.api.types.is_numeric_dtype(dataset['GenHealth']):
    unexpected = set(dataset['GenHealth'].unique()) - set(gen_health_rank)
    if unexpected:
        # Fail loudly rather than let .map() turn unknown labels into NaN.
        raise ValueError(f"Unexpected GenHealth labels: {sorted(map(str, unexpected))}")
    dataset['GenHealth'] = dataset['GenHealth'].map(gen_health_rank).astype('int64')

In [None]:
# Preview the frame after label encoding; AgeCategory is still text at this point.
dataset.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,55-59,2,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,80 or older,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,65-69,2,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,75-79,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,40-44,0,1,4,8.0,0,0,0


In [None]:
# Count how many rows fall into each AgeCategory bracket.
age_counts = dataset["AgeCategory"].value_counts()
age_counts

65-69          31670
60-64          31219
70-74          29273
55-59          27610
50-54          23736
80 or older    23352
75-79          20713
45-49          20518
18-24          19998
40-44          19837
35-39          19526
30-34          17953
25-29          16312
Name: AgeCategory, dtype: int64

In [None]:
# Map each AgeCategory bracket to its rank, youngest (0) to oldest (12),
# so the encoded column preserves the natural age ordering.
age_brackets = [
    '18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
    '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older',
]
AgeCategory_dict = {bracket: rank for rank, bracket in enumerate(age_brackets)}

In [None]:
# Encode AgeCategory as the integer rank given by AgeCategory_dict.
#
# Series.map is used instead of Series.replace: replacing strings with ints relies
# on pandas silently downcasting the object column, which recent pandas versions
# deprecate, and replace() would leave any unmapped label in place as a string.
# The numeric-dtype guard keeps the cell a no-op when it is re-run.
if not pd.api.types.is_numeric_dtype(dataset['AgeCategory']):
    age_codes = dataset['AgeCategory'].map(AgeCategory_dict)
    if age_codes.isna().any():
        # A label missing from AgeCategory_dict would otherwise become NaN.
        unmapped = dataset.loc[age_codes.isna(), 'AgeCategory'].unique()
        raise ValueError(f"AgeCategory labels missing from AgeCategory_dict: {list(unmapped)}")
    dataset['AgeCategory'] = age_codes.astype('int64')

In [None]:
# Show the first 10 rows to check that AgeCategory now holds integer codes.
dataset.head(10)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,7,2,1,4,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,12,0,1,4,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,9,2,1,1,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,11,0,0,2,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,4,0,1,4,8.0,0,0,0
5,1,28.87,1,0,0,6.0,0.0,1,0,11,0,0,1,12.0,0,0,0
6,0,21.63,0,0,0,15.0,0.0,0,0,10,0,1,1,4.0,1,0,1
7,0,31.64,1,0,0,5.0,0.0,1,0,12,2,0,2,9.0,1,0,0
8,0,26.45,0,0,0,0.0,0.0,0,0,12,1,0,1,5.0,0,1,0
9,0,40.69,0,0,0,0.0,0.0,1,1,9,0,1,2,10.0,0,0,0


In [None]:
# For every column of the encoded frame, print its name followed by its distinct values.
for feature, column in dataset.items():
    print(feature)
    print(column.unique(), "\n")

HeartDisease
[0 1] 

BMI
[16.6  20.34 26.58 ... 62.42 51.46 46.56] 

Smoking
[1 0] 

AlcoholDrinking
[0 1] 

Stroke
[0 1] 

PhysicalHealth
[ 3.  0. 20. 28.  6. 15.  5. 30.  7.  1.  2. 21.  4. 10. 14. 18.  8. 25.
 16. 29. 27. 17. 24. 12. 23. 26. 22. 19.  9. 13. 11.] 

MentalHealth
[30.  0.  2.  5. 15.  8.  4.  3. 10. 14. 20.  1.  7. 24.  9. 28. 16. 12.
  6. 25. 17. 18. 21. 29. 22. 13. 23. 27. 26. 11. 19.] 

DiffWalking
[0 1] 

Sex
[0 1] 

AgeCategory
[ 7 12  9 11  4 10  8  6  5  0  3  2  1] 

Diabetic
[2 0 1 3] 

PhysicalActivity
[1 0] 

GenHealth
[4 1 2 3 0] 

SleepTime
[ 5.  7.  8.  6. 12.  4.  9. 10. 15.  3.  2.  1. 16. 18. 14. 20. 11. 13.
 17. 24. 19. 21. 22. 23.] 

Asthma
[1 0] 

KidneyDisease
[0 1] 

SkinCancer
[1 0] 



In [None]:
# Separate the features from the HeartDisease target.
x = dataset.drop('HeartDisease', axis=1)
y = dataset['HeartDisease']

# Split BEFORE scaling: fitting the scaler on the full data would let the test
# rows' mean/variance leak into the training features and bias the evaluation.
# NOTE: `x` now stays the unscaled feature frame; only X_train / X_test are scaled.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=0)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to the held-out rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# KNN classifier that uses the 3 nearest neighbours.
n_neighbors = 3
knn = KNeighborsClassifier(n_neighbors=n_neighbors)

In [None]:
# Fit the KNN model on the training features and labels.
knn.fit(X=X_train, y=y_train)

In [None]:
# Predict HeartDisease labels for the held-out rows with the KNN model.
y_pred = knn.predict(X=X_test)


In [None]:
# Score the KNN predictions against the true test labels and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8905442131777808


In [None]:
# Gaussian Naive Bayes classifier, constructed with no arguments (library defaults).
naive_bayes = GaussianNB()

In [None]:
# Fit the Naive Bayes model on the training features and labels.
naive_bayes.fit(X=X_train, y=y_train)

In [None]:
# Predict HeartDisease labels for the held-out rows with the Naive Bayes model.
# This overwrites the previous model's `y_pred`.
y_pred = naive_bayes.predict(X=X_test)

In [None]:
# Score the Naive Bayes predictions against the true test labels and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8397023730611163


In [None]:
# Logistic Regression classifier, constructed with no arguments (library defaults).
logreg = LogisticRegression()

In [None]:
# Fit the logistic regression model on the training features and labels.
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict HeartDisease labels for the held-out rows with the logistic regression model.
# This overwrites the previous model's `y_pred`.
y_pred = logreg.predict(X=X_test)


In [None]:
# Score the logistic regression predictions against the true test labels and report the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.909485615802731
