# Importing the libraries

In [31]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Importing the dataset

In [32]:
df = pd.read_csv("heart.csv")
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


- Age: age of the patient [years]
- Sex: sex of the patient [M: Male, F: Female]
- ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
- RestingBP: resting blood pressure [mm Hg]
- Cholesterol: serum cholesterol [mm/dl]
- FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
- RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
- MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
- ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
- Oldpeak: oldpeak = ST [Numeric value measured in depression]
- ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
- HeartDisease: output class [1: heart disease, 0: Normal]

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [34]:
df.describe(include='all')

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


# Data Preprocessing

## Missing values

In [35]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

## Encoding categorical features

In [36]:
categorical_cols = []
for col in df.columns:
    if df[col].dtypes == 'object':
        categorical_cols.append(col)
print(categorical_cols)

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']


In [39]:
df = pd.concat([df, pd.get_dummies(df[categorical_cols], drop_first=True)], axis=1)
df.drop(categorical_cols, axis=1, inplace=True)
df.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1


## Splitting the dataset into the Training set and Test set

In [40]:
X = df.loc[:, df.columns != 'HeartDisease']
y = df.loc[:, 'HeartDisease']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## Feature Scaling

In [44]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Model Building

## Logistic Regression

In [45]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

LogisticRegression()

In [46]:
pred_lr = lr.predict(X_test)

In [47]:
cm_lr = confusion_matrix(y_test, pred_lr)
print(cm_lr)
print('accuracy_score =', accuracy_score(y_test, pred_lr))
print('f1_score =', f1_score(y_test, pred_lr))

[[ 68  24]
 [ 16 122]]
accuracy_score = 0.8260869565217391
f1_score = 0.8591549295774648


## KNN

In [69]:
k = 10

In [70]:
knn = KNeighborsClassifier(n_neighbors = k)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [71]:
pred_knn = knn.predict(X_test)

In [72]:
cm_knn = confusion_matrix(y_test, pred_knn)
print(cm_knn)
print('accuracy_score =', accuracy_score(y_test, pred_knn))
print('f1_score =', f1_score(y_test, pred_knn))

[[ 72  20]
 [ 14 124]]
accuracy_score = 0.8521739130434782
f1_score = 0.8794326241134751


## SVM

In [52]:
svc_l = SVC(kernel = 'linear', random_state = 0)
svc_l.fit(X_train, y_train)

svc_r = SVC(kernel = 'rbf', random_state = 0)
svc_r.fit(X_train, y_train)

SVC(random_state=0)

In [53]:
pred_svc_l = svc_l.predict(X_test)
pred_svc_r = svc_r.predict(X_test)

In [54]:
cm_svc_l = confusion_matrix(y_test, pred_svc_l)
print(cm_svc_l)
print('accuracy_score =', accuracy_score(y_test, pred_svc_l))
print('f1_score =', f1_score(y_test, pred_svc_l))

[[ 67  25]
 [ 17 121]]
accuracy_score = 0.8173913043478261
f1_score = 0.8521126760563381


In [55]:
cm_svc_r = confusion_matrix(y_test, pred_svc_r)
print(cm_svc_r)
print('accuracy_score =', accuracy_score(y_test, pred_svc_r))
print('f1_score =', f1_score(y_test, pred_svc_r))

[[ 70  22]
 [ 12 126]]
accuracy_score = 0.8521739130434782
f1_score = 0.881118881118881


## Naive Bayes

In [56]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [57]:
pred_nb = nb.predict(X_test)

In [58]:
cm_nb = confusion_matrix(y_test, pred_nb)
print(cm_nb)
print('accuracy_score =', accuracy_score(y_test, pred_nb))
print('f1_score =', f1_score(y_test, pred_nb))

[[ 70  22]
 [ 13 125]]
accuracy_score = 0.8478260869565217
f1_score = 0.8771929824561403


##  Decision Tree

In [59]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [60]:
pred_dt = dt.predict(X_test)

In [61]:
cm_dt = confusion_matrix(y_test, pred_dt)
print(cm_dt)
print('accuracy_score =', accuracy_score(y_test, pred_dt))
print('f1_score =', f1_score(y_test, pred_dt))

[[ 70  22]
 [ 24 114]]
accuracy_score = 0.8
f1_score = 0.832116788321168


## Random Forest

In [62]:
rf = RandomForestClassifier(n_estimators = 200)
rf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200)

In [63]:
pred_rf = rf.predict(X_test)

In [64]:
cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)
print('accuracy_score =', accuracy_score(y_test, pred_rf))
print('f1_score =', f1_score(y_test, pred_rf))

[[ 75  17]
 [ 14 124]]
accuracy_score = 0.8652173913043478
f1_score = 0.888888888888889
