In [170]:
#Heart Disease Data Set
#This database contains 76 attributes, but all published experiments refer to using a subset of 14 of them. 
#In particular, the Cleveland database is the only one that has been used by ML researchers to this date. 
#The "goal" field refers to the presence of heart disease in the patient. 
#It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on 
#simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0). 

In [171]:
# Column names for the 14 attributes; passed as `names=` when the data file is read.
cols = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num',
]


In [172]:
import pandas as pd

# No header row is read from the file; the column names come from `cols`.
heartdf = pd.read_csv(
    'processed.cleveland.data',
    header=None,
    names=cols,
)

In [173]:
# Summarise the frame: per-column dtype and non-null count.
heartdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          303 non-null object
thal        303 non-null object
num         303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [174]:
# (number of rows, number of columns) of the loaded frame.
heartdf.shape

(303, 14)

In [175]:
# Count the null entries in each column.
# Indexing the frame with its own isnull() mask and then calling count() always
# yields 0: the only cells kept by the mask are the null ones, and count() skips
# nulls. Summing the boolean mask gives the real per-column number of nulls.
heartdf.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [176]:
# Preview the first 10 rows.
heartdf.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


In [177]:
# Distinguish presence (values 1,2,3,4) from absence (value 0):
# collapse the grades 2,3,4 into 1 so that 'num' becomes a binary label.
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on the Series returned by heartdf.num is a chained in-place update, which
# raises a warning in recent pandas and leaves the DataFrame untouched when
# Copy-on-Write is enabled.
heartdf['num']=heartdf['num'].replace(to_replace=[2,3,4],value=1)

In [178]:
# Count how many rows fall into each distinct value of 'num' (class balance check).
heartdf.num.value_counts()

0    164
1    139
Name: num, dtype: int64

In [179]:
# 'ca' and 'thal' were loaded with object dtype; convert both to numeric.
# errors='coerce' turns every value that cannot be parsed as a number into NaN.
for numeric_col in ('ca', 'thal'):
    heartdf[numeric_col] = pd.to_numeric(heartdf[numeric_col], errors='coerce')

In [180]:
# Re-inspect dtypes and non-null counts, then look at the first values of 'ca'.
heartdf.info()
heartdf['ca'].head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null float64
sex         303 non-null float64
cp          303 non-null float64
trestbps    303 non-null float64
chol        303 non-null float64
fbs         303 non-null float64
restecg     303 non-null float64
thalach     303 non-null float64
exang       303 non-null float64
oldpeak     303 non-null float64
slope       303 non-null float64
ca          299 non-null float64
thal        301 non-null float64
num         303 non-null int64
dtypes: float64(13), int64(1)
memory usage: 33.2 KB


0    0.0
1    3.0
2    2.0
3    0.0
4    0.0
Name: ca, dtype: float64

In [181]:
# Preview the last 10 rows.
heartdf.tail(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
293,63.0,1.0,4.0,140.0,187.0,0.0,2.0,144.0,1.0,4.0,1.0,2.0,7.0,1
294,63.0,0.0,4.0,124.0,197.0,0.0,0.0,136.0,1.0,0.0,2.0,0.0,3.0,1
295,41.0,1.0,2.0,120.0,157.0,0.0,0.0,182.0,0.0,0.0,1.0,0.0,3.0,0
296,59.0,1.0,4.0,164.0,176.0,1.0,2.0,90.0,0.0,1.0,2.0,2.0,6.0,1
297,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
298,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
299,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,1
300,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,1
301,57.0,0.0,2.0,130.0,236.0,0.0,2.0,174.0,0.0,0.0,2.0,1.0,3.0,1
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,,3.0,0


In [182]:
# Check for missing values: number of NaN entries in each column of the DataFrame.
heartdf.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64

In [183]:
# Replace the missing values in 'ca' and 'thal' with the mean of the respective column.
# The filled Series is assigned back to the column. Calling fillna(..., inplace=True)
# on heartdf.ca / heartdf.thal is a chained in-place update, which raises a warning
# in recent pandas and leaves the DataFrame untouched when Copy-on-Write is enabled.
# NOTE(review): in the rows shown by head()/tail() both columns hold a few discrete
# codes, so the mean may produce values outside that set. Consider mode imputation
# after confirming the column semantics.
heartdf['ca']=heartdf['ca'].fillna(heartdf['ca'].mean())
heartdf['thal']=heartdf['thal'].fillna(heartdf['thal'].mean())

In [184]:
# Verify again that no NaN values remain after the replacement with the mean.
heartdf.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [185]:
# Separate the predictors from the target.
# Features: every column except 'num'
X = heartdf.drop(columns='num')
# Labels: the 'num' column
y = heartdf.num

In [186]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and random_state is fixed so the split is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [187]:
# Standardise the features so that they are all on the same scale.
# The scaler is fitted on the training split only and then applied to both splits.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
scaledX_train, scaledX_test = scaler.transform(X_train), scaler.transform(X_test)

In [188]:
# Train (fit) a k-nearest-neighbours classifier with k = 9 on the standardised
# training split. Prediction on the test split and comparison with the known
# labels happen in the following cells.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(scaledX_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [189]:
# Mean accuracy of the fitted classifier on the standardised test split.
knn.score(scaledX_test,y_test)

0.8791208791208791

In [190]:
# Predict the class label for every row of the standardised test split and display the array.
y_pred=knn.predict(scaledX_test)
y_pred

array([1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1])

In [191]:
# Confusion matrix of the known test labels against the predictions
# (rows: true class, columns: predicted class).
from sklearn import metrics
metrics.confusion_matrix(y_test,y_pred)

array([[43,  6],
       [ 5, 37]])

In [192]:
# Print the confusion matrix followed by the per-class precision / recall / F1 report.
from sklearn.metrics import classification_report, confusion_matrix

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

[[43  6]
 [ 5 37]]
              precision    recall  f1-score   support

           0       0.90      0.88      0.89        49
           1       0.86      0.88      0.87        42

    accuracy                           0.88        91
   macro avg       0.88      0.88      0.88        91
weighted avg       0.88      0.88      0.88        91

