# Cleveland Heart Disease dataset UCI — Models

## Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

## Loading Dataset into Dataframe

In [2]:
dataset_path = "../data/heart_cleveland_upload.csv"

In [4]:
df = pd.read_csv(dataset_path)
df.rename(columns={'condition':'target'}, inplace=True)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


In [5]:
df.shape

(297, 14)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    int64  
 1   sex       297 non-null    int64  
 2   cp        297 non-null    int64  
 3   trestbps  297 non-null    int64  
 4   chol      297 non-null    int64  
 5   fbs       297 non-null    int64  
 6   restecg   297 non-null    int64  
 7   thalach   297 non-null    int64  
 8   exang     297 non-null    int64  
 9   oldpeak   297 non-null    float64
 10  slope     297 non-null    int64  
 11  ca        297 non-null    int64  
 12  thal      297 non-null    int64  
 13  target    297 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 32.6 KB


In [7]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0,297.0
mean,54.542088,0.676768,2.158249,131.693603,247.350168,0.144781,0.996633,149.599327,0.326599,1.055556,0.602694,0.676768,0.835017,0.461279
std,9.049736,0.4685,0.964859,17.762806,51.997583,0.352474,0.994914,22.941562,0.469761,1.166123,0.618187,0.938965,0.95669,0.49934
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,2.0,120.0,211.0,0.0,0.0,133.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,56.0,1.0,2.0,130.0,243.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,0.0,0.0
75%,61.0,1.0,3.0,140.0,276.0,0.0,2.0,166.0,1.0,1.6,1.0,1.0,2.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,3.0,2.0,1.0


## Linear regression and kNN

In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import accuracy_score

In [68]:
X = df.drop("target", axis="columns")
y = df.target

In [69]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
292,40,1,3,152,223,0,0,181,0,0.0,0,0,2
293,39,1,3,118,219,0,0,140,0,1.2,1,0,2
294,35,1,3,120,198,0,0,130,1,1.6,1,0,2
295,35,0,3,138,183,0,0,182,0,1.4,0,0,0


In [70]:
y

0      0
1      0
2      0
3      1
4      0
      ..
292    1
293    1
294    1
295    0
296    1
Name: target, Length: 297, dtype: int64

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [72]:
print("X_train.shape: ", X_train.shape)
print("X_test.shape: ", X_test.shape)
print("y_train.shape: ", y_train.shape)
print("y_test.shape: ", y_test.shape)

X_train.shape:  (267, 13)
X_test.shape:  (30, 13)
y_train.shape:  (267,)
y_test.shape:  (30,)


### Normalization

In [73]:
X_train = (X_train - X_train.min())/(X_train.max() - X_train.min()).values
X_test = (X_test-X_test.min())/(X_test.max()-X_test.min()).values

In [74]:
X_train.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
111,0.520833,1.0,0.666667,0.651163,0.242009,0.0,1.0,0.717557,0.0,0.258065,0.0,0.0,1.0
219,0.604167,0.0,1.0,0.883721,0.226027,1.0,1.0,0.572519,1.0,0.451613,0.5,0.666667,0.5
24,0.875,0.0,0.333333,0.767442,0.401826,0.0,0.0,0.694656,0.0,0.064516,0.0,0.666667,0.0
82,0.770833,0.0,0.666667,0.604651,0.347032,0.0,1.0,0.618321,0.0,0.0,0.5,0.333333,0.0
7,0.666667,1.0,0.0,0.465116,0.246575,0.0,0.0,0.564885,0.0,0.419355,0.5,0.666667,0.0


### Fitting into the model

In [75]:
logre = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=2)

In [76]:
logre.fit(X_train, y_train)
knn.fit(X_train, y_train)

In [79]:
y_pred_logre = logre.predict(X_test)
y_pred_knn = knn.predict(X_test)

print("y_pred_logre:", y_pred_logre)
print("y_pred_knn:", y_pred_knn)

y_pred_logre: [1 1 0 0 1 0 1 1 1 0 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 1 1 1]
y_pred_knn: [1 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 0]


### Model evaluation

In [81]:
print("Accuracy score of logistic regression",accuracy_score(y_test,y_pred_logre))
print("Accuracy score of kNN",accuracy_score(y_test,y_pred_knn))

Accuracy score of logistic regression 0.8333333333333334
Accuracy score of kNN 0.8666666666666667
