In [1]:
import pandas as pd

Import the dataset

In [2]:
# Relative path: it is resolved against the kernel's current working directory.
csv_path = "Heart Disease Prediction.csv"
data = pd.read_csv(csv_path)
# Show the first rows as a quick sanity check of the parsed columns.
data.head()


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


Check for missing values

In [3]:
# Count the missing entries in each column (isna is the canonical name; isnull is its alias).
data.isna().sum()
# Every count in the output is 0, so the dataset has no missing values.

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Check for duplicate records

In [4]:
# duplicated() flags each row that repeats an earlier row; any() collapses the mask to one flag.
duplicate_mask = data.duplicated()
dup_data = duplicate_mask.any()
dup_data
# True means the dataset contains at least one duplicated record.


True

Drop duplicate records

In [5]:
# Remove repeated rows, keeping the first occurrence of each (the pandas default, spelled out).
data = data.drop_duplicates(keep="first")

In [6]:
# Re-run the duplicate check on the cleaned frame.
duplicate_mask = data.duplicated()
dup_data = duplicate_mask.any()
dup_data
# False confirms that no duplicated records remain after drop_duplicates().

False

#### Data Preprocessing

In [7]:
# Partition the columns by cardinality: a column with at most 10 distinct
# values is treated as categorical, every other column as numerical.
unique_counts = {name: data[name].nunique() for name in data.columns}

cat_col = [name for name in data.columns if unique_counts[name] <= 10]
num_col = [name for name in data.columns if unique_counts[name] > 10]

In [8]:
# Snapshot the de-duplicated, not-yet-encoded frame before later cells overwrite `data`.
copy_data = data.copy()

# Report the two column groups, separated by one blank line.
print("Categorical columns are :", cat_col, end="\n\n")
print("Numerical columns are :", num_col)

Categorical columns are : ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'target']

Numerical columns are : ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']


##### Encoding the categorical columns

In [9]:
# List the distinct codes stored in the 'cp' column to see how a categorical feature is encoded.
data['cp'].unique()

array([0, 1, 2, 3], dtype=int64)

In [10]:
# Exclude 'sex' and 'target' from the list of columns to be one-hot encoded.
# NOTE(review): presumably 'sex' is left out because it is already 0/1 and
# 'target' because it is the label — confirm the intent.
# A filter is used instead of list.remove() so that re-running this cell is a
# no-op rather than a ValueError once the names have already been removed.
cat_col = [name for name in cat_col if name not in ("sex", "target")]

In [11]:
# Show the columns that remain in the one-hot encoding list.
print("Categorical columns are :",cat_col)

Categorical columns are : ['cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']


In [12]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each feature, so a feature with k levels yields k-1 indicators.
# The encoding reads from `copy_data` (the pre-encoding snapshot taken earlier
# in the notebook) rather than from `data` itself. Re-running this cell then
# rebuilds the same frame instead of raising KeyError because the raw
# categorical columns were already replaced by their indicator columns.
data = pd.get_dummies(copy_data, columns=cat_col, drop_first=True)
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,52,1,125,212,168,1.0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,53,1,140,203,155,3.1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,70,1,145,174,125,2.6,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,61,1,148,203,161,0.0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,62,0,138,294,106,1.9,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


In [13]:
from sklearn.preprocessing import StandardScaler
# Scaler instance used in the next cell to standardize the numerical columns.
st = StandardScaler()

In [14]:
# Fit the scaler on the numerical columns and overwrite them with the scaled values.
# NOTE(review): the scaler is fitted on every row before the train/test split
# further down, so test-set statistics leak into the training features.
# Consider splitting first and fitting the scaler on the training rows only.
data[num_col] = st.fit_transform(data[num_col])
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,exang_1,slope_1,slope_2,ca_1,ca_2,ca_3,ca_4,thal_1,thal_2,thal_3
0,-0.267966,1,-0.376556,-0.667728,0.806035,-0.037124,0,0,0,0,...,0,0,1,0,1,0,0,0,0,1
1,-0.15726,1,0.47891,-0.841918,0.237495,1.773958,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,1.724733,1,0.764066,-1.403197,-1.074521,1.342748,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,0.728383,1,0.935159,-0.841918,0.499898,-0.899544,0,0,0,0,...,0,0,1,1,0,0,0,0,0,1
4,0.839089,0,0.364848,0.919336,-1.905464,0.739054,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0


Split the dataset into training and test sets

In [19]:
# Feature matrix: every column except the label.
X = data.drop(columns="target")
# Label vector: the 'target' column.
y = data["target"]

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier: logistic regression with scikit-learn's default settings.
log = LogisticRegression()
# Train on the training split; the fitted estimator is the cell's displayed output.
log.fit(X_train,y_train)

LogisticRegression()

In [31]:
# Predict the class labels of the held-out rows with the logistic regression model.
y_pred1 = log.predict(X_test)
y_pred1

array([1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1], dtype=int64)

In [30]:
from sklearn.metrics import accuracy_score

In [34]:
# Fraction of test rows whose predicted label matches the true label (logistic regression).
accuracy_score(y_test,y_pred1)

0.7868852459016393

SVC

In [35]:
from sklearn import svm

In [36]:
# Import the class directly instead of calling `svm.SVC()` on the module name.
# The original `svm = svm.SVC()` rebinds `svm` from the module to the
# estimator, so running the cell a second time fails with AttributeError.
# The variable keeps the name `svm` because later cells call svm.fit() and
# svm.predict() on it.
from sklearn.svm import SVC
svm = SVC()

In [37]:
# Train the support vector classifier on the training split.
svm.fit(X_train,y_train)

SVC()

In [38]:
# Predict the class labels of the held-out rows with the support vector classifier.
y_pred2 = svm.predict(X_test)

In [39]:
# Test-set accuracy of the support vector classifier.
accuracy_score(y_test,y_pred2)

0.8032786885245902

KNN

In [40]:
from sklearn.neighbors import KNeighborsClassifier

In [41]:
# k-nearest-neighbours classifier with scikit-learn's default settings.
knn = KNeighborsClassifier()

In [42]:
# Fit the k-NN classifier on the training split.
knn.fit(X_train,y_train)

KNeighborsClassifier()

In [43]:
# Predict the class labels of the held-out rows with the k-NN classifier.
y_pred3 = knn.predict(X_test)

In [44]:
# Test-set accuracy of the default k-NN classifier.
accuracy_score(y_test,y_pred3)

0.7377049180327869

In [45]:
# Sweep n_neighbors from 1 to 39 and record the test-set accuracy for each value.
# NOTE(review): k is scored on the test split, so choosing the best k from this
# list tunes the model on test data — consider a validation split or cross-validation.
score = []
for k in range(1, 40):
    # fit() returns the estimator itself, so construction and training can be chained.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score.append(accuracy_score(y_test, y_pred))
    

In [46]:
# Display the accuracies; the entry at position i corresponds to n_neighbors = i + 1.
score

[0.7213114754098361,
 0.8032786885245902,
 0.7049180327868853,
 0.7049180327868853,
 0.7377049180327869,
 0.8032786885245902,
 0.7868852459016393,
 0.8032786885245902,
 0.7704918032786885,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7377049180327869,
 0.7377049180327869,
 0.7540983606557377,
 0.7704918032786885,
 0.7540983606557377,
 0.7540983606557377,
 0.7377049180327869,
 0.7540983606557377,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7213114754098361,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869,
 0.7377049180327869]