In [47]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection  import train_test_split
import statistics as stats

In [7]:
heart_df = pd.read_csv("./heart.csv")
heart_df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [8]:
# One Hot Encoding
Sex = pd.get_dummies(heart_df.Sex,prefix="Sex")
ExerciseAngina = pd.get_dummies(heart_df.ExerciseAngina,prefix="ExerciseAngina")
ChestPainType = pd.get_dummies(heart_df.ChestPainType,prefix="ChestPainType")
RestingECG = pd.get_dummies(heart_df.RestingECG,prefix="RestingECG")
STSlope = pd.get_dummies(heart_df.ST_Slope,prefix="STSlope")

In [24]:
heart_df_encoded = pd.DataFrame()
heart_df_encoded["HeartDisease"] = heart_df["HeartDisease"]
heart_df_encoded = heart_df_encoded.join(Sex)
heart_df_encoded = heart_df_encoded.join(ChestPainType)
heart_df_encoded = heart_df_encoded.join(RestingECG)
heart_df_encoded = heart_df_encoded.join(ExerciseAngina)
heart_df_encoded = heart_df_encoded.join(STSlope)
heart_df_encoded["RestingBP"] = heart_df["RestingBP"]
heart_df_encoded["Cholesterol"] = heart_df["Cholesterol"]
heart_df_encoded["FastingBS"] = heart_df["FastingBS"]
heart_df_encoded["MaxHR"] = heart_df["MaxHR"]
heart_df_encoded["Oldpeak"] = heart_df["Oldpeak"]
heart_df_encoded["Age"] = heart_df["Age"]
heart_df_encoded.head()

Unnamed: 0,HeartDisease,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,...,ExerciseAngina_Y,STSlope_Down,STSlope_Flat,STSlope_Up,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Age
0,0,0,1,0,1,0,0,0,1,0,...,0,0,0,1,140,289,0,172,0.0,40
1,1,1,0,0,0,1,0,0,1,0,...,0,0,1,0,160,180,0,156,1.0,49
2,0,0,1,0,1,0,0,0,0,1,...,0,0,0,1,130,283,0,98,0.0,37
3,1,1,0,1,0,0,0,0,1,0,...,1,0,1,0,138,214,0,108,1.5,48
4,0,0,1,0,0,1,0,0,1,0,...,0,0,0,1,150,195,0,122,0.0,54


In [25]:
heart_y_df = heart_df_encoded["HeartDisease"]
heart_x_df = heart_df_encoded.drop(["HeartDisease"], axis=1)
heart_x_df.head()

Unnamed: 0,Sex_F,Sex_M,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ExerciseAngina_N,ExerciseAngina_Y,STSlope_Down,STSlope_Flat,STSlope_Up,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Age
0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,140,289,0,172,0.0,40
1,1,0,0,0,1,0,0,1,0,1,0,0,1,0,160,180,0,156,1.0,49
2,0,1,0,1,0,0,0,0,1,1,0,0,0,1,130,283,0,98,0.0,37
3,1,0,1,0,0,0,0,1,0,0,1,0,1,0,138,214,0,108,1.5,48
4,0,1,0,0,1,0,0,1,0,1,0,0,0,1,150,195,0,122,0.0,54


In [26]:
x_heart_train_df, x_heart_test_validation_df, y_heart_train_df, y_heart_test_validation_df = train_test_split(heart_x_df, heart_y_df, test_size=.4)

In [27]:
x_heart_test_df, x_heart_validation_df, y_heart_test_df, y_heart_validation_df= train_test_split(heart_x_df, heart_y_df, test_size=.5)

In [28]:
def eucledian(v1,v2):
    distance = np.sqrt(np.sum((v1-v2)**2))
    return distance

In [91]:
def predict(x_train_df,y_train_df,x_df,k):
    x_train_arr = np.array(x_train_df)
    pred = []
    for x in x_df:
        distance_list = []
        for j in range(len(x_train_arr)): 
            distances = eucledian(np.array(x_train_arr[j,:]),x) 
            distance_list.append(distances)
             
        distance_list = np.array(distance_list) 
        k_best_indexes = np.argsort(distance_list)[:k] 
        pred.append(stats.mode(y_train_df[k_best_indexes]))
    return pred

In [104]:
def five_fold_cross_validation(heart_x_df,heart_y_df):
    fold_size = len(heart_y_df)//5
    current_fold = 0
    cms = []
    k = [1,3,5,10,15]
    for i in range(5):
        training_index = [0,1,2,3,4]
        x_test_df = heart_x_df[(training_index[i]*fold_size):(((training_index[i]+1)*(fold_size))-1)]
        y_test_df = heart_y_df[(training_index[i]*fold_size):(((training_index[i]+1)*(fold_size))-1)]
        training_index.pop(i)
        
        x_train_df = heart_x_df[(training_index[0]*fold_size):(((training_index[0]+1)*(fold_size))-1)]
        y_train_df = heart_y_df[(training_index[0]*fold_size):(((training_index[0]+1)*(fold_size))-1)]
        for j in range(1,len(training_index)):
            x_train_df = x_train_df.append(heart_x_df[(training_index[j]*fold_size):(((training_index[j]+1)*(fold_size))-1)],ignore_index=False)
            y_train_df = y_train_df.append(heart_y_df[(training_index[j]*fold_size):(((training_index[j]+1)*(fold_size))-1)],ignore_index=False)
        
        model_pred = predict(np.array(x_train_df),np.array(y_train_df),np.array(x_test_df),k[i])
        actual = y_test_df.tolist()
        cms.append((actual, model_pred))
    return cms

In [105]:
import warnings
warnings.filterwarnings("ignore")
c = five_fold_cross_validation(heart_x_df,heart_y_df)
warnings.filterwarnings("default")

In [106]:
from sklearn import metrics
k = [1,3,5,10,15]
for i in range(5):
    print(f"K: {k[i]}")
    accuracy = metrics.accuracy_score(c[i][0], c[i][1])
    print(f"Accuracy: {accuracy}")
    precision = metrics.precision_score(c[i][0], c[i][1])
    print(f"Precision: {precision}")
    recall = metrics.recall_score(c[i][0], c[i][1])
    print(f"Recall: {recall}")
    f1_score = metrics.f1_score(c[i][0], c[i][1])
    print(f"F1 Score: {f1_score}")
    print()

K: 1
Accuracy: 0.5824175824175825
Precision: 0.45555555555555555
Recall: 0.6029411764705882
F1 Score: 0.5189873417721518

K: 3
Accuracy: 0.7087912087912088
Precision: 0.7
Recall: 0.8316831683168316
F1 Score: 0.7601809954751131

K: 5
Accuracy: 0.7692307692307693
Precision: 0.8496732026143791
Recall: 0.87248322147651
F1 Score: 0.8609271523178809

K: 10
Accuracy: 0.6868131868131868
Precision: 0.7333333333333333
Recall: 0.6666666666666666
F1 Score: 0.6984126984126984

K: 15
Accuracy: 0.6373626373626373
Precision: 0.6551724137931034
Recall: 0.4523809523809524
F1 Score: 0.5352112676056338



In [101]:
pred = predict(np.array(x_heart_train_df), np.array(y_heart_train_df), np.array(x_heart_test_df), 5)

In [90]:
from sklearn.metrics import confusion_matrix
actual = y_heart_test_df.tolist()
confusion_matrix(actual, pred)

array([[155,  55],
       [ 57, 192]])