## kNN Clasiffier
### ID: eo9232
### Name: Md Reza
### CSC 5825 - Fall 2021

In [1]:
# Import libraries and initialize variable
import csv
import numpy as np
import random
import math
import pandas as pd
from numpy.random import RandomState
k=5

#### Load the dataset

In [2]:
df=pd.read_csv('/u/mreza6/5825/Data/heart.csv')
rng = RandomState()
df = df.sample(frac = 1).reset_index(drop=True)

#### Split the features & target class

In [3]:
nRows = len(df.index)
# Features
X = df[df.columns[1:13]]
# Target class
y = df[df.columns[13]]
X = X.astype(float)

#### Transformed features using function F(X) = (X - mean)/std

In [4]:
xTrans = (X - X.mean())/X.std()

#### Mapped categorical variables to numerical values (through one-hot encoding)

In [5]:
xTrans=pd.get_dummies(xTrans)

#### Split the instances into training, validation, and test sets as (6:2:2)

In [6]:
nTrain = round(nRows*0.6)
xTrain = xTrans[:nTrain]

# training set
yTrain = y[:nTrain]

# Remaining test set
X_n = xTrans[-(nRows-nTrain):]
y_n = y[-(nRows-nTrain):]

# Split remaining 40% into half to keep 20% for Validation
numVal = int((nRows-nTrain)/2)
numTest = nRows-nTrain-numVal
xVal = X_n[:numVal]
yVal = y_n[:numVal]

# Calculated X_tst and y_tst 
xTest = X_n[-numTest:]
yTest = y_n[-numTest:]

#### Convert to NumPy array

In [7]:
xTrain = xTrain.to_numpy()
yTrain = yTrain.to_numpy()
xTest = xTest.to_numpy()
yTest = yTest.to_numpy()
xValidation = xVal.to_numpy()
yValidation = yVal.to_numpy()

#### Calculate the accuracy of best K

In [8]:
def accuracy(yTest,yPredict):
    x=yTest.flatten()==yPredict.flatten()
    target=np.mean(x)
    return np.round(target*100,2)

#### Calculate the L2 distance between neighbors

In [9]:
def l2_dist(xTrain,xTest,k):
    euc = -2 * xTrain@xTest.T 
    + np.sum(xTest**2,axis=1) 
    + np.sum(xTrain**2,axis=1)[:, np.newaxis]
    euc[euc < 0] = 0
    euc = euc**.5
    indx = np.argsort(euc, 0)
    euc = np.sort(euc,0)
    return indx[0:k,:], euc[0:k,:]

#### Get the K closest neighbors

In [10]:
def predict(xTrain,yTrain,xTest,k):
    indx, euc = l2_dist(xTrain,xTest,k)
    yTrain = yTrain.flatten()
    rows, columns = indx.shape
    result = list()
    for k in range(columns):
        tmp = list()
        for j in range(rows):
            item = indx[j][k]
            tmp.append(yTrain[item])
        result.append(max(tmp,key=tmp.count))
    result=np.array(result)
    return result

#### Fine tune the hyper-parameter k

In [11]:
bestAccuracy = np.zeros((k-1))
for n in range(1,k):
    yTemp = predict(xTrain,yTrain,xValidation,n)
    bestAccuracy[n-1] = accuracy(yTemp,yValidation)
print( "The value of K =", bestAccuracy.argmax()+1, " has the optimum accuracy of :", bestAccuracy.max(),"%")

The value of K = 4  has the optimum accuracy of : 73.33 %


#### Calculate kNN accuracy

In [12]:
def knnAccuracy(xTest,yPredict):
    c = 0
    for i in range(len(xTest)):
        if xTest[i] == yPredict[i]:
            c += 1
    accuracy=(c/float(len(xTest))) * 100.0
    return accuracy

#### Define the confusion matrix and calculate precision, recall, & f1-score

In [13]:
def confusionMatrix(xTest,yPredict):
    
    # Count TP
    #===================================
    tp=0
    for i in range(len(xTest)):
        if(xTest[i]==1 and yPredict[i]==1):
            tp=tp+1
       
    # Count FP
    #===================================
    fp=0
    for i in range(len(xTest)):
        if(xTest[i]==0 and yPredict[i]==1):
            fp=fp+1

    # Count FN
    #===================================
    fn=0
    for i in range(len(xTest)):
        if(xTest[i]==1 and yPredict[i]==0):
            fn=fn+1
    
    # Count TN
    #===================================
    tn=0
    for i in range(len(xTest)):
        if(xTest[i]==0 and yPredict[i]==0):
            tn=tn+1
    
    # Confusion Matrix
    print("\n\nConfusion Matrix")
    print("\n%15sActual" % "")
    print("%6s %7s %7s" % ("", "1", "0"))
    print("P%6s +--------+--------+" % "")
    print("r%6s | %-6s | %-6s |" % ("1", 'TP='+str(tp), 'FP='+str(fp)))
    print("e%6s +--------+--------+" % "")
    print("d%6s | %-6s | %-6s |" % ("0", 'FN='+str(fn), 'TN='+str(tn)))
    print(".%6s +--------+--------+\n" % "")

    # Precision
    precision=(tp)/(tp+fp)
    print("\n\n Precision:", format(precision,".4f"))
    
    # Recall
    recall=(tp)/(tp+fn)
    print(" Recall   :", format(recall,".4f"))
    
    # F1 Score
    f1score=(2*precision*recall)/(precision+recall)
    print(" F1-Score :",format(f1score,".4f"))

#### Make Prediction

In [14]:
kVal = bestAccuracy.argmax()+1
yPredict = predict(xTrain,yTrain,xTest,kVal)

#### Print Accuracy 

In [15]:
def printAccuracy(yTest,yPredict):
    acc=knnAccuracy(yTest,yPredict) 
    print(" Accuracy :",format(acc,".2f"),"%")

#### Print confusion matrix, Precision, Recall, F1-Score, & Accuracy

In [16]:
confusionMatrix(yTest,yPredict)
printAccuracy(yTest,yPredict)



Confusion Matrix

               Actual
             1       0
P       +--------+--------+
r     1 | TP=31  | FP=12  |
e       +--------+--------+
d     0 | FN=1   | TN=17  |
.       +--------+--------+



 Precision: 0.7209
 Recall   : 0.9688
 F1-Score : 0.8267
 Accuracy : 78.69 %
