In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings


warnings.filterwarnings("ignore")

# Load the headerless CSV; every column but the last goes to X, the last to y.
df = pd.read_csv("sorlie.csv", header=None)
data = df.to_numpy()
n = data.shape[1] - 1
X, y = data[:, :n], data[:, n]


# Min-max scale the feature columns.
# NOTE(review): the scaler is fitted on every row before any train/test
# split is drawn, so rows later used for testing influence the scaling.
scaler = MinMaxScaler()
X = scaler.fit(X).transform(X)


# holdout
def holdout(num_iteration=20, test_size=0.2, random_state=None):
    """Draw repeated random train/test splits of the module-level ``X`` and ``y``.

    Args:
        num_iteration: Number of splits to draw.
        test_size: Forwarded to ``train_test_split`` as its ``test_size``.
        random_state: Optional integer seed. When given, the whole sequence
            of splits is reproducible; ``None`` leaves the splits unseeded.

    Returns:
        A list with one ``[X_train, X_test, Y_train, Y_test]`` list per split.
    """
    # A single generator is shared by all iterations: passing the same integer
    # seed to every call would produce num_iteration identical splits.
    rng = None if random_state is None else np.random.RandomState(random_state)
    return [
        list(train_test_split(X, y, test_size=test_size, random_state=rng))
        for _ in range(num_iteration)
    ]


# KNN algorithm
def knn(X_train, X_test, Y_train, K):
    """Predict a label for each row of X_test from its K nearest training rows.

    Distances are Euclidean. Training rows are ranked nearest-first, rows at
    equal distance keep their training-set order, and the labels of the first
    K rows are handed to ``most_frequent`` to pick the prediction.

    Args:
        X_train: 2-D array-like of training samples, one row per sample.
        X_test: 2-D array-like of samples to classify, with the same number
            of columns as X_train.
        Y_train: Labels aligned with the rows of X_train.
        K: Number of neighbours that vote. Values larger than the number of
            training rows use every training row.

    Returns:
        A list holding one prediction per row of X_test.

    Raises:
        ValueError: If K is smaller than 1 or X_train has no rows.
    """
    # A negative K would otherwise slice off the wrong number of neighbours
    # without any error, and K == 0 would fail deep inside the vote.
    if K < 1:
        raise ValueError("K must be at least 1, got {!r}".format(K))

    train = np.asarray(X_train, dtype=float)
    if train.shape[0] == 0:
        raise ValueError("X_train must contain at least one sample")

    predictions = []
    for x_test in np.asarray(X_test, dtype=float):
        # Distance from this test row to every training row in one step.
        dists = np.linalg.norm(train - x_test, axis=1)
        # Stable sort keeps equidistant rows in training-set order.
        nearest = np.argsort(dists, kind="stable")[:K]
        predictions.append(most_frequent([Y_train[i] for i in nearest]))

    return predictions
        
        
            
def most_frequent(List):
    """Return the most common label in ``List``, converted to ``int``.

    When several labels share the highest count, the one that appears first
    in ``List`` wins, so the result does not depend on set iteration order.

    Args:
        List: Non-empty sequence of hashable labels convertible to ``int``.

    Returns:
        The winning label as an ``int``.

    Raises:
        ValueError: If ``List`` is empty.
    """
    # Dicts keep insertion order, so iterating the counts visits labels in
    # first-seen order and max() returns the earliest of any tied labels.
    counts = {}
    for label in List:
        counts[label] = counts.get(label, 0) + 1
    if not counts:
        raise ValueError("most_frequent() needs at least one label")
    return int(max(counts, key=counts.get))
      


# Evaluate Prediction
def evaluate(K=3):
    """Score the KNN classifier over the splits returned by ``holdout()``.

    For every split, predictions from ``knn`` are compared with the held-out
    labels using accuracy and macro-averaged precision and recall; each
    metric is then averaged over all splits.

    Args:
        K: Number of neighbours passed to ``knn``. Defaults to 3.

    Returns:
        A ``(accuracy, precision, recall)`` tuple of mean scores.
    """
    accuracies = []
    precisions = []
    recalls = []

    for X_train, X_test, Y_train, Y_test in holdout():
        predictions = knn(X_train, X_test, Y_train, K)

        accuracies.append(accuracy_score(Y_test, predictions))
        precisions.append(precision_score(Y_test, predictions, average='macro'))
        recalls.append(recall_score(Y_test, predictions, average='macro'))

    return np.mean(accuracies), np.mean(precisions), np.mean(recalls)


# Show the scaled feature matrix, then the averaged hold-out scores.
print("Normal Data: \n")
print(X)
print("\n\nValidations: \n")
Accuracy, Precision, Recall = evaluate()
report = (
    ("Validation Accuracy based on KNN :", Accuracy),
    ("Validation Precision based on KNN :", Precision),
    ("Validation Recall based on KNN :", Recall),
)
for caption, score in report:
    print(caption, score)



Normal Data: 

[[0.66550523 0.62954545 0.32898551 ... 0.47640118 0.5640327  0.56671252]
 [0.68850174 0.37727273 0.35362319 ... 0.68112094 0.36512262 0.78404402]
 [0.70615563 0.23863636 0.31405797 ... 0.62979351 0.         0.58872077]
 ...
 [0.70034843 0.42727273 0.31884058 ... 0.55294985 0.48228883 0.46795048]
 [0.60046458 0.21136364 0.24782609 ... 0.36283186 0.52588556 0.40165062]
 [0.70963995 0.30681818 0.12173913 ... 0.63569322 0.48501362 0.52269601]]


Validations: 

Validation Accuracy based on KNN : 0.7911764705882354
Validation Precision based on KNN : 0.8099166666666667
Validation Recall based on KNN : 0.8138273809523809
