In [1]:
import csv
import numpy as np
import pandas as pd
import sys
import time
import matplotlib.pyplot as plt
from collections import Counter

# Dataset paths are relative to the notebook's working directory.
iris_dataset = './iris.data'
pima_dataset = './pima-indians-diabetes.data'

# Fixed seed so the row shuffle -- and therefore every cross-validation fold
# built from these arrays -- is identical on every Restart & Run All.
SEED = 42

# The files are read without a header row. read_csv already returns a
# DataFrame, so each frame is shuffled directly (frac=1 keeps every row).
iris_df = pd.read_csv(iris_dataset, sep=',', header=None).sample(frac=1, random_state=SEED)
pima_df = pd.read_csv(pima_dataset, sep=',', header=None).sample(frac=1, random_state=SEED)

# Iris: columns 0-3 are used as features, column 4 as the label.
iris_x = iris_df.iloc[:, 0:4].to_numpy()
iris_y = iris_df.iloc[:, 4].to_numpy().astype(str)  # 1-D array of string labels

# Pima: columns 0-7 are used as features, column 8 as the label.
pima_x = pima_df.iloc[:, 0:8].to_numpy()
pima_y = pima_df.iloc[:, 8].to_numpy().astype(str)  # 1-D array of string labels
#np.shape(iris_y)

In [2]:
def nearestNeighborClassifier(train_x, train_y, test_x, k):
    """Classify one sample by majority vote among its k nearest training rows.

    Distances are Euclidean. Rows at equal distance are ranked by their
    position in ``train_x``; when several labels are equally frequent among
    the k neighbours, the one met first in nearest-first order is returned.

    Parameters
    ----------
    train_x : array-like, shape (n_samples, n_features)
        Training feature rows.
    train_y : sequence of length n_samples
        Label of each training row.
    test_x : array-like, shape (n_features,)
        The sample to classify.
    k : int
        Number of neighbours that vote.

    Returns
    -------
    The winning label taken from ``train_y``, or the integer ``1`` when
    ``k`` exceeds the number of training rows (legacy sentinel, kept so
    existing callers see no change).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    train_x = np.asarray(train_x)
    if k > len(train_x):
        return 1    # the neighbours cannot outnumber the rows of the training set
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # Euclidean distance from test_x to every training row in one vectorised step.
    distances = np.sqrt(np.sum(np.square(np.asarray(test_x) - train_x), axis=1))

    # A stable sort keeps equally distant rows in index order.
    nearest = np.argsort(distances, kind='stable')[:k]
    targets = [train_y[index] for index in nearest]

    # return most common target
    return Counter(targets).most_common(1)[0][0]

In [4]:
folds = 10

####### IRIS DATASET KNN #######
# Candidate neighbour counts: the odd numbers 1, 3, ..., 49.
K = list(range(1, 50, 2))

fold_size = int(len(iris_x) / folds)

all_iris_scores = []  # one [k, mean accuracy %] entry per candidate k
for k in K:
    total_score = []  # accuracy % of every fold for the current k
    for i in range(folds):
        # Fold bounds are derived from fold_size instead of a hard-coded 15;
        # the last fold also takes any remainder rows so every row is
        # validated exactly once.
        val_from = fold_size * i
        val_to = len(iris_x) if i == folds - 1 else fold_size * (i + 1)

        validation_set = iris_x[val_from:val_to]
        validation_label = iris_y[val_from:val_to]

        # Everything outside the validation slice is the training data.
        train_set = np.delete(iris_x, np.s_[val_from:val_to], 0)
        train_label = np.delete(iris_y, np.s_[val_from:val_to], 0)

        predictions = [
            nearestNeighborClassifier(train_set, train_label, test_x, k)
            for test_x in validation_set
        ]

        # Accuracy of this fold, computed once after all predictions are in.
        match = sum(1 for predicted, actual in zip(predictions, validation_label)
                    if predicted == actual)
        score = (match / len(predictions)) * 100 if predictions else 0.0
        total_score.append(score)
    all_iris_scores.append([k, np.mean(total_score)])

for s in all_iris_scores:
    print('[IRIS Dataset] Number of Neighbors: ', s[0], ' -> ', round(s[1],2), '%')

    
    
####### PIMA DATASET KNN #######
# Candidate neighbour counts: the odd numbers 1, 3, ..., 69.
K = list(range(1, 71, 2))
fold_size = int(len(pima_x) / folds)

all_pima_scores = []  # one [k, mean accuracy %] entry per candidate k

for k in K:
    total_score = []  # accuracy % of every fold for the current k
    for i in range(folds):
        # The last fold also takes the remainder rows left over by the
        # integer fold_size, whatever the value of `folds` is.
        val_from = fold_size * i
        val_to = len(pima_x) if i == folds - 1 else fold_size * (i + 1)

        validation_set = pima_x[val_from:val_to]
        validation_label = pima_y[val_from:val_to]

        # Everything outside the validation slice is the training data.
        train_set = np.delete(pima_x, np.s_[val_from:val_to], 0)
        train_label = np.delete(pima_y, np.s_[val_from:val_to], 0)

        predictions = [
            nearestNeighborClassifier(train_set, train_label, test_x, k)
            for test_x in validation_set
        ]

        # Accuracy of this fold, computed once after all predictions are in.
        match = sum(1 for predicted, actual in zip(predictions, validation_label)
                    if predicted == actual)
        score = (match / len(predictions)) * 100 if predictions else 0.0
        total_score.append(score)
    all_pima_scores.append([k, np.mean(total_score)])

for s in all_pima_scores:
    print('[PIMA Dataset] Number of Neighbors: ', s[0], ' -> ', round(s[1],2), '%')

# Best mean accuracy over all k. The previous max(all_pima_scores[1]) only
# compared the k and the accuracy of the second entry with each other.
print('[PIMA Dataset] MAX ACCURACY  ', round(max(s[1] for s in all_pima_scores),2), '%')
    
    
# IRIS / PIMA PLOT SCORES
# One accuracy-vs-k figure per dataset. The PIMA figure previously re-plotted
# all_iris_scores; each figure now reads its own score list and carries a
# title so the two can be told apart.
for dataset_name, dataset_scores in (('IRIS', all_iris_scores),
                                     ('PIMA', all_pima_scores)):
    neighbors = [entry[0] for entry in dataset_scores]
    accuracy = [entry[1] for entry in dataset_scores]

    fig, ax = plt.subplots()
    ax.plot(neighbors, accuracy)
    ax.set_title(dataset_name + ' Dataset: k-NN cross-validation accuracy')
    ax.set_xlabel('Number of Neighbors')
    ax.set_ylabel('Accuracy %')
    plt.show()


[IRIS Dataset] Number of Neighbors:  1  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  3  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  5  ->  97.33 %
[IRIS Dataset] Number of Neighbors:  7  ->  96.67 %
[IRIS Dataset] Number of Neighbors:  9  ->  97.33 %
[IRIS Dataset] Number of Neighbors:  11  ->  98.0 %
[IRIS Dataset] Number of Neighbors:  13  ->  98.0 %
[IRIS Dataset] Number of Neighbors:  15  ->  97.33 %
[IRIS Dataset] Number of Neighbors:  17  ->  98.0 %
[IRIS Dataset] Number of Neighbors:  19  ->  98.0 %
[IRIS Dataset] Number of Neighbors:  21  ->  97.33 %
[IRIS Dataset] Number of Neighbors:  23  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  25  ->  97.33 %
[IRIS Dataset] Number of Neighbors:  27  ->  95.33 %
[IRIS Dataset] Number of Neighbors:  29  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  31  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  33  ->  96.0 %
[IRIS Dataset] Number of Neighbors:  35  ->  95.33 %
[IRIS Dataset] Number of Neighbors:  37  ->  94.0 %
[IRIS Dat