In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle

In [12]:
class KNearestNeighbors():
    def __init__(self, n_neighbors, distance):
        self.n_neighbors = n_neighbors
        self.distance = distance
        self.data_x = None
        self.data_y = None

    def euclidean(self, x1, x2):
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=0))

    def normalized_euclidean(self, x1, x2):
        x1 = x1 / np.linalg.norm(x1)
        x2 = x1 / np.linalg.norm(x2)
        return np.sqrt(np.sum((x1 - x2) ** 2, axis=0))

    def cosine_similarity(self, x1, x2):
        return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))

    def fit(self, data_x, data_y):
        self.data_x = data_x
        self.data_y = data_y

    def predict(self, new_data_x):
        predict_y = []
        for i in range(len(new_data_x)):
            train_distances = []
            for j in range(len(self.data_x)):
                if self.distance == 'euclidean':
                    element = (self.euclidean(new_data_x[i], self.data_x[j]), j, self.data_y[j])
                elif self.distance == 'normalized_euclidean':
                    element = (self.normalized_euclidean(new_data_x[i], self.data_x[j]), j, self.data_y[j])
                else:
                    element = (self.cosine_similarity(new_data_x[i], self.data_x[j]), j, self.data_y[j])
                train_distances.append(element)
            train_distances.sort()
            new_train_distances = train_distances[:self.n_neighbors]
            pred_y = [new_train_distances[j][2] for j in range(len(new_train_distances))]
            if len(list(np.unique(pred_y))) == len(pred_y):
                predict_y.append(pred_y[0])
            else:
                count = []
                for j in list(np.unique(pred_y)):
                    c = 0
                    for k in pred_y:
                        if int(j) == int(k):
                            c += 1
                    count.append(c)
                maximum_count = max(count)
                maximum_count_index = count.index(maximum_count)
                predict_y.append(pred_y[maximum_count_index])
        return predict_y

In [3]:
def data_preprocessing(data):
    data = data.dropna()
    data = shuffle(data)
    class_names = list(np.unique(list(data['class'])))
    data['class'] = [class_names.index(i) for i in list(data['class'])]
    train, val, test = data.iloc[:int(0.6*len(data))], data.iloc[int(0.6*len(data)):int(0.8*len(data))], \
                       data.iloc[int(0.8*len(data)):]
    return train, val, test, class_names

In [8]:
def accuracy_score(actual, predict):
    acc = 0
    for i, j, in zip(actual, predict):
        if int(i) == int(j):
            acc += 1
    return acc/len(actual)

In [10]:
def k_nearest_neighbors(k, train, val, test):
    k_acc_scores = []
    distances = ['euclidean', 'normalized_euclidean', 'cosine_similarity']
    train_x, train_y = train.drop(columns=['class']).values, train['class'].values
    val_x, val_y = val.drop(columns=['class']).values, val['class'].values
    test_x, test_y = test.drop(columns=['class']).values, test['class'].values
    for j in distances:
        k_acc_scores = []
        for i in k:
            model = KNearestNeighbors(n_neighbors=i, distance=j)
            model.fit(train_x, train_y)
            pred_y = model.predict(val_x)
            acc = accuracy_score(val_y, pred_y)
            k_acc_scores.append(acc)
            plt.

In [6]:
def main():
    k = [1, 3, 5, 7]
    data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])
    print()
    print('Total Dataset size: ', len(data))
    train, val, test, class_names = data_preprocessing(data)
    print()
    print('Training set size: ', len(train))
    print('Validation set size: ', len(val))
    print('Testing set size: ', len(test))
    print()
    k_nearest_neighbors(k, train, val, test)

In [13]:
main()


Total Dataset size:  150

Training set size:  90
Validation set size:  30
Testing set size:  30

[(0.26457513110645964, 2, 2)]
[2]
[(0.24494897427831766, 67, 1)]
[1]
[(0.264575131106459, 80, 0)]
[0]
[(0.31622776601683755, 21, 2)]
[2]
[(0.14142135623730964, 29, 1)]
[1]
[(0.14142135623730917, 23, 0)]
[0]
[(0.24494897427831838, 42, 2)]
[2]
[(0.22360679774997935, 85, 1)]
[1]
[(0.5385164807134504, 25, 2)]
[2]
[(0.1414213562373093, 87, 0)]
[0]
[(0.24494897427831785, 31, 2)]
[2]
[(0.3162277660168377, 67, 1)]
[1]
[(0.7211102550927978, 9, 1)]
[1]
[(0.24494897427831766, 29, 1)]
[1]
[(0.3000000000000001, 10, 1)]
[1]
[(0.5196152422706629, 85, 1)]
[1]
[(0.14142135623730995, 85, 1)]
[1]
[(0.0, 21, 2)]
[2]
[(0.22360679774997935, 64, 2)]
[2]
[(0.33166247903553975, 32, 1)]
[1]
[(0.1999999999999993, 11, 1)]
[1]
[(0.17320508075688812, 20, 0)]
[0]
[(0.24494897427831822, 85, 1)]
[1]
[(0.5099019513592784, 21, 2)]
[2]
[(0.244948974278318, 54, 0)]
[0]
[(0.22360679774997858, 20, 0)]
[0]
[(0.22360679774997827,