In [6]:
# KNN walkthrough adapted from: https://machinelearningmastery.com/tutorial-to-implement-k-nearest-neighbors-in-python-from-scratch/
# Related GitHub reference (note: this link points to a K-Means file, not a KNN one): https://github.com/madhug-nadig/Machine-Learning-Algorithms-from-Scratch/blob/master/K%20Means%20Clustering.py

In [1]:
from collections import Counter
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle

In [8]:
# Load the iris dataset once, as a (features, labels) pair of arrays.
X_class, y_class = load_iris(return_X_y=True)

# We just want binary classification, so keep only the first 100 samples.
# NOTE(review): this assumes the iris rows are ordered by class so that the
# first 100 rows contain exactly two classes - confirm against the dataset docs.
X_class = X_class[:100]
y_class = y_class[:100]

# Shuffle features and labels together. The fixed seed makes the row order
# (and therefore every downstream result) reproducible across kernel restarts.
X_class, y_class = shuffle(X_class, y_class, random_state=86)

# DataFrame views of the same data, handy for inspection.
x_df = pd.DataFrame(X_class)
y_df = pd.DataFrame(y_class)

In [13]:
def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two feature vectors.

    Each row is the set of features for one node (sample).

    Args:
        row1: Indexable sequence of numbers.
        row2: Indexable sequence of numbers with the same length as ``row1``.

    Returns:
        The square root of the summed squared per-feature differences.

    Raises:
        ValueError: If the two rows do not have the same number of features.
            Without this check, extra trailing features in ``row2`` would be
            silently ignored and the distance would be wrong.
    """
    if len(row1) != len(row2):
        raise ValueError(
            f"rows must have the same length, got {len(row1)} and {len(row2)}"
        )
    distance = 0.0
    for left, right in zip(row1, row2):
        distance += (left - right) ** 2
    return sqrt(distance)

In [16]:
# Sanity check: the distance between (0, 0) and (3, 3) should be
# sqrt(3**2 + 3**2) = sqrt(18).
a = [0,0]
b = [3,3]

euclidean_distance(a,b)

4.242640687119285

In [28]:
def get_neighbors(X_train, y_train, test_row, num_neighbors):
    """Locate the training rows most similar to ``test_row``.

    Args:
        X_train: Indexable collection of training feature rows.
        y_train: Labels, indexable in parallel with ``X_train``.
        test_row: Feature row to find neighbors for.
        num_neighbors: How many neighbors to return. Values larger than the
            number of training rows are clamped to that number (previously
            this raised ``IndexError``); values below zero yield no neighbors.

    Returns:
        A list of ``[row, label]`` pairs, ordered from nearest to farthest
        by Euclidean distance to ``test_row``.
    """
    distances = [
        (X_train[i], euclidean_distance(X_train[i], test_row), y_train[i])
        for i in range(len(X_train))
    ]
    # list.sort is stable, so rows at equal distance keep their training order.
    distances.sort(key=lambda tup: tup[1])
    # Never ask for more neighbors than there are training rows.
    k = max(0, min(num_neighbors, len(distances)))
    return [[row, label] for row, _, label in distances[:k]]

In [29]:
# Toy data for checking get_neighbors by hand: four 5-feature training rows,
# their labels, and one query row.
pseudo_x_train = [[1,2,3,4,5], [6,7,8,9,10],[90,91,92,93,94], [100,101,102,103,104]]
pseudo_y_train = [0,0,1,1]
pseudo_test_row = [10,11,12,13,14]

# Ask for the 3 training rows closest to the query row.
get_neighbors(pseudo_x_train, pseudo_y_train, pseudo_test_row, 3)

[[[6, 7, 8, 9, 10], 0], [[1, 2, 3, 4, 5], 0], [[90, 91, 92, 93, 94], 1]]

In [30]:
def predict_classification(X_train, y_train, test_row, num_neighbors):
    """Make a classification prediction by majority vote of the nearest neighbors.

    Args:
        X_train: Indexable collection of training feature rows.
        y_train: Labels, indexable in parallel with ``X_train``.
        test_row: Feature row to classify.
        num_neighbors: Number of nearest neighbors that vote.

    Returns:
        The label that occurs most often among the neighbors. When several
        labels tie, the one belonging to the nearest of the tied neighbors
        wins, so the result does not depend on set/hash iteration order.

    Raises:
        ValueError: If there are no neighbors to vote (for example when
            ``num_neighbors`` is 0 or the training set is empty).
    """
    neighbors = get_neighbors(X_train, y_train, test_row, num_neighbors)
    # Each neighbor is [row, label]; the label is the last element.
    output_values = [row[-1] for row in neighbors]
    if not output_values:
        raise ValueError("cannot predict a class without at least one neighbor")
    # Counter keeps first-seen order for equal counts, and the neighbors are
    # ordered nearest-first, so ties resolve toward the closest neighbor.
    return Counter(output_values).most_common(1)[0][0]

In [31]:
# Predict the label of the toy query row from a vote of its 3 nearest neighbors.
predict_classification(pseudo_x_train, pseudo_y_train, pseudo_test_row, 3)

0

In [34]:
def knn_from_scratch(X_train, X_test, y_train, y_test, num_neighbors):
    """Classify every test row with KNN and score the predictions.

    Args:
        X_train: Training feature rows.
        X_test: Feature rows to classify.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.
        num_neighbors: Number of nearest neighbors that vote per prediction.

    Returns:
        A ``(f1_binary, accuracy)`` tuple computed from ``y_test`` and the
        predicted labels.
    """
    predictions = [
        predict_classification(X_train, y_train, sample, num_neighbors)
        for sample in X_test
    ]
    return (
        f1_score(y_test, predictions, average="binary"),
        accuracy_score(y_test, predictions),
    )

In [36]:
# Testing: evaluate the from-scratch KNN with stratified k-fold cross-validation.

numFolds = 10
num_neighbors = 9
stratifiedKFold = StratifiedKFold(
    n_splits=numFolds, shuffle=True, random_state=86
)

X = X_class
y = y_class

# Keep each fold's scores separately and average once at the end, so no
# variable named "avg" ever holds a running sum.
fold_f1_scores = []
fold_accuracies = []

for train_index, test_index in stratifiedKFold.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    f1, acc = knn_from_scratch(X_train, X_test, y_train, y_test, num_neighbors)
    fold_f1_scores.append(f1)
    fold_accuracies.append(acc)

avgF1 = float(np.mean(fold_f1_scores))
avgAcc = float(np.mean(fold_accuracies))

print(f"Average accuracy: {avgAcc}, Average F1 score: {avgF1}")

	Fold 1:: Average accuracy: 1.0, Average F1 score: 1.0
	Fold 2:: Average accuracy: 2.0, Average F1 score: 2.0
	Fold 3:: Average accuracy: 3.0, Average F1 score: 3.0
	Fold 4:: Average accuracy: 4.0, Average F1 score: 4.0
	Fold 5:: Average accuracy: 5.0, Average F1 score: 5.0
	Fold 6:: Average accuracy: 6.0, Average F1 score: 6.0
	Fold 7:: Average accuracy: 7.0, Average F1 score: 7.0
	Fold 8:: Average accuracy: 8.0, Average F1 score: 8.0
	Fold 9:: Average accuracy: 9.0, Average F1 score: 9.0
	Fold 10:: Average accuracy: 10.0, Average F1 score: 10.0
Average accuracy: 1.0, Average F1 score: 1.0
