In [None]:
import numpy as np
import matplotlib.pyplot as plt
import random
import csv

In [None]:
def read_csv(file_path):
    """Read a CSV file and split it into its data rows and its header row.

    Parameters
    ----------
    file_path : str or path-like
        Path of the CSV file. The first non-blank line is taken as the header.

    Returns
    -------
    data : list[list[str]]
        All records after the header, each as a list of raw string fields.
        Blank lines are skipped so that callers indexing into a record
        (e.g. ``row[-1]``) never hit an empty list.
    columns : list[str]
        The header fields, or an empty list when the file has no rows at all.

    Note that the return order is ``(data, columns)``.
    """
    # newline='' hands newline handling to the csv module, which is what the
    # csv documentation asks for; otherwise newlines embedded in quoted
    # fields can be misinterpreted.
    with open(file_path, 'r', newline='') as file:
        rows = [row for row in csv.reader(file) if row]

    # An empty (or blank-only) file has no header to peel off.
    if not rows:
        return [], []

    columns = rows[0]
    data = rows[1:]
    return data, columns

# Load the dataset: `columns` is the header row and `data` holds the remaining
# records, each one a list of strings as produced by csv.reader in read_csv.
data, columns = read_csv('./Iris.csv')

# Show the header, the record count and a short preview instead of echoing
# every record, which would bury the rest of the notebook under raw output.
print(columns)
print("Number of rows:", len(data))
for row in data[:5]:
    print(row)

In [None]:
def train_test_splitter(X, y, test_size = 0.2, random_state = None):
    """Randomly partition samples and labels into a training and a test set.

    Parameters
    ----------
    X : sequence
        Samples; ``X[i]`` is the i-th sample.
    y : sequence
        Labels; ``y[i]`` belongs to ``X[i]``. Must have the same length as X.
    test_size : float, default 0.2
        Fraction of the samples (between 0 and 1) placed in the test set.
        The test set holds ``int(len(X) * test_size)`` samples.
    random_state : optional, default None
        Seed for a private ``random.Random`` generator, giving a reproducible
        split. When None, the module-level ``random.shuffle`` is used, so a
        caller's ``random.seed(...)`` still controls the result.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_test, y_train, X_train, y_test)`` -- NOTE this is not the
        conventional train-first order; unpack in exactly this order.

    Raises
    ------
    ValueError
        If X and y differ in length or test_size lies outside [0, 1].
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(
            "X and y must have the same length, got {} and {}".format(n_samples, len(y))
        )
    # A negative fraction would turn the slices below into "all but the last n",
    # silently swapping the roles of the two sets.
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1, got {!r}".format(test_size))

    indices = list(range(n_samples))
    if random_state is None:
        random.shuffle(indices)
    else:
        random.Random(random_state).shuffle(indices)

    test_set_size = int(n_samples * test_size)
    test_indices = indices[:test_set_size]
    train_indices = indices[test_set_size:]

    X_train = np.array([X[i] for i in train_indices])
    X_test = np.array([X[i] for i in test_indices])
    y_train = np.array([y[i] for i in train_indices])
    y_test = np.array([y[i] for i in test_indices])

    return X_test, y_train, X_train, y_test

In [None]:
# Skip a leading record-identifier column when the header declares one
# (presumably a running row number -- confirm against the file). Such a column
# says nothing about the sample itself, yet it would take part in the Euclidean
# distances computed by the classifier like any other feature.
feature_start = 1 if columns and columns[0].strip().lower() == "id" else 0

X = [row[feature_start:-1] for row in data]  # Feature columns (everything but the label)
y = [row[-1] for row in data]                # Only the last column (the label)

print("First few rows of X before conversion:", X[:5])
print("First few rows of y before conversion:", y[:5])

# csv.reader yields strings, so the features are converted to floats.
# The labels are used as they are; no conversion is applied to them.
X = [[float(value) for value in row] for row in X]

print("First few rows of X after conversion:", X[:5])
print("First few rows of y after conversion:", y[:5])

# Seed the module-level RNG that train_test_splitter shuffles with, so the split
# (and every number derived from it) is identical on each Restart & Run All.
random.seed(42)

# NOTE: train_test_splitter returns (X_test, y_train, X_train, y_test);
# the unpacking below has to follow that order.
X_test, y_train, X_train, y_test = train_test_splitter(X,y,1/3)

np.set_printoptions(suppress=True)  # Suppress scientific notation for small numbers

# Report shapes plus a short preview instead of dumping the full arrays.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("X_test (first 5 rows):", X_test[:5])
print("y_test (first 5):", y_test[:5])
print("X_train (first 5 rows):", X_train[:5])
print("y_train (first 5):", y_train[:5])

In [None]:
def euclidean_distance(x1, x2):
    """Return the Euclidean (L2) distance between two points.

    Parameters
    ----------
    x1, x2 : array-like of numbers
        Coordinates of the two points (lists, tuples or numpy arrays) with
        shapes that numpy can broadcast against each other.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((x1 - x2) ** 2))`` over all elements.
    """
    # Coerce to float arrays: plain lists do not support `-`, and narrow or
    # unsigned integer arrays would wrap around when subtracted and squared.
    a = np.asarray(x1, dtype=float)
    b = np.asarray(x2, dtype=float)
    return np.sqrt(np.sum((a - b) ** 2))

In [None]:
class KNN:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Usage: construct with the number of neighbours ``k``, call ``fit`` with the
    training samples and labels, then ``predict`` on new samples. ``fit`` must
    be called before ``predict``.
    """

    def __init__(self, k=3):
        """Store the number of neighbours that vote on each prediction.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer.
        """
        # Reject bad values here; otherwise k <= 0 only surfaces at predict
        # time as an unrelated-looking error from max() on an empty vote.
        if not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError("k must be a positive integer, got {!r}".format(k))
        self.k = k

    def fit(self, X, y):
        """Memorise the training set (KNN does no other training work).

        Parameters
        ----------
        X : array-like of numbers
            Training samples, one per entry along the first axis.
        y : sequence
            Label of each training sample; labels must be hashable.

        Raises
        ------
        ValueError
            If X is empty or X and y differ in length.
        """
        X = np.asarray(X, dtype=float)
        if X.ndim == 0 or len(X) == 0:
            raise ValueError("X must contain at least one training sample")
        if len(y) != len(X):
            raise ValueError(
                "X and y must have the same length, got {} and {}".format(len(X), len(y))
            )

        # One flat feature row per sample so all distances to a query point
        # can be computed in a single vectorised expression.
        self.X_train = X.reshape(len(X), -1)
        # Labels are kept exactly as given so predictions have the caller's label type.
        self.y_train = y

    def predict(self, X):
        """Return a list with the predicted label of every sample in X."""
        predictions = [self._predict(x) for x in np.asarray(X, dtype=float)]
        return predictions

    def _predict(self, x):
        """Predict the label of a single sample by majority vote of its k nearest neighbours."""
        # compute the distance to every training sample at once
        x = np.asarray(x, dtype=float).reshape(-1)
        distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))

        # get closest k; a stable sort keeps equidistant samples in training order
        k_indices = np.argsort(distances, kind="stable")[:self.k]

        # count occurrences of each label, visiting neighbours nearest-first
        label_counts = {}
        for i in k_indices:
            label = self.y_train[i]
            label_counts[label] = label_counts.get(label, 0) + 1

        # most common label; on a tied vote max() returns the label that was
        # inserted first, i.e. the tied label with the closest neighbour
        most_common_label = max(label_counts, key=label_counts.get)
        return most_common_label

In [None]:
# Build a 5-nearest-neighbour classifier from the training split and label
# every sample of the held-out test split.
clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(predictions, "\n")

# Accuracy: number of test samples whose predicted label equals the true
# label, divided by the number of test samples.
n_correct = np.sum(np.asarray(predictions) == y_test)
acc = n_correct / len(y_test)
print(acc * 100,'%')