In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import csv

In [3]:
def read_csv(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        data = list(reader)
        columns = data[0]
        data = data[1:]
        return data, columns

data, columns = read_csv('./Iris.csv')
print(columns)
for row in data:
    print(row)

['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
['1', '5.1', '3.5', '1.4', '0.2', 'Iris-setosa']
['2', '4.9', '3.0', '1.4', '0.2', 'Iris-setosa']
['3', '4.7', '3.2', '1.3', '0.2', 'Iris-setosa']
['4', '4.6', '3.1', '1.5', '0.2', 'Iris-setosa']
['5', '5.0', '3.6', '1.4', '0.2', 'Iris-setosa']
['6', '5.4', '3.9', '1.7', '0.4', 'Iris-setosa']
['7', '4.6', '3.4', '1.4', '0.3', 'Iris-setosa']
['8', '5.0', '3.4', '1.5', '0.2', 'Iris-setosa']
['9', '4.4', '2.9', '1.4', '0.2', 'Iris-setosa']
['10', '4.9', '3.1', '1.5', '0.1', 'Iris-setosa']
['11', '5.4', '3.7', '1.5', '0.2', 'Iris-setosa']
['12', '4.8', '3.4', '1.6', '0.2', 'Iris-setosa']
['13', '4.8', '3.0', '1.4', '0.1', 'Iris-setosa']
['14', '4.3', '3.0', '1.1', '0.1', 'Iris-setosa']
['15', '5.8', '4.0', '1.2', '0.2', 'Iris-setosa']
['16', '5.7', '4.4', '1.5', '0.4', 'Iris-setosa']
['17', '5.4', '3.9', '1.3', '0.4', 'Iris-setosa']
['18', '5.1', '3.5', '1.4', '0.3', 'Iris-setosa']
['19', '5.7', '3.8', '1.7

In [4]:
def train_test_splitter(X, y, test_size = 0.2):
    indices = list(range(len(X)))
    random.shuffle(indices)
    
    test_set_size = int(len(X) * test_size)
    
    test_indices = indices[:test_set_size]
    train_indices = indices[test_set_size:]
    
    X_train = np.array([X[i] for i in train_indices])
    X_test = np.array([X[i] for i in test_indices])
    y_train = np.array([y[i] for i in train_indices])
    y_test = np.array([y[i] for i in test_indices])
    
    return X_test, y_train, X_train, y_test

In [9]:
X = [row[:-1] for row in data]  # All columns except the last one
y = [row[-1] for row in data]   # Only the last column

print("First few rows of X before conversion:", X[:5])
print("First few rows of y before conversion:", y[:5])

X = [[float(value) for value in row] for row in X]
y = [(value) for value in y]

print("First few rows of X after conversion:", X[:5])
print("First few rows of y after conversion:", y[:5])

X_test, y_train, X_train, y_test = train_test_splitter(X,y,1/3)

print("X_test:", X_test)
print("y_test:", y_test)
print("X_train:", X_train)
np.set_printoptions(suppress=True)  # Suppress scientific notation for small numbers
print("X_train after suppression:",X_train)
print("y_train:", y_train)

First few rows of X before conversion: [['1', '5.1', '3.5', '1.4', '0.2'], ['2', '4.9', '3.0', '1.4', '0.2'], ['3', '4.7', '3.2', '1.3', '0.2'], ['4', '4.6', '3.1', '1.5', '0.2'], ['5', '5.0', '3.6', '1.4', '0.2']]
First few rows of y before conversion: ['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
First few rows of X after conversion: [[1.0, 5.1, 3.5, 1.4, 0.2], [2.0, 4.9, 3.0, 1.4, 0.2], [3.0, 4.7, 3.2, 1.3, 0.2], [4.0, 4.6, 3.1, 1.5, 0.2], [5.0, 5.0, 3.6, 1.4, 0.2]]
First few rows of y after conversion: ['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
X_test: [[1.22e+02 5.60e+00 2.80e+00 4.90e+00 2.00e+00]
 [1.00e+01 4.90e+00 3.10e+00 1.50e+00 1.00e-01]
 [4.50e+01 5.10e+00 3.80e+00 1.90e+00 4.00e-01]
 [1.45e+02 6.70e+00 3.30e+00 5.70e+00 2.50e+00]
 [1.90e+01 5.70e+00 3.80e+00 1.70e+00 3.00e-01]
 [1.14e+02 5.70e+00 2.50e+00 5.00e+00 2.00e+00]
 [1.06e+02 7.60e+00 3.00e+00 6.60e+00 2.10e+00]
 [1.03e+02 7.10e+00 3.00e+00 5.90e+00 

In [12]:
def euclidean_distance(x1, x2):
    distance = np.sqrt(np.sum((x1-x2)**2))
    return distance 

In [15]:
class KNN:
    def __init__(self, k=3):
        self.k = k
        
    def fit(self, X, y):
        self.X_train = X
        self.y_train = y
        
    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return predictions
    
    def _predict(self, x):
        # compute the distance
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
        
        # get closest k
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        
        # manually count occurrences of each label
        label_counts = {}
        for label in k_nearest_labels:
            if label in label_counts:
                label_counts[label] += 1
            else:
                label_counts[label] = 1
        
        # find the most common label
        most_common_label = max(label_counts, key=label_counts.get)
        return most_common_label

In [17]:
clf = KNN(k=5)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print(predictions, "\n")

acc = np.sum(predictions == y_test) / len(y_test)
print(acc * 100,'%')

['Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor'] 

100.0 %
