In [None]:
import numpy as np
import pandas as pd
from typing import Tuple
from scipy.stats import mode
from sklearn.metrics import confusion_matrix

In [None]:
# Location of the raw data file; it has no header row.
csv_path = "iris.csv"
# Preview only the first rows rather than rendering the entire table in the cell output.
pd.read_csv(csv_path, header=None).head()

In [None]:
def load_csv(csv_path:str) ->Tuple[pd.DataFrame,pd.DataFrame]:
    """Load the CSV, shuffle it, drop invalid rows and split features from labels.

    The file is read without a header row and every column is parsed as
    float; a cell holding just a pair of double quotes is treated as missing.
    Rows are shuffled with a fixed seed, then every row that contains a
    missing value or a value outside [0.0, 13.0] is removed.

    Parameters
    ----------
    csv_path : str
        Path to a comma-separated file without a header row.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is a DataFrame with the first four columns and
        ``y`` is a Series with the last column. Both carry a fresh
        0..n-1 index.
    """
    seed: int = 42
    lower_bound, upper_bound = 0.0, 13.0

    dataset = pd.read_csv(csv_path, delimiter=",", header=None, na_values="\"\"", dtype=float)
    # sample() returns a new frame; without the assignment the shuffle is silently lost.
    dataset = dataset.sample(frac=1, random_state=seed)
    # Cells failing the mask become NaN, so dropna() removes out-of-range rows as well
    # as rows that were already missing a value.
    in_range = (dataset >= lower_bound) & (dataset <= upper_bound)
    dataset = dataset[in_range].dropna()
    # drop=True: otherwise the old index is inserted as column 0 and ends up
    # inside the feature slice below.
    dataset = dataset.reset_index(drop=True)
    x,y = dataset.iloc[:,:4],dataset.iloc[:,-1]
    return x,y

In [None]:
# Load the dataset: x receives the feature columns, y the label column.
x,y = load_csv(csv_path)
# A bare tuple as the last expression makes the notebook display both objects.
x,y

In [None]:
# Overwrite every NaN cell in x with 3.5 (boolean-mask assignment, modifies x in place).
# NOTE(review): load_csv already calls dropna(), so this looks like a no-op — confirm.
x[x.isna()] = 3.5

In [None]:
# Display up to the first 150 rows of the feature frame.
x.head(n=150)

In [None]:
def train_test_split(features: pd.DataFrame, labels: pd.DataFrame, test_split_ratio: float) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Cut features and labels positionally into a leading train part and a trailing test part.

    No shuffling happens here: the first rows become the training set and the
    remaining ``int(len(features) * test_split_ratio)`` rows the test set.

    Parameters
    ----------
    features : rows to split; sliced by position on both axes.
    labels : one label per feature row, in the same order; sliced by position.
    test_split_ratio : fraction of the rows that goes to the test set.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test)``.
    """
    n_rows = len(features)
    n_test = int(n_rows * test_split_ratio)
    n_train = n_rows - n_test
    assert n_train + n_test == n_rows, "Size mismatch!"

    # The same two positional windows are applied to features and labels.
    train_rows = slice(None, n_train)
    test_rows = slice(n_train, n_train + n_test)
    return (
        features.iloc[train_rows, :],
        labels.iloc[train_rows],
        features.iloc[test_rows, :],
        labels.iloc[test_rows],
    )


In [None]:
# Hold out the last 20% of the rows as the test set; the rest is used for training.
x_train, y_train, x_test, y_test = train_test_split(x, y, 0.2)

In [None]:
# Sanity-check the sizes of the four splits.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
def euclidean(points: pd.DataFrame,element_of_x: pd.DataFrame) ->  pd.DataFrame:
    """Euclidean distance from every row of ``points`` to ``element_of_x``.

    Parameters
    ----------
    points : one observation per row.
    element_of_x : the reference observation; in this notebook it is a row
        taken from ``DataFrame.iterrows()``.

    Returns
    -------
    One distance per row of ``points`` (squared differences are summed along
    axis 1 before the square root is taken).
    """
    offsets = points - element_of_x
    squared_norms = (offsets ** 2).sum(axis=1)
    return squared_norms ** 0.5

In [None]:
 # Sanity check for euclidean(): for each test row, print its distance to every training row.
 # NOTE(review): this prints one full Series per test row; consider previewing a single row instead.
 for index, row in x_test.iterrows():
    distances = euclidean(x_train,row)
    print(distances)

In [None]:
def predict(x_train: pd.DataFrame, y_train: pd.DataFrame, x_test: pd.DataFrame, k: int) -> pd.DataFrame:
    """Classify every row of ``x_test`` by majority vote among its k nearest training rows.

    Distances come from ``euclidean``. Neighbours are ordered by
    ``(distance, label)``, so equal distances are broken by the smaller label.
    When several labels are equally frequent among the k neighbours, the
    smallest of them is chosen.

    Parameters
    ----------
    x_train : training features, one observation per row.
    y_train : training labels, positionally aligned with ``x_train``.
    x_test : observations to classify.
    k : number of neighbours that vote; must be at least 1.

    Returns
    -------
    A Series with one predicted label per test row, carrying the index of
    ``x_test`` so it lines up with the matching test labels.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, the training set is empty, or
        ``x_train`` and ``y_train`` differ in length.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(x_train) == 0:
        raise ValueError("x_train must contain at least one row")
    if len(x_train) != len(y_train):
        raise ValueError("x_train and y_train must have the same length")

    # Labels do not change between test rows, so materialise them once.
    train_labels = list(y_train)
    predictions = []
    for _, row in x_test.iterrows():
        distances = euclidean(x_train, row)
        nearest = sorted(zip(distances, train_labels))[:k]
        votes = pd.Series([label for _, label in nearest])
        # mode() returns all most-frequent labels in sorted order; take the first.
        predictions.append(votes.mode().iloc[0])
    return pd.Series(predictions, index=x_test.index)

In [None]:
# Classify every test row using its 3 nearest training neighbours.
y_preds = predict(x_train, y_train, x_test, 3)
y_preds

In [None]:
def accuracy(y_test:pd.DataFrame,y_preds:pd.DataFrame) -> float:
    """Percentage of positions at which the prediction equals the true label.

    Both inputs are re-indexed from zero first, so they are compared by
    position rather than by their original index labels.

    Parameters
    ----------
    y_test : true labels.
    y_preds : predicted labels, in the same order as ``y_test``.

    Returns
    -------
    Share of matching positions, on a 0-100 scale.
    """
    expected = y_test.reset_index(drop=True)
    predicted = y_preds.reset_index(drop=True)
    n_matches = (expected == predicted).sum()
    return n_matches / len(y_test) * 100

In [None]:
# Percentage of test labels that the k=3 predictions got right.
accuracy(y_test, y_preds)

In [None]:
def plot_confusion_matrix(y_test:pd.DataFrame,y_preds:pd.DataFrame) -> pd.DataFrame:
    """Return the confusion matrix of true labels versus predicted labels.

    Despite the name nothing is drawn: the result of
    ``confusion_matrix(y_test, y_preds)`` is handed back unchanged.

    Parameters
    ----------
    y_test : true labels.
    y_preds : predicted labels, in the same order as ``y_test``.
    """
    return confusion_matrix(y_test, y_preds)

In [None]:
# Confusion matrix of the true test labels against the predicted labels.
plot_confusion_matrix(y_test, y_preds)

In [None]:
def best_k(x_test: pd.DataFrame, k, y_test: pd.DataFrame, y_preds: pd.DataFrame, x_train: pd.DataFrame, train_labels=None, max_k: int = 20) -> Tuple[int, float]:
    """Try k = 1..max_k and report the k with the highest test accuracy.

    For every candidate k the test set is re-classified with ``predict`` and
    those fresh predictions are scored with ``accuracy``.

    Parameters
    ----------
    x_test : observations to classify.
    k : unused; kept so existing calls keep working.
    y_test : true labels for ``x_test``.
    y_preds : unused; kept so existing calls keep working. Each candidate k
        is scored on its own predictions, not on this argument.
    x_train : training features.
    train_labels : training labels aligned with ``x_train``. When omitted,
        the notebook-level ``y_train`` is used.
    max_k : largest k to try (inclusive).

    Returns
    -------
    tuple
        ``(best k, best accuracy rounded to two decimals)``. On ties the
        smallest k wins; if no k scores above 0 the result is ``(0, 0.0)``.
    """
    if train_labels is None:
        # The original signature carries no training labels; fall back to the
        # notebook-level y_train.
        train_labels = y_train

    winning_k = 0
    winning_accuracy = 0.0
    for candidate_k in range(1, max_k + 1):
        candidate_preds = predict(x_train, train_labels, x_test, candidate_k)
        candidate_accuracy = accuracy(y_test, candidate_preds)
        # Strict comparison keeps the smallest k when several candidates tie.
        if winning_accuracy < candidate_accuracy:
            winning_k = candidate_k
            winning_accuracy = candidate_accuracy

    return winning_k, round(winning_accuracy, 2)

In [None]:
# Search for the best-scoring k; the result is a (k, accuracy) tuple.
best = best_k(x_test, 1, y_test, y_preds, x_train)

In [None]:
# Display the (k, accuracy) pair returned by best_k.
best