# KNN

## Univariate

In [176]:
import pandas as pd
import numpy as np
import math
from collections import Counter

In [177]:
# Hand-entered sample rows. Each row holds four numeric measurements followed
# by the class label; column names are attached when the DataFrame is built.
df = [
    [5.1, 3.5, 1.4, 0.2, "setosa"],
    [4.9, 3.0, 1.4, 0.2, "setosa"],
    [4.7, 3.2, 1.3, 0.2, "setosa"],
    [4.6, 3.1, 1.5, 0.2, "setosa"],
    [5.0, 3.6, 1.4, 0.2, "setosa"],
    [5.4, 3.9, 1.7, 0.4, "setosa"],
    [5.8, 4.0, 1.2, 0.2, "setosa"],
    [6.0, 2.2, 4.0, 1.5, "versicolor"],
    [6.1, 2.8, 4.7, 1.4, "versicolor"],
    [5.9, 3.0, 4.2, 1.5, "versicolor"],
    [6.7, 3.1, 4.4, 1.4, "versicolor"],
    [6.3, 2.5, 4.9, 1.5, "versicolor"],
    [6.5, 3.0, 5.1, 2.0, "virginica"],
    [6.2, 2.8, 4.5, 1.5, "versicolor"],
    [6.4, 2.9, 4.3, 1.3, "versicolor"],
    [5.5, 2.4, 4.0, 1.3, "versicolor"],
    [5.7, 2.8, 4.1, 1.3, "versicolor"],
    [5.8, 2.7, 5.1, 1.9, "virginica"],
    [6.9, 3.1, 5.4, 2.3, "virginica"],
    [6.0, 2.2, 5.0, 1.5, "virginica"],
    [6.3, 2.3, 5.6, 2.4, "virginica"],
    [6.1, 2.8, 5.6, 2.4, "virginica"],
    [5.6, 2.9, 3.6, 1.3, "versicolor"],
    [5.8, 2.7, 4.1, 1.0, "versicolor"],
    [6.0, 2.9, 4.5, 1.5, "versicolor"],
    [6.1, 2.6, 4.7, 1.4, "versicolor"],
    [6.5, 3.0, 5.2, 2.0, "virginica"],
    [6.2, 2.9, 5.4, 2.3, "virginica"],
    [5.9, 3.0, 5.1, 1.8, "virginica"],
    [6.3, 2.7, 5.6, 2.1, "virginica"]
]


In [178]:
# Wrap the raw rows in a DataFrame, rebinding `df` from the list to the frame.
# NOTE(review): 'PetalWdith' looks like a typo for 'PetalWidth'. The visible
# code only accesses columns by position, so renaming would only change the
# printed headers — confirm before fixing.
df = pd.DataFrame(df, columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWdith', 'Class'])

In [179]:
# Preview the first rows to check that the columns line up with the data.
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWdith   Class
0          5.1         3.5          1.4         0.2  setosa
1          4.9         3.0          1.4         0.2  setosa
2          4.7         3.2          1.3         0.2  setosa
3          4.6         3.1          1.5         0.2  setosa
4          5.0         3.6          1.4         0.2  setosa


In [180]:
# Report the frame size as (rows, columns).
print(df.shape)

(30, 5)


In [181]:
# Separate the features (every column but the last) from the label (last column).
# NOTE(review): X and y are taken before the frame is shuffled and are not used
# by the cross-validation code visible in this file, which works on `data_list`.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [182]:
# Preview the feature frame and the label series.
print(X.head())
print(y.head())

   SepalLength  SepalWidth  PetalLength  PetalWdith
0          5.1         3.5          1.4         0.2
1          4.9         3.0          1.4         0.2
2          4.7         3.2          1.3         0.2
3          4.6         3.1          1.5         0.2
4          5.0         3.6          1.4         0.2
0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: Class, dtype: object


In [183]:
# Shuffle every row with a fixed seed (so the order is reproducible) and
# renumber the index from 0, discarding the old index.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [184]:
# Display the shuffled frame as the cell output.
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWdith,Class
0,6.2,2.9,5.4,2.3,virginica
1,5.5,2.4,4.0,1.3,versicolor
2,5.8,2.7,4.1,1.0,versicolor
3,5.8,2.7,5.1,1.9,virginica
4,6.1,2.8,4.7,1.4,versicolor
5,5.9,3.0,4.2,1.5,versicolor
6,5.9,3.0,5.1,1.8,virginica
7,6.0,2.9,4.5,1.5,versicolor
8,6.5,3.0,5.1,2.0,virginica
9,5.1,3.5,1.4,0.2,setosa


In [185]:
# Convert the frame to a plain list of rows ([features..., label]) for the
# list-based helpers defined in this section.
data_list = df.values.tolist()

In [186]:

def split_folds(data, num_folds=5):
    """Split a sequence into `num_folds` contiguous folds without losing items.

    When `len(data)` is not divisible by `num_folds`, the first
    `len(data) % num_folds` folds each receive one extra item, so every item
    lands in exactly one fold. For evenly divisible input each fold has
    `len(data) // num_folds` items.

    Args:
        data: sliceable sequence (e.g. a list of rows).
        num_folds: number of folds to produce; must be at least 1.

    Returns:
        A list of `num_folds` slices of `data`, in the original order.

    Raises:
        ValueError: if `num_folds` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError("num_folds must be at least 1")

    base_size, extra = divmod(len(data), num_folds)
    folds = []
    start = 0
    for fold_index in range(num_folds):
        # The first `extra` folds absorb the remainder, one item each.
        stop = start + base_size + (1 if fold_index < extra else 0)
        folds.append(data[start:stop])
        start = stop
    return folds

In [187]:
def euclidean(p1, p2):
    """Return the Euclidean (L2) distance between two points.

    Args:
        p1: sequence of numeric coordinates.
        p2: sequence of numeric coordinates with the same length as `p1`.

    Returns:
        The square root of the summed squared coordinate differences.

    Raises:
        ValueError: if the points have different numbers of coordinates.
            Without this check, extra coordinates in `p2` would be silently
            ignored and the distance would be wrong.
    """
    if len(p1) != len(p2):
        raise ValueError(
            f"points must have the same dimension, got {len(p1)} and {len(p2)}"
        )

    summ = 0
    for a, b in zip(p1, p2):
        summ += (a - b) ** 2

    return math.sqrt(summ)

In [188]:
def knn(train, test, k):
    """Classify one row by majority vote among its k nearest training rows.

    Rows are laid out as `[feature, ..., label]`. Distances are Euclidean and
    computed on everything except the last element of each row.

    Args:
        train: non-empty sequence of training rows.
        test: the row to classify; its last element is excluded from the
            distance computation and otherwise unused.
        k: number of neighbours that vote; must be at least 1. If it exceeds
            the number of training rows, every training row votes.

    Returns:
        The most frequent label among the k nearest rows.

    Raises:
        ValueError: if `k` is smaller than 1 or `train` is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(train) == 0:
        raise ValueError("train must contain at least one row")

    query = test[:-1]
    distances = [(euclidean(query, row[:-1]), row[-1]) for row in train]
    # Tuples sort by distance first; rows at exactly the same distance are
    # ordered by comparing their labels.
    distances.sort()
    k_nearest = [label for _, label in distances[:k]]
    return Counter(k_nearest).most_common(1)[0][0]

In [189]:
def cross_validate(data, k_values, num_folds=5):
    """Estimate KNN accuracy for several k values with k-fold cross-validation.

    The data is split once into `num_folds` folds. For each candidate k, every
    fold serves as the test set in turn while the remaining folds are merged
    into the training set. The per-fold accuracies for each k are printed.

    Args:
        data: list of rows laid out as `[feature, ..., label]`.
        k_values: iterable of neighbour counts to evaluate.
        num_folds: number of folds to split `data` into.

    Returns:
        A list of `(k, mean_accuracy)` tuples, in the order of `k_values`.
    """
    folds = split_folds(data, num_folds)
    results = []

    for k in k_values:
        accuracies = []
        for held_out in range(num_folds):
            test_set = folds[held_out]

            # Everything outside the held-out fold becomes training data.
            train_set = []
            for position, fold in enumerate(folds):
                if position != held_out:
                    train_set.extend(fold)

            hits = 0
            for row in test_set:
                if knn(train_set, row, k) == row[-1]:
                    hits += 1
            accuracies.append(hits / len(test_set))

        print(accuracies)
        results.append((k, sum(accuracies) / len(accuracies)))

    return results


In [190]:
# Candidate neighbour counts (k) to compare during cross-validation.
k_values = [1, 3, 5, 7, 9]

In [191]:
# Score every candidate k using the default number of folds, then show the
# resulting (k, mean accuracy) pairs.
results = cross_validate(data_list, k_values)
print(results)

[1.0, 1.0, 1.0, 1.0, 0.8333333333333334]
[1.0, 1.0, 1.0, 1.0, 0.8333333333333334]
[1.0, 1.0, 1.0, 1.0, 0.8333333333333334]
[1.0, 0.8333333333333334, 1.0, 1.0, 0.8333333333333334]
[1.0, 1.0, 0.3333333333333333, 1.0, 0.8333333333333334]
[(1, 0.9666666666666666), (3, 0.9666666666666666), (5, 0.9666666666666666), (7, 0.9333333333333333), (9, 0.8333333333333334)]


In [192]:
# Pick the k with the highest mean accuracy; on ties the `-x[0]` term makes
# the smaller k win.
best_k, _ = max(results, key=lambda x: (x[1], -x[0]))

In [193]:
# Positional hold-out split of the shuffled frame: the first 80% of the rows
# are used for training, the remainder for testing.
train_size = int(0.8 * len(df))
train_df = df.iloc[:train_size]
test_df = df.iloc[train_size:]

In [194]:
# Convert both partitions to lists of rows, the layout `knn` expects.
train_data = train_df.values.tolist()
test_data = test_df.values.tolist()

In [195]:
# Fraction of held-out rows that the chosen k classifies correctly.
# NOTE(review): best_k was selected by cross-validating on every row of `df`,
# which includes these test rows, so this is not a fully independent estimate.
# `test_accuracy` is also never printed in the visible cells.
correct = sum(knn(train_data, row, k=best_k) == row[-1] for row in test_data)
test_accuracy = correct / len(test_data)

In [196]:
# Show the neighbour count selected by cross-validation.
print(best_k)

1


## KNN with normalized features

In [197]:
import pandas as pd
import numpy as np
from collections import Counter

In [198]:
# Hand-entered sample rows for the normalized-KNN section. Each row holds four
# numeric measurements followed by the class label; column names are attached
# when the DataFrame is built.
df = [
    [5.2, 3.4, 1.5, 0.2, "setosa"],
    [4.8, 3.1, 1.6, 0.3, "setosa"],
    [5.0, 3.2, 1.2, 0.2, "setosa"],
    [5.3, 3.7, 1.4, 0.3, "setosa"],
    [4.9, 3.0, 1.5, 0.1, "setosa"],
    [5.1, 3.5, 1.3, 0.3, "setosa"],
    [5.4, 3.4, 1.7, 0.2, "setosa"],
    [5.0, 3.3, 1.4, 0.2, "setosa"],
    [6.0, 2.7, 4.2, 1.3, "versicolor"],
    [6.2, 2.9, 4.3, 1.3, "versicolor"],
    [5.7, 2.6, 3.5, 1.0, "versicolor"],
    [5.8, 2.7, 4.1, 1.2, "versicolor"],
    [6.1, 3.0, 4.6, 1.4, "versicolor"],
    [5.6, 2.8, 4.0, 1.3, "versicolor"],
    [6.3, 2.5, 4.9, 1.5, "versicolor"],
    [6.0, 3.4, 4.5, 1.6, "versicolor"],
    [5.9, 3.0, 4.2, 1.5, "versicolor"],
    [6.4, 2.8, 5.0, 1.7, "versicolor"],
    [5.5, 2.5, 4.0, 1.2, "versicolor"],
    [6.2, 2.2, 4.8, 1.8, "versicolor"],
    [6.5, 3.0, 5.2, 2.0, "virginica"],
    [6.9, 3.1, 5.4, 2.1, "virginica"],
    [6.7, 3.0, 5.8, 2.2, "virginica"],
    [7.1, 3.0, 5.9, 2.1, "virginica"],
    [6.3, 2.9, 5.6, 1.8, "virginica"],
    [6.6, 2.8, 5.3, 2.0, "virginica"],
    [7.0, 3.2, 5.7, 2.3, "virginica"],
    [6.5, 3.2, 5.1, 2.0, "virginica"],
    [6.8, 3.0, 5.5, 2.1, "virginica"],
    [6.4, 2.9, 5.6, 2.2, "virginica"],
    [6.2, 3.4, 5.4, 2.3, "virginica"],
    [6.9, 3.1, 5.1, 2.3, "virginica"],
    [7.2, 3.2, 6.0, 2.2, "virginica"],
    [6.3, 2.8, 5.7, 1.9, "virginica"],
    [6.1, 3.0, 5.5, 1.8, "virginica"],
    [6.7, 3.3, 5.7, 2.1, "virginica"],
    [6.4, 3.1, 5.5, 1.8, "virginica"],
    [6.8, 3.2, 5.9, 2.3, "virginica"],
    [7.3, 2.9, 6.1, 2.5, "virginica"],
    [6.5, 3.0, 5.8, 2.2, "virginica"]
]


In [199]:
# Wrap the raw rows in a DataFrame, rebinding `df` from the list to the frame.
df = pd.DataFrame(df, columns=['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Class'])

In [201]:
# Preview the first rows before scaling.
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWidth   Class
0          5.2         3.4          1.5         0.2  setosa
1          4.8         3.1          1.6         0.3  setosa
2          5.0         3.2          1.2         0.2  setosa
3          5.3         3.7          1.4         0.3  setosa
4          4.9         3.0          1.5         0.1  setosa


In [202]:
def normalize(df):
    """Min-max scale every column except the last into the range [0, 1].

    The last column is treated as the label and is copied through unchanged.
    A column whose values are all equal has zero range; it is set to 0.0
    rather than dividing by zero, which would fill the column with NaN.

    Args:
        df: DataFrame whose columns are numeric features followed by one
            label column.

    Returns:
        A new DataFrame with the scaled features; `df` itself is not modified.
    """
    df_norm = df.copy()
    for col in df.columns[:-1]:
        lowest = df[col].min()
        span = df[col].max() - lowest
        if span == 0:
            # Constant feature: carries no information, so pin it to 0.
            df_norm[col] = 0.0
        else:
            df_norm[col] = (df[col] - lowest) / span
    return df_norm

In [203]:
# Replace the raw frame with its min-max scaled version.
# NOTE(review): the min/max come from all rows; if the data is split into
# folds afterwards, test rows influence the scaling — confirm this is acceptable.
df = normalize(df)

In [204]:
# Preview the first rows after scaling.
print(df.head())

   SepalLength  SepalWidth  PetalLength  PetalWidth   Class
0         0.16    0.800000     0.061224    0.041667  setosa
1         0.00    0.600000     0.081633    0.083333  setosa
2         0.08    0.666667     0.000000    0.041667  setosa
3         0.20    1.000000     0.040816    0.083333  setosa
4         0.04    0.533333     0.061224    0.000000  setosa


In [205]:
def manhattan_distance(a, b):
    """Return the Manhattan (L1) distance: the sum of element-wise |a - b|.

    Both arguments must support element-wise subtraction, e.g. NumPy arrays
    or pandas Series.
    """
    gaps = np.abs(a - b)
    return np.sum(gaps)

In [206]:
def knn_predict(train_X, train_y, test_x, k):
    """Predict one sample's label by majority vote of its k nearest rows.

    Distances are Manhattan distances between `test_x` and each row of
    `train_X`. Rows are visited by position, so `train_X` and `train_y` must
    be in the same order.

    Args:
        train_X: DataFrame of training features, one row per sample.
        train_y: Series of training labels, positionally matching `train_X`.
        test_x: feature Series of the sample to classify.
        k: number of neighbours that vote; must be at least 1. If it exceeds
            the number of training rows, every training row votes.

    Returns:
        The most frequent label among the k nearest training rows.

    Raises:
        ValueError: if `k` is smaller than 1 or the training set is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if len(train_X) == 0:
        raise ValueError("training set must contain at least one row")

    distances = [
        (manhattan_distance(train_X.iloc[pos], test_x), train_y.iloc[pos])
        for pos in range(len(train_X))
    ]
    # Sort on distance only; the stable sort keeps training order among ties.
    distances.sort(key=lambda pair: pair[0])
    top_k = [label for _, label in distances[:k]]
    return Counter(top_k).most_common(1)[0][0]

In [207]:
def k_fold_split(df, k=5):
    """Shuffle a DataFrame with a fixed seed and split it into k folds.

    Fold sizes follow the same rule as `np.array_split`: the first
    `len(df) % k` folds get one extra row, so every row lands in exactly one
    fold. The split uses positional slicing instead of calling
    `np.array_split` on the DataFrame, because that call relies on
    `DataFrame.swapaxes`, which pandas has deprecated.

    Args:
        df: DataFrame to split.
        k: number of folds; must be at least 1.

    Returns:
        A list of k DataFrames, indexed by the shuffled frame's fresh
        0..n-1 index.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("number sections must be larger than 0.")

    df_shuffled = df.sample(frac=1, random_state=1).reset_index(drop=True)
    base_size, extra = divmod(len(df_shuffled), k)
    parts = []
    start = 0
    for fold_index in range(k):
        # The first `extra` folds absorb the remainder, one row each.
        stop = start + base_size + (1 if fold_index < extra else 0)
        parts.append(df_shuffled.iloc[start:stop])
        start = stop
    return parts

In [208]:
def cross_validate(df, k_values, folds=5):
    """Estimate KNN accuracy for several k values with k-fold cross-validation.

    The frame is shuffled and split once. For each candidate k, every fold is
    held out in turn while the remaining folds are concatenated into the
    training set. The last column is used as the label and all other columns
    as features.

    Args:
        df: DataFrame of feature columns followed by one label column.
        k_values: iterable of neighbour counts to evaluate.
        folds: number of folds.

    Returns:
        A dict mapping each k to its mean fold accuracy, rounded to 4 decimals.
    """
    split_data = k_fold_split(df, folds)
    results = {}

    for k in k_values:
        fold_scores = []
        for held_out in range(folds):
            test_df = split_data[held_out]
            remaining = [split_data[pos] for pos in range(folds) if pos != held_out]
            train_df = pd.concat(remaining)

            # Features are every column but the last; the label is the last.
            train_X = train_df.iloc[:, :-1]
            train_y = train_df.iloc[:, -1]
            test_X = test_df.iloc[:, :-1]
            test_y = test_df.iloc[:, -1]

            n_test = len(test_df)
            hits = sum(
                1
                for row in range(n_test)
                if knn_predict(train_X, train_y, test_X.iloc[row], k) == test_y.iloc[row]
            )
            fold_scores.append(hits / n_test)

        results[k] = round(np.mean(fold_scores), 4)

    return results
            