In [87]:
import numpy as np
import pandas as pd

In [301]:
# Load the tab-separated fruit measurements into a DataFrame.
df = pd.read_csv('fruit_data_with_colors.txt', sep='\t')

In [307]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,fruit_name,fruit_subtype,mass,width,height,color_score
0,apple,granny_smith,192,8.4,7.3,0.55
1,apple,granny_smith,180,8.0,6.8,0.59
2,apple,granny_smith,176,7.4,7.2,0.6
3,mandarin,mandarin,86,6.2,4.7,0.8
4,mandarin,mandarin,84,6.0,4.6,0.79


In [308]:
# Features: everything except the two label columns; get_dummies one-hot
# encodes any remaining non-numeric columns.
# NOTE(review): the preview shows an 'Unnamed: 0' header - if that is a real
# column rather than the row index, it remains in X as a feature; confirm.
X = pd.get_dummies(df.drop(['fruit_name', 'fruit_subtype'], axis=1))
# Targets: both label columns are kept, so y has two columns per sample.
y = df.loc[:, ['fruit_name', 'fruit_subtype']]

In [309]:
def normalize(X):
    """Scale each row of ``X`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric data. Input that is not already floating point (ints, bools,
        object columns holding numbers) is converted to ``float``.

    Returns
    -------
    numpy.ndarray
        Array of the same shape in which every non-zero row has L2 norm 1.
        All-zero rows are returned as zeros instead of NaN.
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.inexact):
        # Integer / bool / object input: make the arithmetic below well defined.
        X = X.astype(float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    # A zero row has norm 0; dividing by 1 instead leaves it at zero rather
    # than producing NaN (0 / 0) together with a RuntimeWarning.
    safe_norms = np.where(norms == 0, 1.0, norms)
    return X / safe_norms

In [310]:
# Rescale every sample (row) to unit L2 norm. X is rebound here, so later
# cells work with the normalized ndarray rather than the DataFrame.
X = normalize(X)

In [311]:
def train_test_split(X, y, test_size=0.25, shuffle=True, random_state=None):
    """Split features and labels into train and test partitions.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    y : array-like
        Labels aligned with ``X`` along the first axis.
    test_size : float, default 0.25
        Fraction of the samples placed in the test partition; the number of
        test rows is ``round(test_size * len(X))``.
    shuffle : bool, default True
        Whether to apply one random permutation to ``X`` and ``y`` before
        splitting.
    random_state : int or None, default None
        Seed for a dedicated generator used for the shuffle. With ``None``
        the global NumPy RNG is used, exactly as before this parameter
        existed (so ``np.random.seed`` still controls the result).

    Returns
    -------
    X_train, X_test, y_train, y_test : numpy.ndarray
        The test partition is the first ``test_rows`` rows of the (optionally
        shuffled) data, the train partition is the remainder.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    X, y = np.asarray(X), np.asarray(y)
    n = len(X)
    if len(y) != n:
        # Without this check a longer y would be silently truncated by the
        # permutation indexing below.
        raise ValueError(
            f"X and y must have the same number of samples, got {n} and {len(y)}"
        )
    if shuffle:
        if random_state is None:
            perm = np.random.permutation(n)
        else:
            perm = np.random.default_rng(random_state).permutation(n)
        # The same permutation is applied to both arrays to keep rows aligned.
        X, y = X[perm], y[perm]
    test_rows = round(test_size * n)
    return X[test_rows:], X[:test_rows], y[test_rows:], y[:test_rows]

In [312]:
# Seed NumPy's global RNG so the shuffled split (and every result derived
# from it) is identical on each Restart & Run All.
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [322]:
class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance.

    ``fit`` stores the training data; ``predict`` finds, for each query row,
    the ``k`` closest training rows and returns the label that wins the vote
    among them.

    Parameters
    ----------
    k : int, default 5
        Number of neighbours consulted per query sample.
    """

    def __init__(self, k=5):
        self._k = k

    def __euclidean_distance(self, x):
        """Return the Euclidean distance from ``x`` to every training row."""
        return np.sqrt(((x - self._X) ** 2).sum(1))

    @staticmethod
    def _majority_label(labels):
        """Pick the winning label among one query's neighbour labels.

        Each candidate is scored by how many entries of ``labels`` compare
        equal to it. Ties go to the first best candidate in neighbour order,
        which is arbitrary because ``argpartition`` does not sort.
        """
        return max(labels, key=lambda candidate: (labels == candidate).sum())

    def fit(self, X, y):
        """Store the training samples ``X`` and their labels ``y``.

        Both inputs are converted to ndarrays so that lists and pandas
        objects work with the array indexing used in ``predict``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` contain a different number of samples.
        """
        X, y = np.asarray(X), np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(X)} and {len(y)}"
            )
        self._X = X
        self._y = y

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray
            One predicted label per query row (one row of labels per query
            when the training labels have several columns).

        Raises
        ------
        ValueError
            If ``k`` is not between 1 and the number of training samples.
        """
        X = np.asarray(X)
        k = self._k
        n_train = len(self._X)
        if not 1 <= k <= n_train:
            raise ValueError(
                f"k must be between 1 and the number of training samples "
                f"({n_train}), got {k}"
            )
        if len(X) == 0:
            # Nothing to classify: return an empty result with the label layout.
            return np.empty((0,) + self._y.shape[1:], dtype=self._y.dtype)
        dists = np.array([self.__euclidean_distance(row) for row in X])
        # kth = k - 1 places the k smallest distances in the first k slots and
        # stays in bounds when k equals the number of training samples.
        knn_idxs = np.argpartition(dists, k - 1)[:, :k]
        knn = self._y[knn_idxs]
        return np.array([self._majority_label(labels) for labels in knn])

    def score(self, X, y):
        """Return the fraction of entries of ``y`` that are predicted correctly.

        For one-dimensional labels this is plain accuracy; for multi-column
        labels every column entry is counted separately.
        """
        y = np.asarray(y)
        preds = self.predict(X)
        correct = (preds == y).sum()
        return correct / y.size

In [323]:
# Use the classifier's default neighbourhood size (k=5).
clf = KNNClassifier()

In [324]:
# "Training" only stores the training split on the classifier; all distance
# computation happens at prediction time.
clf.fit(X_train, y_train)

In [325]:
# Fraction of held-out label entries predicted correctly. y has two columns
# (fruit name and subtype), and each entry is counted separately.
clf.score(X_test, y_test)

0.6052631578947368