In [None]:
!pip install wget



In [None]:
import numpy as np
import collections


class kNNClassifier(object):
    """Brute-force k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction. Must be >= 1.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k):
        # k < 1 would select no neighbours and only fail later, inside
        # predict(), with an opaque IndexError.
        if k < 1:
            raise ValueError("k must be a positive integer, got %r" % (k,))
        self.k = k

    def fit(self, dataset, labels):
        """Memorise the training samples and their labels.

        Parameters
        ----------
        dataset : array-like
            Training samples, one sample per entry along the first axis.
        labels : sequence
            Label of each training sample; must support integer indexing.

        Returns
        -------
        kNNClassifier
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``dataset`` contains no samples.
        """
        # Store as float: subtracting unsigned-integer arrays (e.g. uint8
        # pixels) wraps around and silently corrupts the distances.
        self.dataset = np.asarray(dataset, dtype=float)
        if self.dataset.shape[0] == 0:
            raise ValueError("cannot fit a kNN classifier on an empty dataset")
        self.labels = labels
        return self

    def predict(self, testset):
        """Predict a label for every sample in ``testset``.

        Each sample receives the most frequent label among its ``k`` nearest
        training samples. When several labels are equally frequent, the one
        whose first supporter is closest to the query wins.

        Parameters
        ----------
        testset : array-like
            Samples to classify, shaped like the training samples.

        Returns
        -------
        list
            One predicted label per test sample, in input order.
        """
        # Flatten every sample so images and plain vectors are handled alike;
        # the Euclidean norm of the flattened difference is unchanged.
        train = self.dataset.reshape(self.dataset.shape[0], -1)
        result = []
        for query in np.asarray(testset, dtype=float):
            # One vectorised pass over the whole training set per query.
            dist = np.linalg.norm(train - query.ravel(), axis=1)
            nearest = np.argsort(dist)[:self.k]
            votes = collections.Counter(self.labels[i] for i in nearest)
            result.append(votes.most_common(1)[0][0])
        return result

    def score(self, X_test, y_test):
        """Return the fraction of ``X_test`` samples whose prediction equals ``y_test``.

        Parameters
        ----------
        X_test : array-like
            Samples to classify.
        y_test : sequence
            True label of each sample in ``X_test``.

        Returns
        -------
        float
            Accuracy in the range [0, 1].
        """
        y_predict = self.predict(X_test)
        n_total = len(y_test)
        n_correct = sum(y_predict[i] == y_test[i] for i in range(n_total))
        return n_correct / n_total

In [None]:
import util


# Load the 'small' MNIST variant through the local `util` helper.
# X1/y1 are used below for fitting and X2/y2 for evaluation.
# NOTE(review): presumably (train features, train labels, test features,
# test labels) - confirm against util.mnist_init.
X1, y1, X2, y2 = util.mnist_init('small')

# Hand-written classifier defined above, voting over the 3 nearest neighbours.
clf = kNNClassifier(3)
clf.fit(X1, y1)
# Accuracy on the evaluation split.
print(clf.score(X2, y2))

# score() already calls predict() internally, so the evaluation set is
# classified a second time here to obtain the labels for the extra metrics.
y_pred = clf.predict(X2)
# NOTE(review): averaging mode of these metrics is defined in util - confirm.
print(util.f1_score(y2, y_pred))
print(util.recall_score(y2, y_pred))

0.8510182207931404
0.8462682834020671
0.8449254610915184


In [None]:
from sklearn.neighbors import KNeighborsClassifier


# Baseline run with scikit-learn's k-NN on the 'complete' MNIST variant.
# Relies on `util` having been imported by an earlier cell.
# NOTE(review): presumably (train features, train labels, test features,
# test labels) - confirm against util.mnist_init.
X1, y1, X2, y2 = util.mnist_init('complete')

# Same neighbour count (3) as the hand-written classifier used earlier.
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X1, y1)
# For scikit-learn classifiers, score() reports mean accuracy.
print(clf.score(X2, y2))

# Predicted labels are needed separately for the util metrics below.
y_pred = clf.predict(X2)
# NOTE(review): averaging mode of these metrics is defined in util - confirm.
print(util.f1_score(y2, y_pred))
print(util.recall_score(y2, y_pred))

0.9705
0.9703749561556656
0.9701144344783679


In [None]:
class KMeans:
    """K-means clustering (Lloyd's algorithm) seeded with random training samples.

    Parameters
    ----------
    k : int
        Number of clusters.
    max_iter : int, optional
        Upper bound on the number of assign/update rounds (default 300).
    random_state : int or None, optional
        Seed for the initial centroid draw. ``None`` (default) draws from
        NumPy's global random state, so ``np.random.seed`` still applies.

    Attributes
    ----------
    centroids : ndarray of shape (k, n_features)
        Current cluster centres.
    initial_centroids : ndarray of shape (k, n_features)
        The samples drawn as starting centres. Also reachable under the
        historical misspelt name ``intial_centroids``.
    labels : ndarray of shape (n_samples,)
        Cluster index of every training sample after the last round.
    prev_label : ndarray or None
        Assignment from the round before the last one.
    """

    def __init__(self, k, max_iter=300, random_state=None):
        self.k = k
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, dataset):
        """Cluster ``dataset`` and store the resulting centroids and labels.

        Parameters
        ----------
        dataset : array-like of shape (n_samples, n_features)
            Training samples.

        Returns
        -------
        KMeans
            ``self``, so calls can be chained.
        """
        # Work in float so integer inputs (e.g. uint8 pixels) cannot wrap
        # around when centroids are subtracted.
        data = np.asarray(dataset, dtype=float)
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        picks = rng.choice(data.shape[0], self.k, replace=False)
        self.centroids = data[picks]
        self.initial_centroids = self.centroids.copy()
        # Backward-compatible alias for the original (misspelt) attribute.
        self.intial_centroids = self.initial_centroids

        # Start with no assignment at all. Seeding both with equal placeholder
        # lists makes the very first convergence check succeed, so the
        # algorithm would never run a single round.
        self.prev_label, self.labels = None, None
        for _ in range(self.max_iter):
            self.prev_label = self.labels
            self.labels = self.predict(data)
            if self.prev_label is not None and np.array_equal(self.labels, self.prev_label):
                break  # assignments are stable: converged
            self.update_centroid(data)
        return self

    def predict(self, dataset):
        """Return the index of the nearest centroid for every sample.

        Parameters
        ----------
        dataset : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Integer cluster indices in ``range(k)``.
        """
        data = np.asarray(dataset, dtype=float)
        centroids = np.asarray(self.centroids, dtype=float)
        # One vectorised pass per centroid instead of one Python call per sample.
        dist = np.stack([np.linalg.norm(data - c, axis=1) for c in centroids], axis=1)
        return np.argmin(dist, axis=1)

    def compute_label(self, x):
        """Return the index of the centroid closest (Euclidean) to the single sample ``x``."""
        diff = np.asarray(self.centroids, dtype=float) - np.asarray(x, dtype=float)
        return np.argmin(np.sqrt(np.sum(diff ** 2, axis=1)))

    def update_centroid(self, dataset):
        """Move every centroid to the mean of the samples currently assigned to it.

        A cluster with no assigned samples keeps its previous centroid;
        averaging an empty selection would turn the centroid into NaN.
        """
        data = np.asarray(dataset, dtype=float)
        labels = np.asarray(self.labels)
        updated = np.array(self.centroids, dtype=float)  # fresh copy
        for c in range(self.k):
            members = data[labels == c]
            if len(members):
                updated[c] = members.mean(axis=0)
        self.centroids = updated

    def score(self, test_set, test_labels):
        """Measure how well the clusters line up with the true labels.

        Every cluster is mapped to the most frequent true label among the
        samples assigned to it (the smallest label wins ties).

        Parameters
        ----------
        test_set : array-like of shape (n_samples, n_features)
        test_labels : array-like of shape (n_samples,)

        Returns
        -------
        tuple
            ``(per_cluster, overall)``: ``per_cluster`` is a list of ``k``
            floats, the fraction of each cluster's samples carrying that
            cluster's majority label (NaN for a cluster that received no
            samples); ``overall`` is the fraction of all samples that carry
            the majority label of their cluster.

        Raises
        ------
        ValueError
            If ``test_set`` contains no samples.
        """
        pred_labels = self.predict(test_set)
        if pred_labels.size == 0:
            raise ValueError("cannot score an empty test set")
        truth = np.asarray(test_labels)

        per_cluster = []
        matched = 0
        for c in range(self.k):
            members = truth[pred_labels == c]
            if members.size == 0:
                # Nothing landed in this cluster, so purity is undefined.
                per_cluster.append(float('nan'))
                continue
            _, counts = np.unique(members, return_counts=True)
            hits = int(counts.max())
            matched += hits
            per_cluster.append(hits / members.size)
        return (per_cluster, matched / pred_labels.size)
        


        


In [None]:
# Load the 'small' MNIST variant with transformed=False.
# NOTE(review): the effect of `transformed` is defined in util.mnist_init -
# presumably it skips a preprocessing step; confirm.
X1, y1, X2, y2 = util.mnist_init('small', transformed=False)
# Cluster the first split into 2 groups; labels are not used for fitting.
clf = KMeans(2)
clf.fit(X1)

# score() returns (per-cluster share of the cluster's majority label,
# overall share), reported for the fitted split and then for the other split.
print(clf.score(X1, y1))
print(clf.score(X2, y2))

([0.6032045240339302, 0.5103319949074527], 0.5190738112136267)
([0.5843373493975904, 0.5158823529411765], 0.5219721329046088)


In [None]:
# Load the 'complete' MNIST variant through the local `util` helper.
# NOTE(review): presumably (train features, train labels, test features,
# test labels) - confirm against util.mnist_init.
X1, y1, X2, y2 = util.mnist_init('complete')
# Cluster the first split into 10 groups; labels are not used for fitting.
# The initial centroids are drawn at random, so results vary between runs
# unless NumPy's random state is seeded beforehand.
clf = KMeans(10)
clf.fit(X1)

# score() returns (per-cluster share of the cluster's majority label,
# overall share), reported for the fitted split and then for the other split.
print(clf.score(X1, y1))
print(clf.score(X2, y2))

([0.7058823529411765, 0.4087418783225044, 0.5, 0.4725274725274725, 0.398406374501992, 0.1409514220991694, 0.36394214598049107, 0.38768369489153254, 0.3560804899387577, 0.4525652049297793], 0.19401666666666667)
([1.0, 0.4412811387900356, 1.0, 0.55, 0.30303030303030304, 0.14258793969849246, 0.42299794661190965, 0.3625, 0.41935483870967744, 0.44280442804428044], 0.1998)
