In [None]:
import numpy as np
import gzip
import pickle
from scipy.stats import rankdata
import collections

In [None]:
# Load the data.
# NOTE(security): unpickling can execute arbitrary code; only open archives you trust.
# The context manager guarantees the gzip handle is closed once the data is loaded.
with gzip.open('/Users/corro/notebook/proba_alternance/new/mnist.pkl.gz', 'rb') as f:
    # encoding='latin1' is what lets Python 3 decode NumPy arrays from a pickle
    # written by Python 2 (presumably how this archive was produced).
    train_set, valid_set, test_set = pickle.load(f, encoding='latin1')

# Downsample the images to speed everything up a little
def split_and_resize(data, dim=28):
    """Unpack an ``(X, Y)`` pair and halve the resolution of every image.

    Parameters
    ----------
    data : tuple
        ``(X, Y)`` where each row of ``X`` is a flattened ``dim x dim`` image.
    dim : int, optional
        Side length of the square input images (default 28).

    Returns
    -------
    tuple
        ``(X_small, Y)``: ``X_small`` has one row per image holding every
        second pixel along both axes, flattened again; ``Y`` is returned
        untouched.
    """
    X, Y = data
    # Keep every second row and every second column of each image.
    small = np.asarray(X).reshape(-1, dim, dim)[:, ::2, ::2]
    # Flatten from the actual subsampled shape: `::2` keeps ceil(dim / 2)
    # pixels per axis, so a hard-coded dim * dim / 4 is wrong for odd `dim`.
    return small.reshape(small.shape[0], -1), Y

# Downsample the training and test splits (the validation split is not used here)
X_train, Y_train = split_and_resize(train_set)
X_test, Y_test = split_and_resize(test_set)

print(X_train.shape, Y_train.shape)

# Keep only the first 500 test images: this is just for prototyping,
# normally the whole test set should be kept.
X_test, Y_test = X_test[:500], Y_test[:500]
print(X_test.shape, Y_test.shape)

In [None]:
# Pull out the first test sample and its label to look at their shapes
x, y = X_test[0], Y_test[0]

print(x.shape, y.shape, y)

# Classificateur plus proche voisin simple

In [None]:
class NNClassifier:
    """1-nearest-neighbour classifier based on the squared Euclidean distance."""

    def __init__(self, X_train, Y_train):
        """Keep references to the training samples and labels (no copy is made)."""
        self.X_train = X_train
        self.Y_train = Y_train

    def predict(self, x):
        """Return the label of the training sample closest to ``x``.

        ``x`` is a single sample; it is broadcast against every row of the
        training matrix. On a distance tie the first training sample wins.
        """
        delta = self.X_train - x
        sq_dist = (delta ** 2).sum(axis=1)
        return self.Y_train[sq_dist.argmin()]

    def accuracy(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals ``Y``."""
        n_samples = X.shape[0]
        # Samples are classified one at a time; each comparison adds 0 or 1.
        hits = sum(Y[idx] == self.predict(X[idx]) for idx in range(n_samples))
        return hits / n_samples

In [None]:
# Build the nearest-neighbour classifier on the whole training set, then score it
# on the first 500 training images. Each of these queries is itself a training
# sample (at distance 0), so this is a sanity check rather than a real evaluation.
c = NNClassifier(X_train, Y_train)
c.accuracy(X_train[:500], Y_train[:500])

In [None]:
# Accuracy on held-out test images (at most the first 500 of X_test / Y_test).
c.accuracy(X_test[:500], Y_test[:500])

# Classificateur k-plus proches voisins

In [None]:
class KNNClassifier:
    """k-nearest-neighbours classifier (squared Euclidean distance, majority vote)."""

    def __init__(self, k, X_train, Y_train):
        """Store the number of voting neighbours and the training data.

        Parameters
        ----------
        k : int
            Number of neighbours taking part in the vote (must be >= 1).
        X_train, Y_train : array-like
            Training samples (one per row) and their labels; kept by
            reference, no copy is made.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 (there would be nobody to vote).
        """
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.X_train = X_train
        self.Y_train = Y_train

    def predict(self, x):
        """Return the majority label among the ``k`` training samples closest to ``x``.

        If ``k`` exceeds the number of training samples, all of them vote.
        Vote ties go to the label of the closest neighbour among the tied labels.
        """
        dist = np.sum((self.X_train - x) ** 2, axis=1)
        # A stable argsort selects exactly k indices even when distances are tied
        # (averaged ranks could select more or fewer) and avoids `np.long`, which
        # does not exist in NumPy 1.24-1.26.
        nearest = np.argsort(dist, kind='stable')[:self.k]

        # Neighbours are counted nearest-first, and Counter orders equal counts by
        # first insertion, so ties are resolved in favour of the closest label.
        votes = collections.Counter(self.Y_train[nearest])
        # most_common() yields (label, count) pairs: return the label only.
        return votes.most_common(1)[0][0]

    def accuracy(self, X, Y):
        """Return the fraction of rows of ``X`` whose predicted label equals ``Y``."""
        n_correct = 0
        for i in range(X.shape[0]):
            n_correct += Y[i] == self.predict(X[i])
        return n_correct / X.shape[0]
    


In [None]:
# k is the number of neighbours handed to KNNClassifier.
k = 10
# Build the classifier on the whole training set and score it on the first
# 500 training images (these queries are themselves part of the training data).
c = KNNClassifier(k, X_train, Y_train)
c.accuracy(X_train[:500], Y_train[:500])

In [None]:
# Re-create the classifier (same k and training data as in the previous cell)
# and score it on held-out test images.
c = KNNClassifier(k, X_train, Y_train)
c.accuracy(X_test[:500], Y_test[:500])

# Classificateur qui utilise un représentant moyen

- version standard
- version fonctionnant avec des "batchs" de données

In [None]:
class MeanClassifier:
    """Nearest-prototype classifier: each class is represented by the mean of its samples."""

    def __init__(self, l, X_train, Y_train):
        """Compute one mean vector per class label ``0 .. l-1``.

        Parameters
        ----------
        l : int
            Number of classes; labels are expected to be the integers ``0 .. l-1``.
        X_train, Y_train : ndarray
            Training samples (one per row) and their labels.

        After construction ``self.theta`` has shape ``(l, n_features)``.
        """
        self.theta = np.empty((l, X_train.shape[1]))
        for i in range(l):
            members = X_train[Y_train == i]
            if members.shape[0] == 0:
                # A class without training samples would get a NaN mean, and
                # argmin returns the first NaN index, so *every* prediction would
                # be this empty class. An infinitely distant prototype is never
                # selected while another class has a finite one.
                self.theta[i] = np.inf
            else:
                self.theta[i] = np.mean(members, axis=0)

    def predict(self, X):
        """Return, for every row of the 2-D batch ``X``, the index of the closest prototype."""
        # Broadcast (n, 1, d) against (1, l, d) to get all squared distances at once.
        score = np.sum(
            (X[:, None, :] - self.theta[None, :, :]) ** 2,
            axis=2
        )
        return np.argmin(score, axis=1)

    def accuracy(self, X, Y):
        """Return the fraction of rows of ``X`` whose predicted class equals ``Y``."""
        pred = self.predict(X)
        return np.sum(pred == Y) / X.shape[0]

In [None]:
# Build one mean prototype per class label 0..9 from the whole training set,
# then score the classifier on the first 500 training images.
c = MeanClassifier(10, X_train, Y_train)
c.accuracy(X_train[:500], Y_train[:500])

In [None]:
# Accuracy on held-out test images (at most the first 500 of X_test / Y_test).
c.accuracy(X_test[:500], Y_test[:500])

In [None]:
# NOTE(review): this cell originally contained only `.shape`, which is a SyntaxError
# and stops "Restart & Run All". Presumably the intent was to inspect the class
# prototypes held by the MeanClassifier instance `c` — confirm.
c.theta.shape