# k 近傍法による分類
k-Nearest Neighbors (k-NN)

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import csv

%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

In [2]:
# Directory holding the raw MNIST IDX files; the trailing slash is required
# because the file names below are appended to it directly.
DATADIR = '../../data/MNIST/raw/'

# Image and label files for the training set and the test set.
trainimg_path = f'{DATADIR}train-images-idx3-ubyte'
trainlabel_path = f'{DATADIR}train-labels-idx1-ubyte'
testimg_path = f'{DATADIR}t10k-images-idx3-ubyte'
testlabel_path = f'{DATADIR}t10k-labels-idx1-ubyte'

In [3]:
def load_img(path):
    """Read an image file and return one flattened, scaled image per row.

    The file is read as raw bytes; the first 16 bytes are skipped
    (presumably the IDX3 header -- confirm against the file format) and the
    remaining bytes are interpreted as unsigned 8-bit pixel values.

    Args:
        path: Location of the binary image file.

    Returns:
        A 2-D float array with 28 * 28 = 784 columns whose values are the
        original bytes divided by 255.0.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the payload length is not a multiple of 784.
    """
    with open(path, 'rb') as fh:
        raw = fh.read()
    pixels = np.frombuffer(raw, dtype=np.uint8, offset=16)
    n_features = 28 * 28
    # Division by a float promotes the uint8 data to a new float array.
    return pixels.reshape(-1, n_features) / 255.0

def load_label(path):
    """Read a label file and return its labels as unsigned 8-bit integers.

    The first 8 bytes of the file are skipped (presumably the IDX1 header --
    confirm against the file format); every remaining byte is one label.

    Args:
        path: Location of the binary label file.

    Returns:
        A 1-D ``np.uint8`` array with one entry per byte after the header.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'rb') as fh:
        raw = fh.read()
    return np.frombuffer(raw, dtype=np.uint8, offset=8)

In [4]:
# Load the training set, then the test set (images first, labels second;
# tuple elements are evaluated left to right, so the read order is fixed).
trainimgs, trainlabels = load_img(trainimg_path), load_label(trainlabel_path)
testimgs, testlabels = load_img(testimg_path), load_label(testlabel_path)

# CSV file to which the experiments below append their results.
SAVEPATH = '../../data/parameters/else/kNN.csv'

FileNotFoundError: [Errno 2] No such file or directory: '../../data/MNIST/raw/train-images-idx3-ubyte'

### l^2 ノルム

In [None]:
%%time
from sklearn.neighbors import KNeighborsClassifier

# Single run with k = 10 neighbours, using all CPU cores (n_jobs=-1).
p = 10
# fit() returns the estimator itself, so construction and fitting can be chained.
neigh = KNeighborsClassifier(n_jobs=-1, n_neighbors=p).fit(trainimgs, trainlabels)
# score() evaluates the fitted classifier on the test images and labels.
prob = neigh.score(testimgs, testlabels)
print(f'param {p} is completed: score {prob}')


In [None]:
%%time
from sklearn.neighbors import KNeighborsClassifier

# Sweep over the number of neighbours k with the classifier's default metric
# and record the test score for each value.
params = [1, 2, 3, 4, 5, 10, 20, 50, 100, 1000]
score = []
for p in params:
    neigh = KNeighborsClassifier(n_neighbors=p, n_jobs=-1)
    neigh.fit(trainimgs, trainlabels)
    prob = neigh.score(testimgs, testlabels)
    score.append(prob)
    print(f'param {p} is completed: score {prob}')
    # Append the row immediately so finished results survive even if a later
    # (slow) iteration is interrupted.  newline='' is required by the csv
    # module; without it, platforms that translate '\n' write '\r\r\n' and
    # the file shows a blank line after every row.
    with open(SAVEPATH, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['l2', p, prob])

# Two-row table: first row holds the k values, second row the scores.
display(pd.DataFrame([params, score]))
plt.plot(params, score)
plt.show()

### Mahalanobis 距離

In [None]:
%%time

# Covariance matrix of the training pixels (features are columns, hence the
# transpose).  The small multiple of the identity is added so that the matrix
# is guaranteed to be invertible.
V = np.cov(trainimgs.T) + 1e-6 * np.identity(28**2)
# Inverse covariance, passed to the Mahalanobis metric as 'VI'.
VI = np.linalg.inv(V)


from sklearn.neighbors import KNeighborsClassifier

# Sweep over the number of neighbours k using the Mahalanobis distance and
# record the test score for each value.
params = [1, 2, 3, 4, 5, 10, 20, 50, 100, 1000]
score = []
for p in params:
    neigh = KNeighborsClassifier(n_neighbors=p, metric="mahalanobis", metric_params={'VI': VI}, n_jobs=-1)
    neigh.fit(trainimgs, trainlabels)
    prob = neigh.score(testimgs, testlabels)
    score.append(prob)
    print(f'param {p} is completed: score {prob}')
    # Append the row immediately so finished results survive even if a later
    # (slow) iteration is interrupted.  newline='' is required by the csv
    # module; without it, platforms that translate '\n' write '\r\r\n' and
    # the file shows a blank line after every row.
    with open(SAVEPATH, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['maha', p, prob])

# Two-row table: first row holds the k values, second row the scores.
display(pd.DataFrame([params, score]))
plt.plot(params, score)
plt.show()