# The $k$-nearest neighbors (KNN, $k$-NN) algorithm

Authors:

Joseph Salmon, Alexandre Gramfort, Claire Vernade, Mathurin Massias

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClassifierMixin
from scipy import stats
from sklearn import neighbors
from sklearn import datasets
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA


from tp_knn_source import (rand_gauss, rand_bi_gauss, rand_tri_gauss,
                           rand_checkers, rand_clown, plot_2d, ErrorCurve,
                           frontiere_new, LOOCurve)


import seaborn as sns
from matplotlib import rc

# Global plotting configuration for the whole notebook.
# Start from a clean slate: close every figure that may still be open.
plt.close('all')
# Select the sans-serif font family and list 'Computer Modern Roman' as the
# face to use for it.
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Computer Modern Roman']})
# Matplotlib defaults (label/font/legend sizes, no LaTeX rendering, default
# figure size) pushed into rcParams in one go.
params = {'axes.labelsize': 12,
          'font.size': 16,
          'legend.fontsize': 16,
          'text.usetex': False,
          'figure.figsize': (8, 6)}
plt.rcParams.update(params)

# Seaborn presets: context, color palette and style.
# NOTE(review): these calls run after plt.rcParams.update(params), so they may
# override some of the values set above (e.g. font sizes) -- confirm that this
# ordering is intended.
sns.set_context("poster")
sns.set_palette("colorblind")
sns.set_style("white")
# The value returned by axes_style() is bound to `_` and not used in this cell.
_ = sns.axes_style()

## Data generation

In [None]:
np.random.seed(42)  # fix seed globally

# --- Single Gaussian sample ------------------------------------------------
# mu and sigma were never defined before being passed to rand_gauss
# (NameError), and the sample was thrown away.
# NOTE(review): mu / sigma are assumed to be the per-coordinate mean and
# standard deviation of 2D points -- confirm against tp_knn_source.rand_gauss.
n = 100
mu = [1., 1.]
sigma = [1., 1.]
X0 = rand_gauss(n, mu, sigma)

# --- Dataset 1: two Gaussian classes ---------------------------------------
# NOTE(review): the class sizes are assumed to be the leading positional
# parameters of rand_bi_gauss / rand_tri_gauss / rand_clown (as they are for
# rand_checkers elsewhere in this notebook) -- confirm in tp_knn_source.
n1 = 20
n2 = 20
X1, y1 = rand_bi_gauss(n1, n2)

# --- Dataset 2: three Gaussian classes -------------------------------------
n1 = 50
n2 = 50
n3 = 50
X2, y2 = rand_tri_gauss(n1, n2, n3)

# --- Dataset 3: checkerboard -----------------------------------------------
# The function object itself used to be unpacked (missing call), which raises
# a TypeError. The noise level 0.1 matches the value used for the checkers
# data generated later in this notebook.
n1 = 50
n2 = 50
X3, y3 = rand_checkers(n1, n2, 0.1)

# --- Dataset 4: "clown" ----------------------------------------------------
# Same missing-call defect as above.
n1 = 150
n2 = 150
X4, y4 = rand_clown(n1, n2)

In [None]:
############################################################################
#     Displaying labeled data
############################################################################

plt.show()
plt.close("all")
plt.ion()
plt.figure(1, figsize=(15, 5))

# One panel per synthetic dataset, laid out on a single 1 x 4 row.
labeled_datasets = [('First data set', X1, y1),
                    ('Second data set', X2, y2),
                    ('Third data set', X3, y3),
                    ('Fourth data set', X4, y4)]
for panel, (panel_title, X_panel, y_panel) in enumerate(labeled_datasets,
                                                        start=1):
    plt.subplot(1, 4, panel)
    plt.title(panel_title)
    plot_2d(X_panel, y_panel)



## The $k$-NN algorithm

In [None]:
# Write your own implementation

class KNNClassifier(BaseEstimator, ClassifierMixin):
    """Home made k-nearest-neighbors classifier (brute force, Euclidean).

    Parameters
    ----------
    n_neighbors : int, default=1
        Number of nearest training points that vote for the predicted label.

    Attributes (set by ``fit``)
    ---------------------------
    X_ : ndarray of shape (n_train, n_features)
        Training points, stored as floats.
    y_ : ndarray of shape (n_train,)
        Training labels, as provided.
    classes_ : ndarray
        Sorted distinct labels seen during ``fit``.
    y_encoded_ : ndarray of shape (n_train,)
        Position of each training label inside ``classes_``.
    """
    def __init__(self, n_neighbors=1):
        self.n_neighbors = n_neighbors

    def fit(self, X, y):
        """Store the training set; k-NN does no other work at fit time.

        Returns
        -------
        self
        """
        self.X_ = np.asarray(X, dtype=float)
        self.y_ = np.asarray(y)
        # Encode labels as indices into the sorted class list so that the
        # majority vote works on small integers whatever the label type.
        self.classes_, self.y_encoded_ = np.unique(self.y_,
                                                   return_inverse=True)
        return self

    def predict(self, X):
        """Predict a label for every row of X by majority vote.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        y_pred : ndarray of shape (n_queries,)
            Most frequent label among the ``n_neighbors`` closest training
            points. Ties are resolved in favor of the smallest label.
        """
        X = np.asarray(X, dtype=float)

        # Pairwise squared Euclidean distances, shape (n_queries, n_train).
        # The square root is skipped: it does not change the ordering.
        # This materializes an (n_queries, n_train, n_features) array, which
        # is fine for the small datasets of this notebook.
        deltas = X[:, np.newaxis, :] - self.X_[np.newaxis, :, :]
        sq_dist = np.sum(deltas ** 2, axis=2)

        # A stable sort keeps equidistant training points in index order, so
        # the selection of neighbors is deterministic.
        order = np.argsort(sq_dist, axis=1, kind="stable")
        neighbor_idx = order[:, :self.n_neighbors]

        # Encoded labels of the neighbors, shape (n_queries, n_neighbors).
        Y_neighbors = self.y_encoded_[neighbor_idx]

        # stats.mode returns (mode, count); ravel() absorbs the shape
        # difference between SciPy versions (with / without kept dimension).
        winners = np.ravel(stats.mode(Y_neighbors, axis=1)[0]).astype(int)
        y_pred = self.classes_[winners]
        return y_pred

# Compare the home made implementation with scikit-learn

# Focus on dataset 2: even-indexed samples for training, odd-indexed for test
X_train = X2[::2]
Y_train = y2[::2].astype(int)
X_test = X2[1::2]
Y_test = y2[1::2].astype(int)

# your classifier
n_neighbors = 1
knn = KNNClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)

# scikit-learn reference with the same number of neighbors
sknn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
sknn.fit(X_train, Y_train)
Y_pred_skl = sknn.predict(X_test)

# Both implementations must agree on every test sample
np.testing.assert_array_equal(Y_pred, Y_pred_skl)
print("Home made k-NN and scikit-learn agree on all %d test labels"
      % len(Y_test))

# From now on use the Scikit-Learn implementation

In [None]:
# test now for all datasets

n_neighbors = 5  # the k in k-NN
knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)

# Fit the same estimator on each dataset in turn and draw its decision
# regions together with the samples.
for X, y in [(X1, y1), (X2, y2), (X3, y3), (X4, y4)]:
    knn.fit(X, y)

    plt.figure()
    plot_2d(X, y)
    n_labels = np.unique(y).shape[0]  # number of distinct classes
    frontiere_new(knn, X, y, w=None, step=50, alpha_choice=1,
                  n_labels=n_labels, n_neighbors=n_neighbors)

In [None]:
# Display the predictions when varying the value of k

plt.figure(3, figsize=(12, 8))
# Top row, middle slot of the 3 x 5 grid: the raw training samples
plt.subplot(3, 5, 3)
plot_2d(X_train, Y_train)
plt.xlabel('Samples')
ax = plt.gca()
ax.get_yaxis().set_ticks([])
ax.get_xaxis().set_ticks([])

# k = 1..10 fills grid slots 6..15 (rows 2 and 3), one decision map per k
for n_neighbors in range(1, 11):
    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, Y_train)
    plt.subplot(3, 5, 5 + n_neighbors)
    plt.xlabel('KNN with k=%d' % n_neighbors)

    # Pass the data the model was fitted on; the previous call relied on
    # `X, y` leaking from the loop of an earlier cell (a different dataset).
    frontiere_new(knn, X_train, Y_train, w=None, step=50, alpha_choice=1,
                  colorbar=False, samples=False)
    plt.draw()  # update plot

plt.tight_layout()

In [None]:
# Scores on train and test data
n_neighbors = 1

knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, Y_train)

# With k=1 each training point is its own nearest neighbor, so the training
# accuracy is expected to be perfect (barring duplicated points with
# different labels); the test accuracy is the meaningful figure.
print("Train accuracy (k=%d): %.3f" % (n_neighbors,
                                       knn.score(X_train, Y_train)))
print("Test accuracy (k=%d): %.3f" % (n_neighbors,
                                      knn.score(X_test, Y_test)))

In [None]:
# Scores on left out data

n1 = n2 = 200
sigma = 0.1
# The checkers sample generated here used to be ignored: the split below was
# taken from X4 / y4 (an unrelated, smaller dataset). Use the fresh sample.
data4 = rand_checkers(2 * n1, 2 * n2, sigma)
X_checkers, y_checkers = data4

# Even-indexed samples for training, odd-indexed ones held out for testing
X_train = X_checkers[::2]
Y_train = y_checkers[::2].astype(int)
X_test = X_checkers[1::2]
Y_test = y_checkers[1::2].astype(int)

# Test error as a function of k, for k = 1..50
error_curve = ErrorCurve(k_range=range(1, 51))
# NOTE(review): `fit_curve(X_train, y_train, X_test, y_test)` and `plot()` are
# assumed to be the ErrorCurve API -- confirm against tp_knn_source.
error_curve.fit_curve(X_train, Y_train, X_test, Y_test)
plt.figure()
error_curve.plot()


In [None]:
collist = ['blue', 'grey', 'red', 'purple', 'orange', 'salmon', 'black',
           'fuchsia']

sigma = 0.1
plt.figure(5)
range_n_samples = [100, 500, 1000]

# One error curve per training size, all drawn on figure 5. A dedicated loop
# index is used so the notebook-level `n` is not overwritten.
for size_idx, n_per_class in enumerate(range_n_samples):
    n1 = n2 = n_per_class
    X_train, Y_train = rand_checkers(n1, n2, sigma)
    X_test, Y_test = rand_checkers(n1, n2, sigma)

    error_curve = ErrorCurve(k_range=range(1, 51))
    # NOTE(review): `fit_curve(...)` and `plot(color=..., maketitle=...)` are
    # assumed to be the ErrorCurve API -- confirm against tp_knn_source.
    error_curve.fit_curve(X_train, Y_train, X_test, Y_test)
    error_curve.plot(color=collist[size_idx % len(collist)], maketitle=False)

plt.legend(["training size : %d" % size for size in range_n_samples],
           loc='upper left')

# Decision regions for a large k on the last (largest) training set
plt.close(6)
plt.figure(6)
plot_2d(X_train, Y_train)
n_neighbors = 40
knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, Y_train)

frontiere_new(knn, X_train, Y_train, w=None, step=50, alpha_choice=1)

What are the pros and cons of this classifier?

### Application to the DIGITS dataset

In [None]:
# test k-NN on digits dataset

# The digits dataset
digits = datasets.load_digits()

print(type(digits))
# A Bunch is a subclass of 'dict' (dictionary)
# help(dict)


# inspect digits attributes:
print(digits.keys())
print(digits.target[:50])
print(len(digits.data))
print(digits.data[0])
print(digits['data'][0])
print(digits['images'][0])
print(digits.data[0] == digits['data'][0])

# Show ten sample images on their own figure (instead of drawing on whatever
# figure happened to be current).
plt.figure()
for idx, (img, lbl) in enumerate(zip(digits.images[10:20],
                                     digits.target[10:20])):
    plt.subplot(2, 5, idx + 1)
    plt.axis('off')
    plt.imshow(img, cmap=plt.cm.gray_r, interpolation='None')
    plt.title('Training: %i' % lbl)

n_samples = len(digits.data)

# First half of the samples for training, second half for testing
n_train = n_samples // 2
X_train = digits.data[:n_train]
Y_train = digits.target[:n_train]
X_test = digits.data[n_train:]
Y_test = digits.target[n_train:]

# Class balance of the test set: one unit-width bin centered on each label
plt.figure()
labels = np.unique(digits.target)
plt.hist(Y_test, bins=np.arange(labels.min(), labels.max() + 2) - 0.5,
         density=True)
plt.xlabel('Digit')
plt.ylabel('Frequency in test set')

# Arbitrary starting value for k; it is tuned by cross-validation later on
n_neighbors = 30
knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)
knn.fit(X_train, Y_train)
score = knn.score(X_test, Y_test)
print(score)

In [None]:
# Compute confusion matrix

# `knn` is the classifier fitted on the digits training split; rows of the
# matrix are true labels, columns are predicted labels.
Y_pred = knn.predict(X_test)
conf_mat = metrics.confusion_matrix(Y_test, Y_pred)
print(conf_mat)

# Normalize so that each row sums to 1. Rows without any true sample are left
# at zero instead of producing NaN through a division by zero.
row_totals = conf_mat.sum(axis=1, keepdims=True)
conf_mat_normalized = conf_mat / np.maximum(row_totals, 1)
plt.matshow(conf_mat_normalized)
plt.colorbar()
# plt.imshow(interpolation='nearest') may be an alternative to matshow


In [None]:
# Estimate k with cross-validation

# Have a look at the class 'LOOCurve', defined in the source file.
# LOO stands for Leave One Out

# NOTE(review): the constructor argument `k_range`, the method
# `fit_curve(X, y)`, the attribute `cv_scores` and the method `plot()` are
# assumed from the sibling ErrorCurve usage -- confirm against the LOOCurve
# definition in tp_knn_source.
loo_curve = LOOCurve(k_range=range(1, 50, 5))
loo_curve.fit_curve(digits.data, digits.target)
print(loo_curve.cv_scores)


plt.figure()
loo_curve.plot()


### Weighted $k$-NN classifier

In [None]:
# Implement weights for the kNN classifier


def weights(dist, scale=0.1):
    """Returns an array of weights, exponentially decreasing in the square
    of the distance: ``exp(-dist ** 2 / scale)``.

    Parameters
    ----------
    dist : array-like of distances (any shape).
    scale : positive float, default=0.1
        Divisor of the squared distance; smaller values make the weights
        decay faster. The default reproduces ``exp(-dist ** 2 / 0.1)``.

    Returns
    -------
    weight : array of the same shape as dist
    """
    dist = np.asarray(dist, dtype=float)
    return np.exp(-dist ** 2 / scale)


n_neighbors = 5

# X_train / Y_train hold the 64-dimensional digits data at this point, which
# the 2D plotting helpers below cannot display. Rebuild the 2D training split
# of dataset 2 under dedicated names instead.
X_train_2d = X2[::2]
Y_train_2d = y2[::2].astype(int)

# k-NN whose votes are weighted by the callable `weights` (applied by
# scikit-learn to the array of neighbor distances)
wknn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                      weights=weights)
wknn.fit(X_train_2d, Y_train_2d)
plt.figure(4)
plot_2d(X_train_2d, Y_train_2d)

# Draw the decision regions of the weighted classifier (the unweighted `knn`
# was passed here before)
frontiere_new(wknn, X_train_2d, Y_train_2d, w=None, step=50, alpha_choice=1)


print(wknn.predict(X_train_2d))
