## K-nearest neighbors classifier

In [1]:
import numpy as np
import os

# Load the data
with np.load('cifar4-train.npz', allow_pickle=False) as npz_file:
    # Load items into a dictionary
    cifar = dict(npz_file.items())

print(cifar.keys())

pixels = cifar['pixels']
overfeat = cifar['overfeat']
labels = cifar['labels']
names = cifar['names']

print('pixels shape :',pixels.shape, ', dtype:', pixels.dtype)
print('overfeat shape :',overfeat.shape, ', dtype:', overfeat.dtype)
print('labels shape :',labels.shape, ', dtype:', labels.dtype)
print('Categories:', names)

dict_keys(['pixels', 'overfeat', 'labels', 'names', 'allow_pickle'])
pixels shape : (5000, 3072) , dtype: uint8
overfeat shape : (5000, 4096) , dtype: float32
labels shape : (5000,) , dtype: int64
Categories: ['truck' 'car' 'airplane' 'ship']


In [2]:
# split the Overfeat data into train/test sets w/ same proportion of classes in each subset

import pandas as pd
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    overfeat, labels, train_size=4000, test_size=1000, 
    stratify=labels , # same class distribution
    random_state=0)

print('Train:', X_tr.shape, y_tr.shape)
print('Test:', X_te.shape, y_te.shape)
print('class distribution in train set:','\n', pd.value_counts(y_tr, normalize=True))
print('class distribution in test set:','\n', pd.value_counts(y_te, normalize=True))

Train: (4000, 4096) (4000,)
Test: (1000, 4096) (1000,)
class distribution in train set: 
 3    0.25
1    0.25
2    0.25
0    0.25
dtype: float64
class distribution in test set: 
 3    0.25
2    0.25
1    0.25
0    0.25
dtype: float64


In [3]:
# Then, split the train set (4,000 points) into a (smaller) train and validation sets
# with respectively 3,200 and 800 samples

X_tr2, X_val, y_tr2, y_val = train_test_split(
    X_tr, y_tr, train_size=3200, test_size=800, 
    stratify=y_tr , # same class distribution
    random_state=0)

print('Train2:', X_tr2.shape, y_tr2.shape)
print('Val:', X_val.shape, y_val.shape)
print('Class distribution in train set:','\n',pd.value_counts(y_tr2, normalize=True))
print('Class distribution in test set:','\n', pd.value_counts(y_val, normalize=True))

Train2: (3200, 4096) (3200,)
Val: (800, 4096) (800,)
Class distribution in train set: 
 3    0.25
1    0.25
2    0.25
0    0.25
dtype: float64
Class distribution in test set: 
 3    0.25
2    0.25
1    0.25
0    0.25
dtype: float64


#### Create a k-NN classifier with PCA. Tune k and the distance metric

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Create k-NN classifier
pipe = Pipeline([
    #('scaler', StandardScaler()), # With standardization
    ('scaler', None),
    ('pca', PCA(n_components=172)), # PCA preprocessing, number of components to retain 90% of the variance explained.
    ('knn', KNeighborsClassifier(
        algorithm='brute', # Brute-force search
        n_jobs=-1 # As many parallel jobs as possible
    ))
])

In [7]:
from sklearn.model_selection import ParameterGrid

# grid of parameters 
grid = ParameterGrid({
    'scaler': [None, StandardScaler()],
    'pca' : [None, PCA(n_components=172)],
    'knn__n_neighbors': range(1,100,15), # k
    'knn__p': [1, 2], # L1 and L2 distance metrics
})

# Print the number of combinations
print('Number of combinations:', len(grid))

Number of combinations: 56


In [8]:
# Save accuracy on test set
test_scores = []

# Enumerate combinations starting from 1
for i, params_dict in enumerate(grid, 1):
    # Print progress
    print('Combination {}/{}'.format(
        i, len(grid) # Total number of combinations
    ))
    
    # Set parameters
    pipe.set_params(**params_dict)

    # Fit a k-NN classifier on smaller train set
    pipe.fit(X_tr2, y_tr2)

    # Save accuracy on validation set
    params_dict['accuracy'] = pipe.score(X_val, y_val)

    # Save result
    test_scores.append(params_dict)

print('done')

Combination 1/56
Combination 2/56
Combination 3/56
Combination 4/56
Combination 5/56
Combination 6/56
Combination 7/56
Combination 8/56
Combination 9/56
Combination 10/56
Combination 11/56
Combination 12/56
Combination 13/56
Combination 14/56
Combination 15/56
Combination 16/56
Combination 17/56
Combination 18/56
Combination 19/56
Combination 20/56
Combination 21/56
Combination 22/56
Combination 23/56
Combination 24/56
Combination 25/56
Combination 26/56
Combination 27/56
Combination 28/56
Combination 29/56
Combination 30/56
Combination 31/56
Combination 32/56
Combination 33/56
Combination 34/56
Combination 35/56
Combination 36/56
Combination 37/56
Combination 38/56
Combination 39/56
Combination 40/56
Combination 41/56
Combination 42/56
Combination 43/56
Combination 44/56
Combination 45/56
Combination 46/56
Combination 47/56
Combination 48/56
Combination 49/56
Combination 50/56
Combination 51/56
Combination 52/56
Combination 53/56
Combination 54/56
Combination 55/56
Combination 56/56
d

In [11]:
# Create DataFrame with test scores
scores_df = pd.DataFrame(test_scores)

# Print scores
scores_df.sort_values(by='accuracy', ascending=False).head(15)

Unnamed: 0,accuracy,knn__n_neighbors,knn__p,pca,scaler
28,0.78125,46,2,,
25,0.78,46,1,,"StandardScaler(copy=True, with_mean=True, with..."
33,0.77625,61,1,,"StandardScaler(copy=True, with_mean=True, with..."
24,0.775,46,1,,
29,0.77375,46,2,,"StandardScaler(copy=True, with_mean=True, with..."
37,0.77375,61,2,,"StandardScaler(copy=True, with_mean=True, with..."
47,0.7725,76,2,"PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
46,0.77125,76,2,"PCA(copy=True, iterated_power='auto', n_compon...",
23,0.77125,31,2,"PCA(copy=True, iterated_power='auto', n_compon...","StandardScaler(copy=True, with_mean=True, with..."
22,0.77125,31,2,"PCA(copy=True, iterated_power='auto', n_compon...",


In [10]:
best = scores_df.sort_values(by='accuracy', ascending=False)[0:1]
print('Top accuracy on validation set:', best.iloc[0,0], ' with k:', best.iloc[0,1],
    'and distance metric:', best.iloc[0,2])

Top accuracy on validation set: 0.78125  with k: 46 and distance metric: 2


In [12]:
# Create k-NN classifier with tuned parameters
knn_best = KNeighborsClassifier(p= best.iloc[0,2], 
                                n_neighbors= best.iloc[0,1], 
                                algorithm='brute', 
                                n_jobs=-1)

pipe = Pipeline([('scaler', None),
                 ('pca', None),
                  #('pca', PCA(n_components=172)),
                 ('knn', knn_best)])

# Fit it to the entire train data
pipe.fit(X_tr, y_tr)

# evaluate its accuracy on the test set
accuracy_best = pipe.score(X_te, y_te)
print ('K-NN accuracy on the test set: {:.3f}'.format(accuracy_best))

K-NN accuracy on the test set: 0.771


In [13]:
pipe.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
     steps=[('scaler', None), ('pca', None), ('knn', KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=46, p=2,
           weights='uniform'))])>

### Choose an image from the test set and plot its 10 nearest neighbors.

pick an image from the test set and plot it with its 10 nearest neighbors from the train one

In [None]:
# Pick a random image from the test set
import random
img = X_te[random.randint(1,len(X_te))].reshape(1, -1)
img