# Classifying CIFAR10 dataset using KNN

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
import utils.download as download
from utils.data_utils import load_CIFAR10

# Place the images in the notebook
%matplotlib inline

# Notebook-wide plotting defaults: figure size, no pixel smoothing when
# showing images, and a grayscale colour map for single-channel images.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# User-defined functions

In [None]:
class KNearestNeighbor(object):
    """Brute-force k-nearest-neighbour classifier using Euclidean distance.

    The classifier memorises the training set in ``train`` and, at prediction
    time, labels each query row with the most frequent label among its ``k``
    closest training rows.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Store the training data; kNN does no fitting beyond memorising it.

        Args:
            X: 2-D array of shape (num_train, D), one flattened sample per row.
            y: 1-D array of num_train non-negative integer labels
               (``np.bincount`` is applied to them in ``predict``).
        """
        self.X_train = X
        self.y_train = y

    @staticmethod
    def _as_float(data):
        """Return ``data`` as a floating-point array (float inputs keep their dtype)."""
        arr = np.asarray(data)
        if not np.issubdtype(arr.dtype, np.floating):
            # Integer pixel types (e.g. uint8) would wrap around inside
            # np.square, so promote them before doing any arithmetic.
            arr = arr.astype(np.float64)
        return arr

    def compute_distances(self, X):
        """Compute the Euclidean distance from every row of X to every training row.

        Uses the expansion ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b so the whole
        matrix is produced with one matrix product instead of explicit loops.

        Args:
            X: 2-D array of shape (num_test, D).

        Returns:
            Array of shape (num_test, num_train) where entry [i, j] is the
            distance between X[i] and the j-th training row.
        """
        test = self._as_float(X)
        train = self._as_float(self.X_train)

        train_sq = np.sum(np.square(train), axis=1)
        test_sq = np.sum(np.square(test), axis=1)[:, np.newaxis]
        sq_dist = train_sq + test_sq - 2 * np.dot(test, train.T)

        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical points; clamp them so sqrt never yields NaN.
        np.maximum(sq_dist, 0, out=sq_dist)
        return np.sqrt(sq_dist)

    def predict(self, X, k=1):
        """Predict a label for every row of X by majority vote of its k nearest neighbours.

        Args:
            X: 2-D array of shape (num_test, D).
            k: Number of neighbours that vote; must satisfy 1 <= k <= num_train.

        Returns:
            1-D float array of length num_test holding the predicted labels.
            Vote ties are resolved in favour of the smallest label
            (``np.argmax`` returns the first maximum).

        Raises:
            ValueError: If k is smaller than 1 or larger than the training set.
        """
        num_train = self.X_train.shape[0]
        if not 1 <= k <= num_train:
            raise ValueError(
                "k must be between 1 and the number of training samples "
                "(%d), got %r" % (num_train, k)
            )

        dist = self.compute_distances(X)
        num_test = dist.shape[0]
        labels = np.asarray(self.y_train)
        y_pred = np.zeros(num_test)
        for i in range(num_test):
            # Indices of the k training rows closest to test row i.
            nearest = np.argsort(dist[i])[:k]
            # Most common label among those neighbours.
            y_pred[i] = np.argmax(np.bincount(labels[nearest]))

        return y_pred



# Download the dataset if not already downloaded

In [None]:
# Fetch the CIFAR-10 python archive into ./data/ and extract it
# (the helper name suggests it skips the download when the data is already there).
download.maybe_download_and_extract(
    url="http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
    download_dir="./data/",
)

# Load the dataset

In [None]:
# Load the train/test images and labels from the extracted batch files
X_train, y_train, X_test, y_test = load_CIFAR10("./data/cifar-10-batches-py")

# Summarise what was loaded; the image arrays are indexed as (sample, dim1, dim2, dim3)
print("Number of training images: ", len(X_train))
print("Number of testing images : ", len(X_test))
print(
    "Size of each image       : ",
    X_train.shape[1], "x", X_train.shape[2], "x", X_train.shape[3],
)

# Visualizing some data

In [None]:
# Show a grid of random training images: one column per class, one row per sample
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 7

for col, cls_name in enumerate(classes):
    # Indices of all training images whose label equals this class index
    candidates = np.flatnonzero(y_train == col)
    chosen = np.random.choice(candidates, samples_per_class, replace=False)
    for row, img_idx in enumerate(chosen):
        # Subplots are numbered row-major starting at 1
        plt.subplot(samples_per_class, num_classes, row * num_classes + col + 1)
        plt.imshow(X_train[img_idx].astype('uint8'))
        plt.axis('off')
        # Only the top image of each column carries the class name
        if row == 0:
            plt.title(cls_name)

plt.show()

In [None]:
# Flatten every image into a single row vector: (num_images, everything_else)
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
# Baseline run: classify the whole test set with k = 5
classifier = KNearestNeighbor()
classifier.train(X_train, y_train)
y_test_pred = classifier.predict(X_test, k=5)

# Accuracy = fraction of test images whose predicted label matches the true one
num_correct = np.count_nonzero(y_test_pred == y_test)
accuracy = float(num_correct) / len(X_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, len(X_test), accuracy))

# Perform cross validation to improve performance

In [None]:
# k-fold cross-validation over a set of candidate k values.
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# np.array_split returns a plain Python list of sub-arrays; the folds may
# differ in length by one when the data does not divide evenly.
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)

# Dictionary to store accuracy levels for different K values
# (maps k -> list with one accuracy per fold)
dict_accuracy_k = {}

for k in k_choices:
    accuracy = []
    for i in range(num_folds):
        # Hold out fold i for validation and train on all the others.
        # List slicing is used instead of np.delete: np.delete would first
        # stack the folds into one big array, which costs an extra copy of
        # the training set and fails when the folds have unequal lengths.
        X_train_cv = np.concatenate(X_train_folds[:i] + X_train_folds[i + 1:])
        y_train_cv = np.concatenate(y_train_folds[:i] + y_train_folds[i + 1:])
        X_test_cv = X_train_folds[i]
        y_test_cv = y_train_folds[i]

        classifier = KNearestNeighbor()
        classifier.train(X_train_cv, y_train_cv)
        y_test_pred = classifier.predict(X_test_cv, k)
        num_correct = np.sum(y_test_pred == y_test_cv)
        accuracy.append(float(num_correct) / len(X_test_cv))
    dict_accuracy_k[k] = accuracy

In [None]:
# Report and plot the cross-validation accuracies stored in dict_accuracy_k
# (k -> list of per-fold accuracies).

# One sorted list of k values drives the printout, the scatter plot and the
# trend line, so x and y values always stay paired even if the k values were
# not evaluated in ascending order.
sorted_ks = sorted(dict_accuracy_k)

# Print out the computed accuracies
for k in sorted_ks:
    for accuracy in dict_accuracy_k[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))

# plot the raw observations
for k in sorted_ks:
    accuracies = dict_accuracy_k[k]
    plt.scatter([k] * len(accuracies), accuracies)

# plot the trend line with error bars that correspond to standard deviation
accuracies_mean = np.array([np.mean(dict_accuracy_k[k]) for k in sorted_ks])
accuracies_std = np.array([np.std(dict_accuracy_k[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()

In [None]:
# Final evaluation on the test set with the chosen k, timing both phases
best_k = 1

classifier = KNearestNeighbor()

# Time the training call
train_start = time.time()
classifier.train(X_train, y_train)
training_time = time.time() - train_start

# Time the prediction over the whole test set
test_start = time.time()
y_test_pred = classifier.predict(X_test, k=best_k)
testing_time = time.time() - test_start

# Compute and display the accuracy
num_correct = np.count_nonzero(y_test_pred == y_test)
accuracy = float(num_correct) / len(X_test)
print('Got %d / %d correct => accuracy: %f' % (num_correct, len(X_test), accuracy))

print('Time taken to train the data: ', round(training_time, 5), "seconds")
print('Time taken to test the data : ', round(testing_time, 5), "seconds")