In [188]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [198]:
np.argmin?

In [202]:
'''
Question 2.1 Skeleton Code

Here you should implement and evaluate the k-NN classifier.
'''


import data
import numpy as np
# Import pyplot - plt.imshow is useful!
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import KFold
from __future__ import division

np.random.seed(1337)

class KNearestNeighbor(object):
    '''
    K Nearest Neighbor classifier
    '''

    def __init__(self, train_data, train_labels):
        self.train_data = train_data
        self.train_norm = (self.train_data**2).sum(axis=1).reshape(-1,1)
        self.train_labels = train_labels

    def l2_distance(self, test_point):
        '''
        Compute L2 distance between test point and each training point
        
        Input: test_point is a 1d numpy array
        Output: dist is a numpy array containing the distances between the test 
        point and each training point
        '''
        # Process test point shape
        test_point = np.squeeze(test_point)
        if test_point.ndim == 1:
            test_point = test_point.reshape(1, -1)
        assert test_point.shape[1] == self.train_data.shape[1]

        # Compute squared distance
        train_norm = (self.train_data**2).sum(axis=1).reshape(-1,1)
        test_norm = (test_point**2).sum(axis=1).reshape(1,-1)
        dist = self.train_norm + test_norm - 2*self.train_data.dot(test_point.transpose())
        return np.squeeze(dist)

    def query_knn(self, test_point, k):
        '''
        Query a single test point using the k-NN algorithm

        You should return the digit label provided by the algorithm
        '''

        #Get all the distances
        distances = self.l2_distance(test_point)
        digit_list = []

        #Acquire indexes of min distances
        min_indexes = distances.argsort()[:k]

        #Identify list of digits
        digit_list = self.train_labels[min_indexes]

        #data structure for storing item counts
        DigitCounter = Counter(digit_list.tolist())
        max_num_occurances = max(DigitCounter.values())

        digit = [x[0] for x in DigitCounter.items() if x[1] == max_num_occurances][0]
        return digit

def cross_validation(train_data, train_labels, k_range=np.arange(1,16)):
    '''
    Perform 10-fold cross validation to find the best value for k

    Note: Previously this function took knn as an argument instead of train_data,train_labels.
    The intention was for students to take the training data from the knn object - this 
    should be clearer
    from the new function signature.
    '''
    
    #initialize split object
    kf = KFold(n_splits=10)

    Accuracy_Ktuples = []
    
    for k in k_range:
        print(k)
        Accuracies = []
        
        for train_index, test_index in kf.split(train_data):
            knn = KNearestNeighbor(train_data[train_index], train_labels[train_index])
            Accuracies.append(classification_accuracy(knn,k,
                                        train_data[test_index], train_labels[test_index]))
            
            
        Accuracy_Ktuples.append((k,np.mean(Accuracies).round(3)))
        
    return(Accuracy_Ktuples)


def classification_accuracy(knn, k, eval_data, eval_labels):
    '''
    Evaluate the classification accuracy of knn on the given 'eval_data'
    using the labels
    '''
    predicted_labels = []
    
#     for eval_index in range(len(eval_data)):
#         predicted_labels.append(knn.query_knn(eval_data[eval_index], k))
    predicted_labels = [knn.query_knn(eval_point, k) for eval_point in eval_data]
    
    zipped_preds = zip(eval_labels.tolist(),predicted_labels)
    
    TruePositive = sum([x[0] == x[1] for x in zipped_preds])
    
    Accuracy = TruePositive / float(len(zipped_preds))
    return Accuracy

def main():
    train_data, train_labels, test_data, test_labels = data.load_all_data('data')
    knn = KNearestNeighbor(train_data, train_labels)

    # Example usage:
    trainPredLab1 = np.zeros(len(train_data))
    trainPredLab15 = np.zeros(len(train_data))
    
    #Test Train Accuracy 
    
    for i in range(len(train_data)):
        trainPredLab1[i] = knn.query_knn(train_data[i],1)
        trainPredLab15[i] = knn.query_knn(train_data[i],15)
        
    AccuracyTrainK1 = (trainPredLab1 == train_labels).sum() / len(trainPredLab1)
    AccuracyTrainK15 = (trainPredLab15 == train_labels).sum() / len(trainPredLab15)
    
    print("Accuracy Training Set k = 1 {} ".format(AccuracyTrainK1))
    print("Accuracy Training Set k = 15 {} ".format(AccuracyTrainK15))

    # Test Test accuracy
    
    testPredLab1 = np.zeros(len(test_data))
    testPredLab15 = np.zeros(len(test_data))
        
    for i in range(len(test_data)):
        testPredLab1[i] = knn.query_knn(test_data[i],1)
        testPredLab15[i] = knn.query_knn(test_data[i],15)
        
    AccuracyTestK1 = (testPredLab1 == test_labels).sum() / len(testPredLab1)
    AccuracyTestK15 = (testPredLab15 == test_labels).sum() / len(testPredLab15)
    
    print("Accuracy Test Set k = 1 {} ".format(AccuracyTestK1))
    print("Accuracy Test Set k = 15 {} ".format(AccuracyTestK15))

    
    predicted_label = knn.query_knn(test_data[0], 1)
    
    Accuracy_Ktuples = cross_validation(train_data, train_labels,k_range=np.arange(1,16))
    
    MaxAccuracy = max([x[1] for x in Accuracy_Ktuples])
    
    for i in range(len(Accuracy_Ktuples)):
        print("K is {}, Accuracy is {}".format(Accuracy_Ktuples[i][0],
                                               Accuracy_Ktuples[i][1]))
        
    testClassAcc = np.zeros(15)
    for i in range(15):
        testClassAcc[i] = classification_accuracy(knn, i+1, test_data, test_labels)
        
    
    print('Best K {}'.format([x[0] for x in Accuracy_Ktuples if x[1] == MaxAccuracy][0]))
    
    print(testClassAcc)
    
if __name__ == '__main__':
    main()

Accuracy Training Set k = 1 1.0 
Accuracy Training Set k = 15 0.961 
Accuracy Test Set k = 1 0.96875 
Accuracy Test Set k = 15 0.96075 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
K is 1, Accuracy is 0.966
K is 2, Accuracy is 0.959
K is 3, Accuracy is 0.964
K is 4, Accuracy is 0.962
K is 5, Accuracy is 0.963
K is 6, Accuracy is 0.959
K is 7, Accuracy is 0.958
K is 8, Accuracy is 0.956
K is 9, Accuracy is 0.956
K is 10, Accuracy is 0.955
K is 11, Accuracy is 0.954
K is 12, Accuracy is 0.952
K is 13, Accuracy is 0.952
K is 14, Accuracy is 0.95
K is 15, Accuracy is 0.948
Best K 1
[ 0.96875  0.9645   0.97025  0.969    0.968    0.96775  0.965    0.9645
  0.96225  0.96125  0.9605   0.9605   0.959    0.9595   0.96075]
