In [1]:
import cPickle as pickle
import numpy as np
import pandas as pd
import os

def load_CIFAR_batch(filename):
  """Load a single pickled CIFAR batch file.

  Args:
    filename: path to a pickled dict holding a 'data' entry (one row of
      3*32*32 values per image) and a 'labels' entry.

  Returns:
    (X, Y): X is a float array of shape (N, 32, 32, 3), Y is an array
    built from the labels. N is inferred from the data, so batches of any
    size are accepted (not only 10000 images).
  """
  # NOTE(security): unpickling can execute arbitrary code; only load batch
  # files that come from a trusted source.
  with open(filename, 'rb') as f:
    datadict = pickle.load(f)
  # Each row is read as 3 planes of 32x32 values; the transpose moves the
  # plane axis last so a single image is indexed as [row, col, channel].
  X = np.asarray(datadict['data'])
  X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
  Y = np.array(datadict['labels'])
  return X, Y

def load_CIFAR10(ROOT):
  """Load the five training batches and the test batch found under ROOT.

  Reads ROOT/data_batch_1 .. ROOT/data_batch_5 in order, concatenates them
  into the training arrays, then reads ROOT/test_batch.

  Returns:
    (Xtr, Ytr, Xte, Yte): training images/labels and test images/labels.
  """
  train_paths = [os.path.join(ROOT, 'data_batch_%d' % (idx, ))
                 for idx in range(1, 6)]
  batches = [load_CIFAR_batch(path) for path in train_paths]
  Xtr = np.concatenate([images for images, _ in batches])
  Ytr = np.concatenate([labels for _, labels in batches])
  test_path = os.path.join(ROOT, 'test_batch')
  Xte, Yte = load_CIFAR_batch(test_path)
  return Xtr, Ytr, Xte, Yte

In [2]:
# Directory holding the extracted CIFAR-10 python batches. Set the
# CIFAR10_DIR environment variable to point at a different checkout; the
# default is the path this notebook was originally run with.
CIFAR10_DIR = os.environ.get(
    'CIFAR10_DIR',
    '/home/patrik/git/cs231n.github.io/assignments2016/assignment1/cs231n/datasets/cifar-10-batches-py/')
Xtr, Ytr, Xte, Yte = load_CIFAR10(CIFAR10_DIR)
# flatten out all images to be one-dimensional; -1 lets numpy infer the
# per-image size (32 * 32 * 3 = 3072) from the array itself
Xtr_rows = Xtr.reshape(Xtr.shape[0], -1) # Xtr_rows becomes 50000 x 3072
Xte_rows = Xte.reshape(Xte.shape[0], -1) # Xte_rows becomes 10000 x 3072

In [69]:
## Define the NearestNeighbor classifier class
import numpy as np

class NearestNeighbor(object):
  """k-nearest-neighbor classifier using the L1 distance
  (sum of absolute value differences)."""

  def __init__(self):
    pass

  def train(self, X, y):
    """Remember the training data.

    X is N x D where each row is an example. y is 1-dimensional of size N.
    """
    # the nearest neighbor classifier simply remembers all the training data
    self.Xtr = X
    self.ytr = y

  def predict(self, X, k=1):
    """Predict a label for every row of X by majority vote of its k nearest
    training examples.

    Args:
      X: N x D array where each row is an example we wish to predict a
        label for.
      k: number of neighbors that vote; defaults to 1 (plain nearest
        neighbor). Values larger than the training set use every example.

    Returns:
      Array of N predicted labels with the same dtype as the training labels.

    Raises:
      ValueError: if k is smaller than 1.
    """
    if k < 1:
      raise ValueError('k must be a positive integer, got %r' % (k,))

    num_test = X.shape[0]
    # lets make sure that the output type matches the input type
    Ypred = np.zeros(num_test, dtype=self.ytr.dtype)

    # loop over all test rows
    for i in range(num_test):
      # L1 distance from the i'th test row to every training row.
      # NOTE(review): with unsigned integer inputs this subtraction would
      # wrap around; pass float (or signed) arrays.
      dist = np.sum(np.abs(self.Xtr - X[i, :]), axis=1)
      # Indices of the k closest training rows. mergesort is stable, so
      # equal distances are resolved in favor of the lower training index.
      nearest = np.argsort(dist, kind='mergesort')[:k]
      labels, counts = np.unique(self.ytr[nearest], return_counts=True)
      # np.unique returns sorted labels and argmax returns the first
      # maximum, so a tied vote goes to the smallest label.
      Ypred[i] = labels[np.argmax(counts)]

    return Ypred


In [70]:
# Build the classifier and hand it the flattened training images and labels.
nn = NearestNeighbor()
nn.train(X=Xtr_rows, y=Ytr)

In [59]:
# k-nearest-neighbors implementation changes (scratch work, kept commented out)
#dist=np.sum(np.abs(nn.Xtr-Xte_rows[0,:]),1)
#dist2=np.vstack((np.asarray(range(dist.shape[0])),dist)).T
#ind=dist2[dist2[:,1].argsort()][:10,0]
#num,counts=np.unique(nn.ytr[np.intp(ind)], return_counts=True)
#(num,counts)
#num[max(counts)==counts][0]



(array([2, 3, 4, 5, 6]), array([3, 1, 3, 2, 1]))

In [75]:
# Classify the first 10 test rows with k=40, then show the true labels
# (first row) stacked above the predictions (second row).
Yte_predict = nn.predict(Xte_rows[:10], k=40)
np.vstack((Yte[:10], Yte_predict))

array([[3, 8, 8, 0, 6, 6, 1, 6, 3, 1],
       [2, 8, 8, 8, 4, 6, 6, 4, 2, 8]])

In [76]:
# Fraction of the first 10 test labels that the prediction matched.
print('accuracy: %f' % np.mean(Yte[:10] == Yte_predict))

accuracy: 0.300000


In [78]:
# The first NUM_HELD_OUT training examples are removed from the training set,
# but only the first NUM_VAL of them are actually used for validation
# (presumably to keep the slow per-row kNN loop short -- TODO confirm).
NUM_HELD_OUT = 1000
NUM_VAL = 30

# Only split while Xtr_rows still has as many rows as the untouched Xtr, so
# re-running this cell does not strip another NUM_HELD_OUT rows each time.
if Xtr_rows.shape[0] == Xtr.shape[0]:
  Xval_rows = Xtr_rows[:NUM_VAL, :] # first 30 rows are the validation set
  Yval = Ytr[:NUM_VAL]
  Xtr_rows = Xtr_rows[NUM_HELD_OUT:, :] # keep last 49,000 for train
  Ytr = Ytr[NUM_HELD_OUT:]

# train() only stores references to the data, so one classifier can be
# shared by every value of k instead of being rebuilt inside the loop
nn = NearestNeighbor()
nn.train(Xtr_rows, Ytr)

# find hyperparameters that work best on the validation set
validation_accuracies = []
for k in [1, 3, 5, 10, 20, 50, 100]:

  # use a particular value of k and evaluate on the validation data
  # (relies on NearestNeighbor.predict accepting k as input)
  Yval_predict = nn.predict(Xval_rows, k = k)
  acc = np.mean(Yval_predict == Yval)
  print('accuracy: %f' % (acc,))

  # keep track of what works on the validation set
  validation_accuracies.append((k, acc))

accuracy: 0.200000
accuracy: 0.233333
accuracy: 0.166667
accuracy: 0.266667
accuracy: 0.300000
accuracy: 0.333333
accuracy: 0.333333
