# 1. k Nearest Neighbor

## 1.1 Size of k vs n
Pros and Cons of size of k:
- Small k
    - Good at capturing fine-grained patterns
    - may overfit
- Large k
    - Smoothed/average stable prediction
    - may underfit
- Rule: $k < \sqrt{n}$ where $n$ is the number of training examples

## 1.2 Essentials

- We note that kNN does not learn! The classification/regression is run at test time
- Curse of dimensionality. Let the input dimention be $d$ and the test sample size be $N$. Then each computation for an input consists of $O(dN)$ operators. Sorting costs $N \log(N)$ after computing the minimum distance.

## 1.3 A simple implementation
http://cs231n.github.io/classification/#nn

### 1NN

In [93]:
# Nearest Neighbor implementation
# 1 NN

import numpy as np

class NearestNeighbor(object):
    
    def __init__(self):
        pass
    
    def train(self, X, y): 
        self.Xtr = X # X is N times D
        self.ytr = y # y is N times 1
    
    def predict(self, X): # X is M times D, each row needs a prediction
        num_test = X.shape[0]
        
        ypred = np.empty(num_test, dtype=self.ytr.dtype) # initialize ypred to store the prediction
        
        for i in range(num_test):
            distance = np.sum(np.square(Xtr-X[i,:]), axis=1)
            ypred[i] = self.ytr[np.argmin(distance)] # find the closest one
        
        return ypred 

In [124]:
# Simple example
e = 0 # initialize error to 0 for the test

# do 100 tests
for i in range(100):
    nn = NearestNeighbor()
    
    # test case:
    # random uniform distribution on (0,1)
    # label = 1 if the random number x is 0.2 < x < 0.5.
    # label = 0 otherwise
    Xtr = np.random.random(1000) 
    ytr = (1*(Xtr < 0.5)) * (1*(0.2 < Xtr))

    nn.train(Xtr, ytr)

    X = np.random.random(20000)
    y = (1*(X < 0.5)) * (1 *(0.2 < X))

    Xtr= Xtr[:, np.newaxis]
    X = X[:, np.newaxis]

    ypred = nn.predict(X)
    e += len(ypred[ypred != y])/len(y) # number of wrong classifications
    
print(e/100) # print the error averaged over 100 tests

0.000981


### kNN

In [118]:
# Nearest Neighbor implementation
# k NN

import numpy as np

class kNN(object):
    
    def __init__(self, k=1): # k is assumed to be 1 if omiited
        self.k = k
    
    def train(self, X, y): 
        self.Xtr = X # X is N times D
        self.ytr = y # y is N times 1
    
    def predict(self, X): # X is M times D, each row needs a prediction
        num_test = X.shape[0]
        
        ypred = np.empty(num_test, dtype=self.ytr.dtype)
        
        for i in range(num_test):
            distance = np.sum(np.square(self.Xtr-X[i,:]), axis=1)
            idx = np.argpartition(distance, self.k)[:self.k]      
            ypred[i] = np.mean(self.ytr[idx])
        
        return ypred 

In [125]:
# Simple example
e = 0

for i in range(100):
    nn = kNN(15)
    Xtr = np.random.random(1000)
    ytr = (1*(Xtr < 0.5)) * (1*(0.2 < Xtr))

    nn.train(Xtr, ytr)

    X = np.random.random(20000)
    y = (1*(X < 0.5)) * (1 *(0.2 < X))

    Xtr= Xtr[:, np.newaxis]
    X = X[:, np.newaxis]

    ypred = nn.predict(X)
    e += len(ypred[ypred != y])/len(y) # error
    
print(e/100)

0.013764
