### NearestNeighbor classification

In [1]:
# Import packages
import pickle
import numpy as np

In [2]:
# Open dataset file
def unpickle(file):
    with open(file, 'rb') as f:
        data = pickle.load(f, encoding='bytes')
    return data

In [3]:
# Use dataset and divide to train set and test set
def load_CIFAR10(pos, n_chunks=1):
    '''
    Use data_batches (from data_batch_1) as training set and test_batch as test set
    '''
    Xtr = []
    Ytr = []
    for i in range(n_chunks):
        train = unpickle(pos + '/data_batch_{}'.format(i + 1))
        Xtr.extend(train[b'data'])
        Ytr.extend(train[b'labels'])
    test = unpickle(pos + '/test_batch')
    Xte = test[b'data']
    Yte = test[b'labels']
    return Xtr, Ytr, Xte, Yte

In [4]:
Xtr, Ytr, Xte, Yte = load_CIFAR10('cifar-10-batches-py')

In [5]:
# Check type
print(type(Xtr))
print(type(Ytr))
print(type(Xte))
print(type(Yte))

<class 'list'>
<class 'list'>
<class 'numpy.ndarray'>
<class 'list'>


In [6]:
# Change list types to numpy array type
Xtr = np.array(Xtr)
Ytr = np.array(Ytr)
Yte = np.array(Yte)
# Get shape
print(Xtr.shape)
print(Ytr.shape)
print(Xte.shape)
print(Yte.shape)

(10000, 3072)
(10000,)
(10000, 3072)
(10000,)


In [7]:
# Reshape (Seems pointless but explains how to use reshape method)
Xtr_rows = Xtr.reshape(Xtr.shape[0], 32 * 32 * 3)
Xte_rows = Xte.reshape(Xte.shape[0], 32 * 32 * 3)
print(Xtr_rows.shape)
print(Xte_rows.shape)

(10000, 3072)
(10000, 3072)


In [8]:
# Validation set
Xval_rows = Xtr_rows[:1000, :]
Yval = Ytr[:1000]
# Cut off training set because it will take a very long time
# Probably the best to use all
Xtr_rows = Xtr_rows[8000:, :]
Ytr = Ytr[8000:]
print(Xval_rows.shape)
print(Yval.shape)
print(Xtr_rows.shape)
print(Ytr.shape)

(1000, 3072)
(1000,)
(2000, 3072)
(2000,)


In [9]:
# Distribution
def compute_dist(X, point, dist_type):
    if dist_type == 'l1':
        # Manhattan norm
        return np.sum(np.abs(X - point), axis=1)
    elif dist_type == 'l2':
        # Euclidean norm
        return np.sqrt(np.sum(np.square(X - point), axis=1))

In [10]:
# Predict point
def predict_point(distances, ytr, k):
    if k == 1:
        min_index = np.argmin(distances)
        return ytr[min_index]
    elif k > 1:
        min_indices = np.argpartition(distances, k)[:k]
        labels = np.array([ytr[i] for i in min_indices])
        return np.argmax(np.bincount(labels))

In [11]:
class NearestNeighbor(object):
    def __init__(self):
        pass
    
    def train(self, X, y):
        self.Xtr = X
        self.ytr = y
        
    def predict(self, X, k=1, dist_type='l1'):
        num_test = X.shape[0]
        Ypred = np.zeros(num_test, dtype=self.ytr.dtype)
        
        for i in range(num_test):
            distances = compute_dist(self.Xtr, X[i,:], dist_type)
            Ypred[i] = predict_point(distances, self.ytr, k)
        return Ypred

In [12]:
validation_accuracies = []
for k in [1, 3, 5, 10, 20, 50, 100]:
    for dist_type in ['l1', 'l2']:
        nn = NearestNeighbor()
        nn.train(Xtr_rows, Ytr)
        
        Yval_predict = nn.predict(Xval_rows, k=k, dist_type=dist_type)
        acc = np.mean(Yval_predict == Yval)
        print('k: %d, dist_type: %s, accuracy: %f' % (k, dist_type, acc))

k: 1, dist_type: l1, accuracy: 0.202000
k: 1, dist_type: l2, accuracy: 0.169000
k: 3, dist_type: l1, accuracy: 0.177000
k: 3, dist_type: l2, accuracy: 0.178000
k: 5, dist_type: l1, accuracy: 0.181000
k: 5, dist_type: l2, accuracy: 0.186000
k: 10, dist_type: l1, accuracy: 0.187000
k: 10, dist_type: l2, accuracy: 0.180000
k: 20, dist_type: l1, accuracy: 0.181000
k: 20, dist_type: l2, accuracy: 0.195000
k: 50, dist_type: l1, accuracy: 0.173000
k: 50, dist_type: l2, accuracy: 0.196000
k: 100, dist_type: l1, accuracy: 0.167000
k: 100, dist_type: l2, accuracy: 0.210000


#### Tryout Ensemble (Unrelated to above)

In [13]:
# Import classifier packages
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [14]:
# Generate a random n-class classification problem
# 1000 samples, 4 features, 0 informative features, 0 redundant features
# np.random RNG, 0 classes, no shuffling sample or feature
X, y = make_classification(n_samples=1000, n_features=4, n_informative=2,
                           n_redundant=0, random_state=0, n_classes=2,
                           shuffle=False)

In [15]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)
print(clf.feature_importances_)
# Predict [0, 0, 0, 0](feature1=0, f2=0, f3=0, f4=0) class
print(clf.predict([[0, 0, 0, 0]]))

[ 0.17287856  0.80608704  0.01884792  0.00218648]
[1]
