In [None]:
# overfitting often caused by having more params than necessary to learn a specific dataset.
# in our case, we have so many params that it can memeorize every detail in training instead
# of learning high-level abstractions. when nns have many params but not very many training 
# examples, overfitting diffi to avoid.

In [None]:
# we used regularization previously to help with overfitting, but we have more tools avail

In [None]:
# overfitting concerned with the ratio between # of weights in the model and the number of 
# datapoints it has to learn. so there's a better method to combat overfitting.
# we use "structure" (loosely defined)

In [None]:
# structure is when we selectively choose to reuse weights for multi purposes b/c we
# believe the same pattern needs to be detected in multiple places (like a 2 vs a 3 in MNIST
# 2 and 3 have certain things in common so we can reuse weights for both). this will reduce
# weight-to-data ratio.

In [None]:
# have to be clever about it though. if we just removed params the model would be less expressive.
# we want equally expressive but more robust to overfitting. (model will be smaller though b/c
# fewer params to store)

In [None]:
# "convolutional layer" - most widely used structure in nns
#    lots of very small linear layers are reused in every position, instead of a single beeg one.
#    our large dense layer had a connection from efvery input to every output, but with our
#    convolutional layer, we instead have lots of very small linear layers, usually with fewer
#    than 25 inputs and a single output, which we use in every input position.
#    each mini layer is called a "convolutional kernel" - but it's basically a baby linear layer with 
#    small # of inputs and a single output. 

In [None]:
# in a 3x3 convolutional kernel, it would predict in its current location (top left) and move 1 pixel
# at a time to the right, then predict again, and so on. once its scanned across the image, it will 
# move down a single pixel and scan back to the left, and then repeating until it has made a pred
# in every possible positin within the image. result will be a smaller square of kernel predictions, 
# which are used in input to the next layer. result will be a 6x6 prediciton matrix.

In [None]:
# if we used 4 diff 3x3 convo kernels looking at the same spot of an 8x8 image of a 2
# each kernel results in a 6x6 pred matrix. so we have 4 6x6 pred matrices.
# (1) we can sum them elementwise (sum pooling)
# (2) take the mean elementwise (mean pooling)
# (3) compute the elementwise max value (max pooling)

# max pooling is most popular. for each position, look into each of the 4 kernel's outputs
# find the max, and copy it into the final 6x6 matrix. this 6x6 matrix will then forward 
# propogate into the next layers

In [None]:
# this technique allows each kernel to learn a particular pattern and then search for the 
# existence of that pattern somewhere in the image.

# a single small set of weights can train over a much larger set of training examples b/c
# even though the dataset hasn't changed, each mini-kernel is forward propagated multiple times
# on multiple segments of data, thus changing the ratio of weights to datapoints on which those 
# weights are being trained. this will drastically reduce the nns ability to overfit training data.
# / increase its ability to generalize

In [18]:
# implement in NumPy

import numpy as np, sys
np.random.seed(1)

from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

images, labels = (x_train[0:1000].reshape(1000, 28*28) / 255, y_train[0:1000])

one_hot_labels = np.zeros((len(labels), 10))
for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = x_test.reshape(len(x_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))
for i, l in enumerate(y_test):
    test_labels[i][l] = 1
    
def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - (output ** 2)

def softmax(x):
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

alpha, iterations = (2, 300)
pixels_per_image, num_labels = (784, 10)
batch_size = 128

input_rows = 28
input_cols = 28

# 16 3x3 -> 16 6x6
kernel_rows = 3
kernel_cols = 3
num_kernels = 16

hidden_size = ((input_rows - kernel_rows) * (input_cols - kernel_cols)) * num_kernels

def get_image_section(layer, row_from, row_to, col_from, col_to):
    '''select a subregion in a batch of images.'''
    section = layer[:,row_from:row_to,col_from:col_to]
    return section.reshape(-1, 1, row_to-row_from, col_to-col_from)



In [19]:
kernels = 0.02*np.random.random((kernel_rows*kernel_cols, num_kernels))-0.01 #kernals.shape == (9, 16)
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels))-0.1

In [20]:
for j in range(iterations):
    correct_cnt = 0
    
    # Iterate over mini-batches
    for i in range(int(len(images) / batch_size)):
        batch_start, batch_end=((i * batch_size), ((i+1)*batch_size))
        layer_0 = images[batch_start:batch_end]
        layer_0 = layer_0.reshape(layer_0.shape[0], 28, 28) # (128, 28, 28)
        
         # Slide a 3x3 kernel over the 28x28 image and extract subregions (sect)
        sects = list()
        for row_start in range(layer_0.shape[1]-kernel_rows):
            for col_start in range(layer_0.shape[2] - kernel_cols):
                sect = get_image_section(layer_0,
                                         row_start,
                                         row_start+kernel_rows,
                                         col_start,
                                         col_start+kernel_cols)
                sects.append(sect)
        
        # Flatten and concatenate all subregions to form the input for the next layer
        expanded_input = np.concatenate(sects,axis=1)
        es = expanded_input.shape #(128, 625, 3, 3)
        flattened_input = expanded_input.reshape(es[0]*es[1], -1) #(80000, 9)
        
        # Project the flattened input through the kernels to form the inputs for the next layer
        kernel_output = flattened_input.dot(kernels)
        layer_1 = tanh(kernel_output.reshape(es[0], -1))
        
        # Implement dropout: Randomly zero out some of the units in layer_1 to prevent overfitting
        dropout_mask = np.random.randint(2,size=layer_1.shape)
        layer_1 *= dropout_mask * 2
        
        # Calculate outputs of final layer by projecting layer_1 through the second set of weights, followed by softmax activation
        layer_2 = softmax(np.dot(layer_1, weights_1_2))
        
        # Increment correct count if the highest-scoring output unit matches the target label
        for k in range(batch_size):
            labelset = labels[batch_start+k:batch_start+k+1]
            _inc = int(np.argmax(layer_2[k:k+1]) == np.argmax(labelset))
            correct_cnt += _inc
        
        # backprop
        # Compute deltas for output and hidden layers and update weights    
        layer_2_delta = (labels[batch_start:batch_end]-layer_2) / (batch_size * layer_2.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1)
        layer_1_delta *= dropout_mask
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        l1d_reshape = layer_1_delta.reshape(kernel_output.shape)
        k_update = flattened_input.T.dot(l1d_reshape)
        kernels -= alpha * k_update
        
    test_correct_cnt = 0
    
    for i in range(len(test_images)):
        
        layer_0 = test_images[i:i+1]
        layer_0 = layer_0.reshape(layer_0.shape[0], 28, 28)
        layer_0.shape
        
        sects = list()
        for row_start in range(layer_0.shape[1] - kernel_rows):
            for col_start in range(layer_0.shape[2] - kernel_cols):
                sect = get_image_section(layer_0,
                                         row_start,
                                         row_start+kernel_rows,
                                         col_start,
                                         col_start+kernel_cols)
                sects.append(sect)
        
        expanded_input = np.concatenate(sects, axis=1)
        es = expanded_input.shape
        flattened_input = expanded_input.reshape(es[0]*es[1],-1)
        
        kernel_output = flattened_input.dot(kernels)
        layer_1 = tanh(kernel_output.reshape(es[0],-1))
        layer_2 = np.dot(layer_1, weights_1_2)
        
        test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))
        
    if (j % 1 == 0):
        sys.stdout.write("\n" + \
                         "i:" + str(j) + \
                         " test-acc:" + str(test_correct_cnt/float(len(test_images))) + \
                         " train-acc:" + str(correct_cnt/float(len(images))))
        


i:0 test-acc:0.0288 train-acc:0.055
i:1 test-acc:0.0273 train-acc:0.037
i:2 test-acc:0.028 train-acc:0.037
i:3 test-acc:0.0292 train-acc:0.04
i:4 test-acc:0.0339 train-acc:0.046
i:5 test-acc:0.0478 train-acc:0.068
i:6 test-acc:0.076 train-acc:0.083
i:7 test-acc:0.1316 train-acc:0.096
i:8 test-acc:0.2137 train-acc:0.127
i:9 test-acc:0.2941 train-acc:0.148
i:10 test-acc:0.3563 train-acc:0.181
i:11 test-acc:0.4023 train-acc:0.209
i:12 test-acc:0.4358 train-acc:0.238
i:13 test-acc:0.4473 train-acc:0.286
i:14 test-acc:0.4389 train-acc:0.274
i:15 test-acc:0.3951 train-acc:0.257
i:16 test-acc:0.2222 train-acc:0.243
i:17 test-acc:0.0613 train-acc:0.112
i:18 test-acc:0.0266 train-acc:0.035
i:19 test-acc:0.0127 train-acc:0.026
i:20 test-acc:0.0133 train-acc:0.022
i:21 test-acc:0.0185 train-acc:0.038
i:22 test-acc:0.0363 train-acc:0.038
i:23 test-acc:0.0928 train-acc:0.067
i:24 test-acc:0.1994 train-acc:0.081
i:25 test-acc:0.3086 train-acc:0.154
i:26 test-acc:0.4276 train-acc:0.204
i:27 test-acc

i:219 test-acc:0.8818 train-acc:0.797
i:220 test-acc:0.8793 train-acc:0.799
i:221 test-acc:0.8789 train-acc:0.815
i:222 test-acc:0.8791 train-acc:0.816
i:223 test-acc:0.8793 train-acc:0.809
i:224 test-acc:0.8814 train-acc:0.795
i:225 test-acc:0.8798 train-acc:0.799
i:226 test-acc:0.8805 train-acc:0.806
i:227 test-acc:0.88 train-acc:0.808
i:228 test-acc:0.8782 train-acc:0.801
i:229 test-acc:0.8802 train-acc:0.814
i:230 test-acc:0.8807 train-acc:0.8
i:231 test-acc:0.8809 train-acc:0.798
i:232 test-acc:0.8805 train-acc:0.82
i:233 test-acc:0.8795 train-acc:0.794
i:234 test-acc:0.8807 train-acc:0.806
i:235 test-acc:0.8806 train-acc:0.808
i:236 test-acc:0.8787 train-acc:0.802
i:237 test-acc:0.8796 train-acc:0.81
i:238 test-acc:0.8766 train-acc:0.805
i:239 test-acc:0.8781 train-acc:0.792
i:240 test-acc:0.8787 train-acc:0.809
i:241 test-acc:0.8762 train-acc:0.802
i:242 test-acc:0.8775 train-acc:0.811
i:243 test-acc:0.8804 train-acc:0.814
i:244 test-acc:0.8794 train-acc:0.804
i:245 test-acc:0.8

In [None]:
 # reshape       
    #     pretend each individual subregion is its own image.
        #     if we have batch size of 8 images, and 100 subregions per image, 
        #         then we pretend it was a batch of 800 smaller images.
        #     forward propogating them through a linear layer with one output neuron is the same as predicting
        #     that linear layer over every subregion in every batch.
        #     if we instead forward propagated using a linear layer with n output neurons, it will generate
        #     the outputs that are the same as predicting n linear layers (kernels) in every pos of the image.
        #     1 output neuron is both simpler and faster. 
        #     1 output neuron is akin to predicting a single propertty/feature in each subregion
        #     n output neurons is like predicting n features/values for each subregion.
        #     using more neurons in o0utput layer allows model to potentially map more complex features or combinations
        #     of features from the subregions. however, it comes at cost of requirng more data for robust training and
        #     being more expensive.

In [None]:
# when a nn needs to use the same idea in multiple places, endeavor to use the same weights in both places.