In [None]:
# activation functions must be  (1) continuous and (2) monotonic (never changing direction)
# for ex a parabala (y = x * x) would be bad b/c multiple values of x return the same y 
# so there would be multiple "correct" answers for weights. this is bad.

In [None]:
# should also be (3) non-linear i.e they squiggle or turn (like relu)

In [None]:
# if function was linear, it would just scale weighted averages coming in - this linear activation
# doesn't allow one weight to affect how correlated the neuron is to the other weights. we need
# selective correlation - given a neuron with an activation function we want one incoming signal to be 
# able to increase or decrease how correlated the neuron is to all the other incoming signals. all
# curved lines do this (to varying degrees)

In [None]:
# (4) activation function and its derivative should be efficiently computable since we'll be calling
# the function a ton

In [None]:
# standard hidden layer activation functions
#    sigmoid - let's you interpret the output of any individual neuron as a probability ((0, 1) output)
#    tanh (better than sigmoid for hidden layers) - basically a sigmoid except it's between ((-1, 1) output)
#        this means it can also throw in some negative correlation. not as useful for output layers unless
#        our output should be in the range (-1, 1)

In [None]:
# standard output layer activation functions
# main focus is ensuring the output nonlinearity can predict the right answers. signmoid or tanh obvi wouldn't
# be appropriate for something like temperature

#    raw values (no activation function) - appropriate if we want to train a NN to transform 1 matrix into another
#        where output is something other than a probability (like avg temperature in Colorado or something)
#    sigmoid
#    softmax (most common) - the more likely it's one label, the less likely it's any of the other labels
#        like for MNIST digit classifier - le'ts say we're looking at a 9
#        labels   0   1   2   3   4   5   6   7   8   9
#        raw      0   0   0   0   0   0   0   0   0   0
#        sigmoid .5  .5  .5  .5  .5  .5  .5  .5  .5  .99 (eww)
#        softmax  0   0   0   0   0   0   0   0   0   1  (always sums to 1)

#        even though it looks sigmoid predicts nearly perfectly, it will backprop a ton of error b/c for sigmoid
#        to reach 0 error, it doesn't just have to predict the highest positive number for the true output, it
#        has to predict 0 everywhere else. where softmax asks "which digit seems like the best fit for this input"
#        sigmoid says "it's only digit 9 and doesn't have anything in common with the other MNIST digits"

#        we want output that won't penalize labels that are similar, instead we want it to pay attention to all
#        the info that can be indicative of any potential input.

In [None]:
# activation in forward prop pretty straight forward (just pass the layer_0 * weights_0_1 into the activation function)
# backprop activation more nuanced. to generate layer_1 delta, we have to multiply the backpropagated delta from
# layer 2 (layer_2_delta.dot(weights_1_2.T)) by the slope (derivative) at the point predicited in forward prop.
# for pos numbers delta slope will be 1 and neg numbers will be 0 for relu

# what about for other activation functions? for sigmoid, slope get's steeper for values approaching 0 from either direction
# and slope approaches 0 as we move towards higher pos and lower neg values, so delta will matter much more for values
# closer to 0 and values further from 0 get multipled by a slope of closer to 0 so delta closer to 0. this also
# creates a notion of stickiness - weights that have previously been updated a lot in one direction confidently 
# these nonlinearities help make it harder for occasional erroneous training examples to corrupt intelligence
# that has been reinforced many times.


In [None]:
# converting output to slope (derivative)

# most nonlinearities (all the popular ones) use a method for computing derivative that's a lil strange.
# instead of computing derivative at a certain point on the curve the normal way, most great activation functions
# have a means by which the output of the layer (at forward prop) can be used to compute the derivative.

In [None]:
# let's try using tanh for hidden layer activation and softmax for output-layer activation on MNIST now.
# we'll need to initialize our weights differently for tanh. relu is okay with (-.1, 0.1) but tanh
# likes narrower values like (-0.01, 0.01) so we'll just scale down our weights from the previous example.

# BUT because we're using softmax now, error should be calculated with a different error function... not
# ready for that yet so we'll just remove lines to compute error fer now.

# also need to adjust alpha

In [12]:
import numpy as np, sys
np.random.seed(1)

from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

images, labels = (x_train[0:1000].reshape(1000, 28*28) / 255, y_train[0:1000])

one_hot_labels = np.zeros((len(labels), 10))
for i,l in enumerate(labels):
    one_hot_labels[i][l] = 1
labels = one_hot_labels

test_images = x_test.reshape(len(x_test), 28*28) / 255
test_labels = np.zeros((len(y_test), 10))
for i,l in enumerate(y_test):
    test_labels[i][l] = 1

def tanh(x):
    return np.tanh(x)

def tanh2deriv(output):
    return 1 - (output ** 2)

def softmax(x):
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

alpha, iterations, hidden_size = (2, 300, 100) # alpha diff
pixels_per_image, num_labels = (784, 10)
batch_size = 100

weights_0_1 = 0.02*np.random.random((pixels_per_image, hidden_size)) -0.01 # scale to narrower range for tanh activation
weights_1_2 = 0.2*np.random.random((hidden_size, num_labels)) - 0.1 #same range as before is fine

In [13]:
for j in range(iterations):
    correct_cnt = 0
    for i in range(int(len(images) / batch_size)):
        batch_start, batch_end = ((i * batch_size), ((i+1)*batch_size)) # train in batches
        layer_0 = images[batch_start:batch_end]
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        dropout_mask = np.random.randint(2, size=layer_1.shape) # randomly turn off nodes
        layer_1 *= dropout_mask * 2
        layer_2 = softmax(np.dot(layer_1, weights_1_2)) #softmax baybee
        
        for k in range(batch_size):
            correct_cnt += int(np.argmax(layer_2[k:k+1]) == \
                               np.argmax(labels[batch_start+k:batch_start+k+1]))
        
        layer_2_delta = (labels[batch_start:batch_end]-layer_2) / (batch_size * layer_2.shape[0])
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * tanh2deriv(layer_1) # use layer 1 output to calc deriv
        layer_1_delta *= dropout_mask
            
        weights_1_2 += alpha * layer_1.T.dot(layer_2_delta)
        weights_0_1 += alpha * layer_0.T.dot(layer_1_delta)
    
    test_correct_cnt = 0
    for i in range(len(test_images)):
        layer_0 = test_images[i:i+1]
        layer_1 = tanh(np.dot(layer_0, weights_0_1))
        layer_2 = np.dot(layer_1, weights_1_2)  # no softmax?
        test_correct_cnt += int(np.argmax(layer_2) == np.argmax(test_labels[i:i+1]))
            
    if(j % 10 == 0):
        sys.stdout.write("\n"+ "I:" + str(j) + \
                         " test-acc:" + str(test_correct_cnt/float(len(test_images)))+\
                         " train-acc:" + str(correct_cnt/float(len(images))))
        


I:0 test-acc:0.394 train-acc:0.156
I:10 test-acc:0.6867 train-acc:0.723
I:20 test-acc:0.7025 train-acc:0.732
I:30 test-acc:0.734 train-acc:0.763
I:40 test-acc:0.7663 train-acc:0.794
I:50 test-acc:0.7913 train-acc:0.819
I:60 test-acc:0.8102 train-acc:0.849
I:70 test-acc:0.8228 train-acc:0.864
I:80 test-acc:0.831 train-acc:0.867
I:90 test-acc:0.8364 train-acc:0.885
I:100 test-acc:0.8407 train-acc:0.883
I:110 test-acc:0.845 train-acc:0.891
I:120 test-acc:0.8481 train-acc:0.901
I:130 test-acc:0.8505 train-acc:0.901
I:140 test-acc:0.8526 train-acc:0.905
I:150 test-acc:0.8555 train-acc:0.914
I:160 test-acc:0.8577 train-acc:0.925
I:170 test-acc:0.8596 train-acc:0.918
I:180 test-acc:0.8619 train-acc:0.933
I:190 test-acc:0.863 train-acc:0.933
I:200 test-acc:0.8642 train-acc:0.926
I:210 test-acc:0.8653 train-acc:0.931
I:220 test-acc:0.8668 train-acc:0.93
I:230 test-acc:0.8672 train-acc:0.937
I:240 test-acc:0.8681 train-acc:0.938
I:250 test-acc:0.8687 train-acc:0.937
I:260 test-acc:0.8684 train-