In this lab, you should try to implement some of the techniques discussed in the lecture.
Here is a list of reasonable tasks.

Must implement:
 * Log-loss
 
Easy:
 * L1 and L2 regularization (you can choose one)
 * momentum, Nesterov's momentum (you can choose one)

Medium difficulty:
 * Adagrad, RMSProp (you can choose one) - not much harder than momentum, really
 * dropout

Hard (and time-consuming):
 * batch-normalization

Try to test your network to see if these changes improve accuracy. They improve accuracy much more if you increase the layer size, and if you add more layers, say 1 or 2.

In [3]:
import random
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import timeit

In [4]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), applied elementwise."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Return the derivative of the sigmoid evaluated at z.

    Uses the identity sigmoid'(z) = sigmoid(z) * (1 - sigmoid(z)).
    """
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s*(1-s)

In [5]:
# Load the MNIST data sets from the "MNIST_data/" directory, requesting
# one-hot encoded labels (one_hot=True).
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [6]:
# Inspect the shape of the training image array.
mnist.train.images.shape

(55000, 784)

In [33]:
class Network(object):
    """Fully connected feed-forward network with sigmoid activations.

    The network is trained with mini-batch gradient descent. Gradients are
    computed by a batched backpropagation pass in which every column of the
    activation matrices corresponds to one sample of the mini-batch.
    """

    def __init__(self, sizes):
        """Create the layers with standard-normal random parameters.

        sizes -- list of layer widths, input layer first, e.g. [784, 30, 10].
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One (n_out, 1) bias column per non-input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # Weights are indexed by target node first: shape (n_out, n_in).
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Run the network on a batch and return the output activations.

        a -- array with one sample per row; it is transposed internally so
             the result has one sample per column.
        """
        a = a.T
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.matmul(w, a) + b)
        return a

    def update_mini_batch(self, mini_batch, eta):
        """Apply one gradient-descent step computed on a mini-batch.

        mini_batch -- pair (inputs, targets), one sample per row.
        eta        -- learning rate; the step is divided by the batch size.
        """
        # A single backprop call handles the whole batch (samples as columns).
        nabla_b, nabla_w = self.backprop(mini_batch[0].T, mini_batch[1].T)

        # float() keeps an integer eta from being truncated to 0 by
        # Python 2 integer division.
        step = float(eta) / len(mini_batch[0])
        self.weights = [w - step * nw
                        for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b - step * nb
                       for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return (bias_gradients, weight_gradients) summed over a batch.

        x -- inputs with one sample per column.
        y -- targets with one sample per column.

        Both returned lists are ordered from the first layer to the last and
        match the shapes of self.biases and self.weights.
        """
        # Forward pass: remember the activations of every layer.
        g = x
        gs = [g]  # activations, layer by layer (gs[0] is the input)
        fs = []   # pre-activations, layer by layer
        for b, w in zip(self.biases, self.weights):
            f = np.dot(w, g) + b
            fs.append(f)
            g = sigmoid(f)
            gs.append(g)

        # Backward pass: walk the layers from last to first. Reversing each
        # sequence separately works on both Python 2 and Python 3 (zip is
        # not reversible on Python 3).
        dLdg = self.cost_derivative(gs[-1], y)
        dLdfs = []
        for w, act in zip(reversed(self.weights), reversed(gs[1:])):
            # act * (1 - act) is the sigmoid derivative expressed through
            # the stored activation.
            dLdf = np.multiply(dLdg, np.multiply(act, 1 - act))
            dLdfs.append(dLdf)
            dLdg = np.matmul(w.T, dLdf)
        dLdfs.reverse()  # back to first-layer-first order

        # The matrix product sums the per-sample weight gradients implicitly.
        dLdWs = [np.matmul(dLdf, prev.T)
                 for dLdf, prev in zip(dLdfs, gs[:-1])]
        # Bias gradients have to be summed over the batch axis explicitly.
        dLdBs = [np.sum(dLdf, axis=1, keepdims=True) for dLdf in dLdfs]

        return (dLdBs, dLdWs)

    def evaluate(self, test_data):
        """Count the correctly classified samples in test_data.

        test_data -- pair (inputs, one-hot targets), one sample per row.
        """
        pred = np.argmax(self.feedforward(test_data[0]), axis=0)
        corr = np.argmax(test_data[1], axis=1)
        return np.sum(pred == corr)

    def cost_derivative(self, output_activations, y):
        """Return the derivative of the cost w.r.t. the output activations."""
        return (output_activations - y)

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network with mini-batch stochastic gradient descent.

        training_data   -- object exposing .images and .next_batch(size).
        epochs          -- number of passes over the training data.
        mini_batch_size -- number of samples per gradient step.
        eta             -- learning rate.
        test_data       -- optional object with the same interface; when
                           given, the accuracy on it is printed every epoch.
        """
        has_test = test_data is not None
        train_size = training_data.images.shape[0]
        if has_test:
            test_size = test_data.images.shape[0]
        for j in range(epochs):
            t1 = timeit.default_timer()
            # Floor division: a trailing partial batch is skipped.
            for _ in range(train_size // mini_batch_size):
                self.update_mini_batch(
                    training_data.next_batch(mini_batch_size), eta)
            if has_test:
                hits = [self.evaluate(test_data.next_batch(mini_batch_size))
                        for _ in range(test_size // mini_batch_size)]
                # Mean number of hits per batch divided by the batch size
                # gives the accuracy as a fraction.
                res = np.mean(hits) / mini_batch_size
                t2 = timeit.default_timer()
                print("Epoch {0}: {1}  in {2}".format(j, res, t2 - t1))
            else:
                print("Epoch {0} complete".format(j))


# Build a network with 784 inputs, one hidden layer of 30 units and 10 outputs.
network = Network([784,30,10])
# Train for 10 epochs with mini-batches of 200 samples and learning rate 3.0,
# printing the accuracy on the MNIST test set after every epoch.
network.SGD(mnist.train,epochs=10,mini_batch_size=200,eta=3.0,test_data=mnist.test)



Epoch 0: 0.579  in 0.785964012146
Epoch 1: 0.6501  in 0.875180006027
Epoch 2: 0.6895  in 0.773838043213
Epoch 3: 0.7035  in 0.757263183594
Epoch 4: 0.7608  in 0.792301893234
Epoch 5: 0.7914  in 0.790660142899
Epoch 6: 0.8641  in 0.792309045792
Epoch 7: 0.8881  in 0.805778980255
Epoch 8: 0.896  in 0.800841093063
Epoch 9: 0.903  in 1.17869400978
