In [None]:
# During training, the data for each epoch is broken into mini-batches, and for every mini-batch we
# update the network's weights and biases using the update_mini_batch function. Let's look closer at what this is.
# Intuitively, we want to gradually move the weights and biases towards the values that produce the
# correct output from the whole network. We do this using gradient descent and backpropagation.
# The rate at which we move towards this optimum depends on the learning rate, eta.

def update_mini_batch(mini_batch, eta):
    """Take one gradient-descent step on the module-level weights and biases.

    The gradient of the cost is obtained from ``backprop`` for every
    ``(x, y)`` pair in ``mini_batch``. The per-example gradients are summed
    and then divided by the batch size, so the step uses the *average*
    gradient over the mini-batch. That is why the gradients are accumulated
    in the loop: the cost of a mini-batch is the mean of the per-example
    costs, so its gradient is the mean of the per-example gradients.

    Args:
        mini_batch: sized collection of ``(x, y)`` training pairs.
        eta: learning rate, i.e. how far to move along the negative gradient.

    Returns:
        None. The module-level ``weights`` and ``biases`` lists are rebound
        to the updated parameter arrays.
    """
    # Both names are rebound at the end of this function. Without the global
    # declaration Python would treat them as locals, and the first read of
    # `biases` below would raise UnboundLocalError.
    global weights, biases

    # Nothing to learn from an empty batch; also avoids dividing by zero below.
    if not mini_batch:
        return

    # 'nabla' is the upside-down triangle symbol used to denote gradient vectors.
    # The accumulators start as zeros shaped like the corresponding parameters.
    nabla_b = [np.zeros(b.shape) for b in biases]
    nabla_w = [np.zeros(w.shape) for w in weights]

    # Loop over the individual training examples in this one mini-batch.
    for x, y in mini_batch:
        # backprop is treated as a black box here: it returns, layer by layer,
        # the partial derivatives of the cost with respect to each bias and
        # each weight for this single example.
        delta_nabla_b, delta_nabla_w = backprop(x, y)

        # Accumulate this example's gradient into the running sum.
        nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]

    # Dividing the summed gradient by the batch size turns it into an average;
    # scaling by eta sets the step length.
    step = eta / len(mini_batch)

    # Move every parameter against its gradient, which decreases the cost.
    weights = [w - step * nw for w, nw in zip(weights, nabla_w)]
    biases = [b - step * nb for b, nb in zip(biases, nabla_b)]

In [None]:
# This method returns nabla_b, nabla_w, which are layer by layer lists of gradients of the cost function for the weights and biases
# specifically, they are the partial derivatives of the cost function for each given parameter (each weight, each bias term)
# we need to know these gradients for each parameter to figure out which direction to push them in
def backprop(x, y):
    """Return the gradient of the cost for a single training example.

    A forward pass through every layer records the weighted inputs (z) and
    activations; a backward pass then propagates the output error towards the
    input to obtain the partial derivative of the cost with respect to every
    bias and weight.

    Args:
        x: input activation for the first layer. Assumed to be a NumPy column
            vector compatible with ``weights[0]`` -- TODO confirm with caller.
        y: desired output, passed unchanged to ``cost_derivative``.

    Returns:
        Tuple ``(nabla_b, nabla_w)``: two lists with one array per layer,
        shaped like the module-level ``biases`` and ``weights``.
    """
    # We begin by knowing nothing about the partial derivatives, so the
    # gradient containers are initialised with zeros.
    nabla_b = [np.zeros(b.shape) for b in biases]
    nabla_w = [np.zeros(w.shape) for w in weights]

    # ---- Forward pass -----------------------------------------------------
    # The activation of a neuron is its output, sigma(z).
    activation = x
    activations = [x]  # all activations, layer by layer
    zs = []            # all z vectors, layer by layer

    for b, w in zip(biases, weights):
        # z is the input to a layer: the weighted sum of the previous
        # activations plus the bias.
        z = np.dot(w, activation) + b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)

    # ---- Backward pass ----------------------------------------------------
    # This must run only after the forward pass has visited *every* layer,
    # so it sits outside the loop above.

    # Error of the output layer (BP equation 1).
    delta = cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])

    # The bias gradient equals the delta (BP equation 3).
    nabla_b[-1] = delta
    # The weight gradient is the delta of the neuron the weight leads to times
    # the activation of the neuron it leads away from (BP equation 4).
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    # Layer count including the input layer: one more than the number of
    # weight matrices that were applied in the forward pass.
    num_layers = len(zs) + 1

    # Walk backwards through the remaining layers. layer == 2 is the
    # second-to-last layer; the output layer was handled above.
    for layer in range(2, num_layers):
        # Derivative of the activation function at this layer's z.
        sp = sigmoid_prime(zs[-layer])

        # Backpropagate the error: multiply the next layer's error by the
        # transposed weights, then by the activation derivative (BP equation 2).
        delta = np.dot(weights[-layer + 1].transpose(), delta) * sp

        # Same procedure as for the output layer.
        nabla_b[-layer] = delta
        nabla_w[-layer] = np.dot(delta, activations[-layer - 1].transpose())

    # Every layer's gradients are filled in; hand them back.
    return (nabla_b, nabla_w)