In [235]:
import _pickle as cPickle
import gzip 
import numpy as np
import sys
import time
from scipy.special import expit


# Load the dataset
f = gzip.open('mnist.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f,encoding='latin1')
f.close()

print(train_set[1])

[5 0 4 ..., 8 4 8]


In [239]:
#these are just various mathematical tools needed for the NN

def sigmoidPrime(x):
    return expit(x)*(1-expit(x)) #use expit to prevent overflow with large values

def sigmoid(x):
    return expit(x) #admittedly this is a little unnecessary but I think it makes sense to have 
    #sigmoid and sigmoidPrime instead of expit and sigmoidPrime

def relu(x):
    if x<0:
        return 0
    else:
        return x

def reluPrime(x):
    if x<0:
        return 0
    else:
        return 1

def elementWise(f, x):
    for i in range(len(x)):
        x[i] = f(x[i])
    return x    

def softMax(x):
    z = np.exp(x)
    return z/sum(z)

def softMaxPrime(x):
    z = np.exp(x)
    c = sum(z)
    for i in range(len(z)):
        z[i] = (c-z[i])*z[i]/(c**2)
    return z

def display(x, label, act):
    strn = ""
    for i in range(len(x)):
        if i%28==0:
            print(strn)
            strn = ""
        if x[i]==0:
            strn += " "
        if x[i]>=act:
            strn += "x"
    print(label)
            
def logLoss(x, target):
    loss = 0
    for i in range(len(x)):
        loss += target[i]*np.log(x[i])+(1-target[i])*np.log(1-x[i])
    return (-1.0/len(x))*loss
    
def logLossPrime(x, target):
    grad = np.zeros(len(x))
    for i in range(len(x)):
        grad[i]=(-1.0/len(x))*(target[i]/x[i]-(1-target[i])/(1-x[i]))
    return grad

def MSE(x, target):
    a = x-target
    return np.dot(a,a)

def MSEPrime(x, target):
    return 2*(x-target)

def crossEntropy(output, target):
    cost = 0
    for i in range(len(target)):
        cost -= target[i]*np.log(output[i])
    return cost

def trainingGraph(trainingData):
    costs = trainingData[0]
    accuracies = trainingData[1]
    plt.plot(costs, 'ro')
    plt.ylabel("total cost")
    plt.xlabel("iteration")
    plt.show()
    plt.plot(100.0*(1.0-accuracies), 'bo')
    plt.plot("total error percentage")
    plt.xlabel("iteration")
    plt.show()
    
    

class Neural_Network:
    defaultSize = 16 #default number of neurons in hidden layers if no shape list given
    inputChecks = True #this will change whether inputs that match the MNIST format are given
    #can be turned off to allow for debugging on smaller examples
    
    #activation must be an activation function that works for a single scalar 
    #and activation prime is its derivative. costFunction is a cost function for a vector representing an output layer
    #and the target vector. costDeriv is its derivative with respect to the vector of activations.
    #shape is list of integers where shape[i] = the number of neurons in ith layer
    #layers is an integer describing the number of layers
    #shape and layers are optional. if both are given the shape list will be followed, if neither are given
    #it will use the default value for layers and make all hidden layers have defaultSize many neurons
    def __init__(self, activation, activationPrime, costFunction, costDeriv, shape = None, layers = 4):
        if layers<2:
            raise NameError("Too few layers. Need an input layer and an output layer.")
        if Neural_Network.inputChecks and (shape[0] != 28**2 or shape[len(shape)-1] != 10):
            raise NameError("Improper input or output layer size. \
                            Must be 28^2 input neurons and 10 output neurons to work with MNIST.")
        if shape==None:
            shape = [28**2] + [defaultSize for i in range(layers-2)] + [10]
        self.activation = activation
        self.activationPrime = activationPrime
        self.shape = shape
        self.costFunction = costFunction
        self.costDeriv = costDeriv
        self.weights = Neural_Network.constructWeights(shape) 
        #weights[i] is the weights going into layer i
        self.bias = Neural_Network.constructBias(shape)
        #bias[i] is the bias on layer i 
        self.activations = []
        self.zs = []
        
    #returns a list of matrices of weights. weights[i] is the set of weights going into ith layer
    #weights[i][j][k] represents the weight going from the kth neuron in layer i-1 to jth neuron in layer i 
    def constructWeights(shape):
        weights = [None]
        for i in range(len(shape)-1): 
            weights.append(np.random.uniform(-1,1,shape[i]*shape[i+1]).reshape((shape[i+1],shape[i])))
        return weights
    
    #returns a list of vectors of biases. bias[i] is the set of biases on the ith layer
    #bias[i][j] is the bias in the ith layer on the jth neuron
    def constructBias(shape):
        bias = [None]
        for i in range(1,len(shape)): 
            bias.append(np.random.uniform(-1,1,shape[i]))
        return bias
    
    #performs forward propagation on the input x with the current weights and biases
    #using activation function from the constructor returns the activations on the last layer 
    #updates the zs and activations attributes
    def forwardProp(self, x):
        if Neural_Network.inputChecks and len(x) != 28**2:
            raise NameError("improper input")
        prevAct = x
        act = []
        self.activations = []
        self.zs = []
        self.activations.append(prevAct)
        for i in range(1,len(self.shape)): #each layer
            z = np.dot(self.weights[i], prevAct) + self.bias[i]
            act = elementWise(self.activation, z)
            self.activations.append(act)
            self.zs.append(z)
            prevAct = act
        return act
    
    #returns the total cost for the neural network on all datapoints in data
    #using cost function costFunc for each point. returns cost as a scalar
    def totalCost(self, data, costFunc):
        images = data[0]
        labels = data[1]
        if Neural_Network.inputChecks and len(images) != len(labels):
            raise NameError("improper input")
        target = np.zeros(10)
        cost = 0 
        for i in range(len(images)):
            target[labels[i]] = 1
            out = self.forwardProp(images[i])
            cost += costFunc(out, target)
            target[labels[i]] = 0
        return cost
        
    #returns the classification as a scalar based on the outputActivations
    #picks the index with highest activation
    def classification(outputActivations):
        maximum = -1
        index = -1
        for i in range(len(outputActivations)):
            if outputActivations[i] >= maximum:
                index = i
                maximum = outputActivations[i]
        return index
     
    #randomly initializes all weights and biases    
    def randomInitialization(self):
        self.weights = Neural_Network.constructWeights(self.shape) 
        self.bias = Neural_Network.constructBias(self.shape)
    
    #using backProp this will perform gradient descent from the current initialization of the weights
    #and biases on the given data. data in form touple of array of images and array of labels
    #will stop after iterations many iterations. returns a touple of the best weights and biases
    #and a list of the costs over time. will update the weights and biases in the neural network
    def gradientDescent(self, data, iterations, learningRate):
        return self.stochasticGradientDescent(data, len(data[0]), iterations, learningRate)
    
    #will apply the gradient with stepSize where the gradient is in form 
    #given by the backProp function
    def applyGradient(self, gradient, stepSize):
        for i in range(1, len(gradient[0])): #for each matrix in the weight update, 
            #first value is None for convenience in indexing
            self.weights[i] -= stepSize * gradient[0][i]
        for i in range(1, len(gradient[1])): #for each vector in the bias update
            self.bias[i] -= stepSize * gradient[1][i]
    
    #computes the correct and incorrect number of classified data points on data
    #prints the number correct, number incorrect, percent correct, and percent incorrect
    #returns a touple of the number correct and total number of data points
    def validation(self, data):
        images = data[0]
        labels = data[1]
        if len(images) != len(labels):
            raise NameError("improper input")
        correct = 0 
        wrong = 0
        for i in range(len(images)):
            if (Neural_Network.classification(self.forwardProp(images[i]))) == labels[i]:
                correct += 1
            else:
                wrong += 1
        print("correct: ", correct)
        print("wrong: ", wrong)
        print("accuracy:", 100.0*correct/len(images), "%")
        print("error:", 100.0*wrong/len(images), "%")
        return (correct, len(images))
    
    #chooses a random batch of size batchSize from images and labels
    #note it does not replace when sampling
    def randomBatch(images, labels, batchSize):
        indices = np.random.choice(len(images), batchSize, replace = False)
        #if you set replace to True this will break the gradientDescent function
        imageBatch = []
        labelBatch = []
        for i in range(len(indices)):
            imageBatch.append(images[indices[i]])
            labelBatch.append(labels[indices[i]])
        return (imageBatch, labelBatch)
        
    #using backProp this will perform stoachastic gradient descent from the current initialization of the weights
    #and biases on the given data. will choose batchSize many samples from data 
    #data in form touple of array of images and array of labels
    #will stop after iterations many iterations. returns a touple of the best weights and biases
    #and a list of the costs over time. will update the weights and biases in the neural network
    def stochasticGradientDescent(self, data, batchSize, iterations, learningRate):
        start = time.time()
        bestCost = sys.maxsize
        bestWandB = []
        costs = []
        accuracies = []
        images = data[0]
        labels = data[1]
        for _ in range(iterations):
            for i in range(5): #performs 5 gradient steps before re-computing the cost for all examples
                #this is somewhat of a hyper-parameter that just allows things to run pretty fast
                imageSet, labelSet = Neural_Network.randomBatch(images, labels, batchSize)
                target = np.zeros(10)
                gradients = []
                for i in range(len(imageSet)):
                    target[labelSet[i]] = 1
                    output = self.forwardProp(imageSet[i])
                    gradients.append(self.backProp(target))
                    target[labelSet[i]] = 0
                self.applyGradient(self.averageGradient(gradients), learningRate)
            cost = self.totalCost(data, self.costFunction)
            costs.append(cost)
            a = self.validation(data)
            accuracies.append(a[0]/a[1])
            print(cost)
            if cost <= bestCost:
                bestCost = cost
                bestWandB = (self.weights, self.bias)
        trainingData = (costs, accuracies)
        end = time.time()
        print(iterations, "iterations took", end - start, "seconds.")
        return (bestWandB, trainingData)
    
    #given a list of touples of the gradient in the form given by backProp
    #will return the average gradient
    def averageGradient(self, gradients):
        averageGrad = (Neural_Network.constructWeights(self.shape), Neural_Network.constructBias(self.shape))
        for i in range(len(averageGrad)): #weights then biases
            for j in range(len(gradients)): #grad from each sample
                for k in range(1, len(gradients[j][i])): #grad for each matrix
                    averageGrad[i][k] += gradients[j][i][k]
            for k in range(1,len(averageGrad[i])):#done at end to prevent rounding small numbers to zero
                averageGrad[i][k] /= len(gradients) #if overflow is a problem then divide at each step
        return averageGrad
    
    #this will return the gradient of the cost on the single target
    #the form is a touple with the weights and then the biases
    #in the same format as the weights and biases attributes for the Neural_Network class
    def backProp(self, target):
        if Neural_Network.inputChecks and len(target) != 10:
            raise NameError("improper input")
        biasGrad = [None for _ in range(len(self.shape))]
        weightGrad = [None for _ in range(len(self.shape))]
        delta = self.costDeriv(self.activations[-1], target) * self.activationPrime(self.zs[-1])
        biasGrad[-1] = delta
        weightGrad[-1] = np.tile(np.array([delta]).transpose(), (1,self.shape[-2]))*np.tile(np.array(self.activations[-2]),(self.shape[-1],1))
        #this line (and the version of it below can be a little confusing, see readme)
        for l in range(2, len(self.shape)):
            z = self.zs[-l]
            actDeriv = self.activationPrime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * actDeriv
            biasGrad[-l] = delta
            weightGrad[-l] = np.tile(np.array([delta]).transpose(), (1,self.shape[-l-1]))*np.tile(np.array(self.activations[-l-1]),(self.shape[-l],1))
        return (weightGrad, biasGrad)
    

The line right before the for loop in ```backProp()``` is a little confusing so I'll explain. $\frac{\partial C}{\partial W^{l}_{j,k}} = a^{l-1}_{k} \Delta^{l}_{j}$ where $C$ is the cost and $W^{l}_{j,k}$ is the weight going from the $k$th neuron in the $l-1$th layer to the $j$th neuron in the $l$th layer, $a^{l-1}_{k}$ is the activation on the $k$th neuron in the $l-1$th layer, and  $\Delta^{l}_{j}$ is the derivative of the cost with respect to the $j$th component of $z^{l}$. If you've never seen this before, it can be confusing so draw out a simple NN with very few neurons and write out the gradient of the weight matrix. You will notice that if you make a matrix with the same dimensions as $W^{l}$ where the columns are the vector $\Delta^{l}$ and the same dimension matrix where the rows are the row vector $(a^{l-1})^{T}$ and do element-wise multiplication (not matrix multiplication) of these matrices, then the resulting matrix is the gradient of the cost with respect to the weight matrix $W^{l}$.

```np.tile``` allows you to create matrices from repeating column or row vectors. The [NumPy documentation](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.tile.html) explains it much better than I can.

In [240]:
b = Neural_Network(expit, sigmoidPrime, MSE, MSEPrime, shape=[28**2, 30, 30, 10])
train1 = b.stochasticGradientDescent(train_set, 2000, 40, 5)


correct:  11104
wrong:  38896
accuracy: 22.208 %
error: 77.792 %
44789.5725365
correct:  14005
wrong:  35995
accuracy: 28.01 %
error: 71.99 %
43584.0692002
correct:  22897
wrong:  27103
accuracy: 45.794 %
error: 54.206 %
40019.6974792
correct:  23590
wrong:  26410
accuracy: 47.18 %
error: 52.82 %
38143.7754353
correct:  29943
wrong:  20057
accuracy: 59.886 %
error: 40.114 %
33246.2197922
correct:  30307
wrong:  19693
accuracy: 60.614 %
error: 39.386 %
31592.0676809
correct:  33231
wrong:  16769
accuracy: 66.462 %
error: 33.538 %
28148.3615751
correct:  34328
wrong:  15672
accuracy: 68.656 %
error: 31.344 %
27126.4580038
correct:  34458
wrong:  15542
accuracy: 68.916 %
error: 31.084 %
25714.3637737
correct:  35422
wrong:  14578
accuracy: 70.844 %
error: 29.156 %
24907.8291787
correct:  35556
wrong:  14444
accuracy: 71.112 %
error: 28.888 %
23749.8453958
correct:  36243
wrong:  13757
accuracy: 72.486 %
error: 27.514 %
22947.8592803
correct:  36624
wrong:  13376
accuracy: 73.248 %
error: 

AttributeError: 'Neural_Network' object has no attribute 'validate'

In [241]:
train1
b.validation(valid_set)
b.validation(test_set)
train2 = b.gradientDescent(train_set, 100, 0.1)
b.validation(valid_set)
b.validation(test_set)
train3 = b.gradientDescent(train_set, 20, 0.01)
b.validation(valid_set)
b.validation(test_set)

correct:  8231
wrong:  1769
accuracy: 82.31 %
error: 17.69 %
correct:  8118
wrong:  1882
accuracy: 81.18 %
error: 18.82 %
correct:  40065
wrong:  9935
accuracy: 80.13 %
error: 19.87 %
15719.9204202
correct:  40058
wrong:  9942
accuracy: 80.116 %
error: 19.884 %
15718.9583885
correct:  39993
wrong:  10007
accuracy: 79.986 %
error: 20.014 %
15718.4445488
correct:  39975
wrong:  10025
accuracy: 79.95 %
error: 20.05 %
15718.2595221
correct:  39970
wrong:  10030
accuracy: 79.94 %
error: 20.06 %
15718.262981
correct:  39955
wrong:  10045
accuracy: 79.91 %
error: 20.09 %
15718.324262
correct:  39970
wrong:  10030
accuracy: 79.94 %
error: 20.06 %
15718.325023
correct:  39970
wrong:  10030
accuracy: 79.94 %
error: 20.06 %
15718.1790087
correct:  39968
wrong:  10032
accuracy: 79.936 %
error: 20.064 %
15717.8325462
correct:  39957
wrong:  10043
accuracy: 79.914 %
error: 20.086 %
15717.2756343
correct:  39960
wrong:  10040
accuracy: 79.92 %
error: 20.08 %
15716.4980976
correct:  39955
wrong:  1004

(8120, 10000)