In [1]:
## ARTIFICIAL NEURAL NETWORK FOR RECOGNITION OF HANDWRITTEN LETTERS ##
# Code follows very heavily from online tutorial of Michael A. Nielsen,
# "Neural Networks and Deep Learning", Determination Press, 2015

# Currently, the bulk of my personal contributions are cosmetic and/or
# updating the code for Python 3.5 from Python 2.7 as originally written.

# Changes yet to be implemented:
#  1) cross-entropy cost function in stochastic gradient descent method
#  2) automated testing algorithm (for parameter tuning)
#  3) testing and output of multiple layers/sizes

In [6]:
import numpy as np
import scipy
import random
import gzip
import pickle
import pandas
import sklearn.datasets
from sklearn.datasets import load_digits

In [13]:
# Load MNIST dataset
def load_data(path='./data/mnist.pkl.gz'):
    """Load the gzipped, pickled MNIST dataset.

    Parameters
    ----------
    path : str
        Location of the gzip-compressed pickle file.  Defaults to
        ``'./data/mnist.pkl.gz'``.

    Returns
    -------
    tuple
        ``(training_data, validation_data, test_data)`` exactly as stored in
        the pickle.  ``load_data_wrapper`` indexes each element as
        ``[0]`` (inputs) and ``[1]`` (labels).

    Raises
    ------
    OSError
        If the file cannot be opened or is not valid gzip data.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    # The ``with`` block guarantees the file is closed even if unpickling fails.
    with gzip.open(path, 'rb') as f:
        # encoding='latin1' is needed due to Python 2 vs 3 pickle issues.
        training_data, validation_data, test_data = pickle.load(f, encoding='latin1')
    return (training_data, validation_data, test_data)
    
def vectorized_result(j):
    """One-hot encode the index ``j`` as a column vector.

    Builds a ``(10, 1)`` array of zeros and sets row ``j`` to 1.0, turning a
    digit label (0...9) into the output the network is trained to produce.
    """
    one_hot = np.zeros(shape=(10, 1))
    one_hot[j] = 1.0
    return one_hot

def load_data_wrapper():
    """Return ``(training_data, validation_data, test_data)`` as lists of pairs.

    The raw splits come from ``load_data()``.  Every input is reshaped into a
    ``(784, 1)`` column vector.  Training labels are converted with
    ``vectorized_result``; validation and test labels are kept as loaded.
    """
    def as_columns(images):
        # Reshape each image into a (784, 1) column vector.
        return [np.reshape(image, (784, 1)) for image in images]

    train_split, valid_split, test_split = load_data()

    one_hot_labels = [vectorized_result(label) for label in train_split[1]]
    training_data = list(zip(as_columns(train_split[0]), one_hot_labels))
    validation_data = list(zip(as_columns(valid_split[0]), valid_split[1]))
    test_data = list(zip(as_columns(test_split[0]), test_split[1]))
    return (training_data, validation_data, test_data)

In [14]:
# Load MNIST and unpack the (training, validation, test) lists returned by load_data_wrapper().
training_data, validation_data, test_data = load_data_wrapper()

In [5]:
class Network(object):
    """Feedforward neural network with sigmoid activations, trained by
    mini-batch stochastic gradient descent with backpropagation.

    Relies on the module-level helpers ``sigmoid`` and ``sigmoid_prime``.
    """

    def __init__(self, sizes):
        """The list ``sizes`` contains the number of neurons in the respective
        layers of the network.  For example, if the list was [2, 3, 1] then it
        would be a three-layer network, with the first layer containing 2
        neurons, the second layer 3 neurons, and the third layer 1 neuron.
        The biases and weights for the network are initialized randomly, using
        a Gaussian distribution with mean 0, and variance 1.  Note that the
        first layer is assumed to be an input layer, and by convention we
        won't set any biases for those neurons, since biases are only ever
        used in computing the outputs from later layers."""
        self.num_layers = len(sizes)
        self.sizes = sizes
        # One (y, 1) bias column per non-input layer.
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        # One (y, x) weight matrix per pair of adjacent layers (x -> y).
        self.weights = [np.random.randn(y, x)
                        for x, y in zip(sizes[:-1], sizes[1:])]

    def feedforward(self, a):
        """Return the output of the network for input ``a``.

        ``a`` should be a ``(sizes[0], 1)`` column vector so that it lines up
        with the ``(y, 1)`` bias columns.
        """
        for b, w in zip(self.biases, self.weights):
            a = sigmoid(np.dot(w, a)+b)
        return a

    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        """Train the network using mini-batch stochastic gradient descent.

        Parameters
        ----------
        training_data : iterable of ``(x, y)`` pairs
            Training inputs and their desired outputs.  It is copied before
            shuffling, so the caller's sequence is never reordered.
        epochs : int
            Number of passes over the training data.
        mini_batch_size : int
            Number of examples per mini-batch (the last batch of an epoch may
            be smaller).
        eta : float
            Learning rate.
        test_data : iterable of ``(x, label)`` pairs, optional
            If provided (and non-empty), the network is evaluated against it
            after every epoch and the score is printed.
        """
        # Work on shallow copies: shuffling then leaves the caller's list
        # untouched, and any iterable (e.g. a ``zip`` object) is accepted.
        training_data = list(training_data)
        if test_data is not None:
            test_data = list(test_data)

        n_test = len(test_data) if test_data else 0
        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size]
                            for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch {0}: {1} / {2}".format(j, self.evaluate(test_data), n_test))
            else:
                print("Epoch {0} complete".format(j))

    def update_mini_batch(self, mini_batch, eta):
        """Update the weights and biases with one gradient-descent step.

        The gradient is accumulated via ``backprop`` over every ``(x, y)``
        pair in ``mini_batch`` and scaled by ``eta / len(mini_batch)``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        for x, y in mini_batch:
            delta_nabla_b, delta_nabla_w = self.backprop(x, y)
            nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
            nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
        self.weights = [w-(eta/len(mini_batch))*nw for w, nw in zip(self.weights, nabla_w)]
        self.biases = [b-(eta/len(mini_batch))*nb for b, nb in zip(self.biases, nabla_b)]

    def backprop(self, x, y):
        """Return ``(nabla_b, nabla_w)``, the gradient of the cost for the
        single example ``(x, y)``.

        ``nabla_b`` and ``nabla_w`` are layer-by-layer lists of arrays with
        the same shapes as ``self.biases`` and ``self.weights``.
        """
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]

        # feedforward
        activation = x
        activations = [x] # list to store all the activations, layer by layer
        zs = [] # list to store all the z vectors, layer by layer
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation)+b
            zs.append(z)
            activation = sigmoid(z)
            activations.append(activation)

        # backward pass: start with the error at the output layer
        delta = self.cost_derivative(activations[-1], y) * \
            sigmoid_prime(zs[-1])
        nabla_b[-1] = delta
        nabla_w[-1] = np.dot(delta, activations[-2].transpose())

        # Walk backwards through the remaining layers using negative indices:
        # l = 2 is the second-to-last layer, l = 3 the third-to-last, etc.
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = sigmoid_prime(z)
            delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
            nabla_b[-l] = delta
            nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
        return (nabla_b, nabla_w)

    def evaluate(self, test_data):
        """Return the number of ``(x, y)`` pairs in ``test_data`` for which
        the index of the largest output activation equals ``y``."""
        test_results = [(np.argmax(self.feedforward(x)), y) for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)

    def cost_derivative(self, output_activations, y):
        """Return ``output_activations - y``, the derivative of the quadratic
        cost with respect to the output activations."""
        return (output_activations-y)

def sigmoid(z):
    """Logistic sigmoid ``1 / (1 + exp(-z))``; elementwise for numpy arrays."""
    denominator = 1.0 + np.exp(-z)
    return 1.0 / denominator

def sigmoid_prime(z):
    """Derivative of the sigmoid function, ``sigmoid(z) * (1 - sigmoid(z))``."""
    # Evaluate the sigmoid once and reuse it instead of computing it twice.
    s = sigmoid(z)
    return s*(1-s)

In [6]:
# Three-layer network: 784 input neurons, 30 hidden neurons, 10 output neurons.
net = Network([784,30,10])

In [8]:
"""Oftentimes, this method will not optimize hyperparameters well, and tuning them by hand can be laborious.
Instead, this is an opportunity to create my own tuning algorithm. Good idea to research
target-value-finding algorithms (how to estimate a function's zeros, etc.)."""
# Train for 10 epochs with mini-batches of 10 examples and eta = 3.0,
# printing the number of correctly classified test examples after each epoch.
net.SGD(training_data, 10, 10, 3.0, test_data=test_data)

Epoch 0: 8503 / 10000
Epoch 1: 8502 / 10000
Epoch 2: 8505 / 10000
Epoch 3: 8449 / 10000
Epoch 4: 8512 / 10000
Epoch 5: 8535 / 10000
Epoch 6: 8521 / 10000
Epoch 7: 8517 / 10000
Epoch 8: 8540 / 10000
Epoch 9: 8508 / 10000


In [17]:
# Inspect the first training example, an (input column vector, desired output) pair.
training_data[0]

(array([[ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0.        ],
        [ 0

In [23]:
# Each training example is a 2-tuple: (input, desired output).
len(training_data[0])

2