In [3]:
# Import necessary libraries
import numpy as np
import random
from keras.datasets import mnist
from keras.utils import to_categorical

def load_MNIST():
    
    # Load MNIST data using Keras
    (train_images, train_labels), (test_images, test_labels) = mnist.load_data()

    # Preprocess the data
    # Reshape and normalize image data
    train_images = train_images.reshape(train_images.shape[0], 28*28)
    train_images = train_images.astype('float32') / 255
    test_images = test_images.reshape(test_images.shape[0], 28*28)
    test_images = test_images.astype('float32') / 255

    # Convert labels to one-hot encoding
    train_labels = to_categorical(train_labels, 10)
    test_labels = to_categorical(test_labels, 10)

    # Convert data to a list of tuples as required by the Network class
    training_data = list(zip([np.reshape(x, (784, 1)) for x in train_images], [np.reshape(y, (10, 1)) for y in train_labels]))
    test_data = list(zip([np.reshape(x, (784, 1)) for x in test_images], [np.reshape(y, (10, 1)) for y in test_labels]))
    return (training_data, test_data)


In [4]:
# dimensions: [784, 30, 10]

# Set up some training data before recording

In [163]:
import numpy as np
import random


class Network:
    
    def __init__(self, sizes):
        self.biases = [np.random.randn(l,1) for l in sizes[1:]]
        self.weights = [np.random.randn(l,r) for r, l in zip(sizes[:-1], sizes[1:])]
        self.n = len(sizes)
    
    
    # Just runs through the weights and returns final layer's activations (result)
    def feedforward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = np.dot(w,a) + b
        return a
    
    
    def mini_batch_update(self, batch, eta):
        
        # What we'll update our weights + biases with after computing
        # the gradient of cost function
        w_update = [np.zeros(w.shape) for w in self.weights]
        b_update = [np.zeros(b.shape) for b in self.biases]
        
        for x, y in batch:
            w,b = self.backprop(x,y)
            
            # small updates
            w_update = [w+nw for w,nw in zip(w_update, w)]
            b_update = [b+nb for b,nb in zip(b_update, b)]
        
        # Now update the actual parameters!
        
        self.weights = [w - (eta/len(batch)) * update for w, update in zip(self.weights, w_update) ]
        self.biases = [b - (eta/len(batch)) * update for b, update in zip(self.biases, b_update) ]
        
    # now for the most exciting part!!!
    def backprop(self, x, y):
        
        # Feedforward: we need all the activations and weighted inputs zs
        activation = x
        activations = [activation]
        zs = []
        
        w_nabla = [np.zeros(w.shape) for w in self.weights]
        b_nabla = [np.zeros(b.shape) for b in self.biases]
        
        
        for w,b in zip(self.weights, self.biases):
            
            z = np.dot(w,activation) + b
            
            
            activation = sigmoid(z)
            zs.append(z)
            activations.append(activation)
            
        # Final Layer
        delta = self.cost_derivative(activations[-1],y) * sigmoid_prime(zs[-1])
        
        # Now we can update the nabla_b and nabla_b for final layer
        b_nabla[-1] = delta
        w_nabla[-1] = np.dot(delta, activations[-2].transpose())
    
        # Backprop
        for l in range(2,self.n):
            delta = np.dot(self.weights[-l+1].transpose(),delta) * sigmoid_prime(zs[-l])
            b_nabla[-l] = delta
            w_nabla[-l] = np.dot(delta, activations[-l-1].transpose())
        return (w_nabla, b_nabla)
        
        
    # we're using a very simple cost function
    def cost_derivative(self, final_activation, actual_output):
        return (final_activation - actual_output)
        
        
    # Training!
    def SGD(self, training_data, epochs, batch_size, eta, test_data=None):
        for epoch in range(epochs):
            random.shuffle(training_data)
            batches = [training_data[x:x+batch_size] for x in range(0,len(training_data), batch_size)]
            
            for batch in batches:
                self.mini_batch_update(batch, eta)
                
            if (test_data):
                print(f"Epoch {epoch}: {self.evaluate(test_data) / len(test_data)}")
                

    
    
    
    def evaluate(self, test_data):
        
        # x: input layer
        # y: right answer
        results = [(int(np.argmax(self.feedforward(x))), int(np.argmax(y))) for x, y in test_data]
        
#         print(type(results[0][0]))
#         print(type(results[0][1]))
        correct = 0
        for _ in range(len(results)):
            if (results[_][0]==results[_][1]):
                correct += 1
        return correct
    
def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    return sigmoid(z)*(1.0-sigmoid(z))


In [195]:
nn = Network([784, 30, 10])

In [196]:
len(training_data)

60000

In [198]:
nn.SGD(training_data,
       30, batch_size=1000, eta=3, test_data=test_data)

# Not good! Something's wrong. lets try a higher learning rate. more promising
# Let's try on full data size
# Solidly better than random! Let's see if it keeps improving...
# hmmmm let's fix it
# Back from break! Let's first make sure that the hyperparameters
# aren't causing the issue.
# We've experimented with eta, but batch size is still ridiculously large
# - 6 batches for all the data
# Let's try smaller batch size- should learn muc h faster.
# That's better! Let's skip to the end of the 30 epochs.

# It worked! Not super high accuracy, but high enough to validate our code works!
# Thanks for watching!


Epoch 0: 0.262
Epoch 1: 0.3282
Epoch 2: 0.3697
Epoch 3: 0.4004
Epoch 4: 0.4189
Epoch 5: 0.4372
Epoch 6: 0.4536
Epoch 7: 0.4739
Epoch 8: 0.4998
Epoch 9: 0.5218
Epoch 10: 0.5397
Epoch 11: 0.5562
Epoch 12: 0.5762
Epoch 13: 0.6097
Epoch 14: 0.6342
Epoch 15: 0.6443
Epoch 16: 0.6526
Epoch 17: 0.6566
Epoch 18: 0.6631
Epoch 19: 0.6668
Epoch 20: 0.6716
Epoch 21: 0.6747
Epoch 22: 0.6778
Epoch 23: 0.6814
Epoch 24: 0.6853
Epoch 25: 0.6888
Epoch 26: 0.6896
Epoch 27: 0.694
Epoch 28: 0.6952
Epoch 29: 0.6971


In [None]:
nn.evaluate(test_data) / len(test_data) 