In [1]:
import os
import numpy as np 
import struct
import matplotlib.pyplot as plt
from array import array
import random

In [2]:
class MNIST:
    """Reader for the MNIST digit dataset stored as IDX files.

    The four files ``train-images.idx3-ubyte``, ``train-labels.idx1-ubyte``,
    ``t10k-images.idx3-ubyte`` and ``t10k-labels.idx1-ubyte`` are looked up
    directly under ``path``.
    """

    def __init__(self, path):
        """Remember ``path`` and derive the four file locations from it."""
        self.path = path
        self.train_images_path = f'{self.path}/train-images.idx3-ubyte'
        self.train_labels_path = f'{self.path}/train-labels.idx1-ubyte'
        self.test_images_path = f'{self.path}/t10k-images.idx3-ubyte'
        self.test_labels_path = f'{self.path}/t10k-labels.idx1-ubyte'

    def vectorize(self, x):
        """Return a one-hot column vector of shape (10, 1) with a 1 at index ``x``."""
        y = np.zeros((10, 1))
        y[x] = 1
        return y

    def load(self, images_path, labels_path):
        """Read one image file and one label file and pair them up.

        Args:
            images_path: IDX file whose header is (magic=2051, count, rows, cols)
                followed by one unsigned byte per pixel.
            labels_path: IDX file whose header is (magic=2049, count) followed
                by one unsigned byte per label.

        Returns:
            A list of ``(image, label)`` tuples. ``image`` is a float array of
            shape ``(rows * cols, 1)`` with pixel bytes divided by 255, and
            ``label`` is the one-hot vector produced by ``vectorize``.

        Raises:
            ValueError: on a wrong magic number, when the image file holds
                fewer pixel bytes than its header announces, or when the
                number of labels differs from the number of images.
        """
        with open(labels_path, 'rb') as f:
            magic, _declared_labels = struct.unpack('>II', f.read(8))

            if magic != 2049:
                raise ValueError(f'bad magic number, expected 2049 got {magic}')

            labels = [self.vectorize(x) for x in array('B', f.read())]

        with open(images_path, 'rb') as f:
            magic, size, rows, cols = struct.unpack('>IIII', f.read(16))

            if magic != 2051:
                raise ValueError(f'bad magic number, expected 2051 got {magic}')

            pixels = np.frombuffer(f.read(), dtype=np.uint8)

        # Use the dimensions from the header instead of assuming 28x28 images.
        pixels_per_image = rows * cols
        expected_pixels = size * pixels_per_image
        if pixels.size < expected_pixels:
            raise ValueError(
                f'image file truncated, expected {expected_pixels} pixel bytes got {pixels.size}'
            )

        # zip() would silently drop the surplus side, hiding a mismatched file pair.
        if len(labels) != size:
            raise ValueError(
                f'label/image count mismatch, got {len(labels)} labels for {size} images'
            )

        # One reshape replaces the per-image Python loop; the division
        # promotes the uint8 bytes to floats in [0, 1].
        images = pixels[:expected_pixels].reshape(size, pixels_per_image, 1) / 255

        return list(zip(images, labels))

    def load_train(self):
        """Load the training images and labels."""
        return self.load(self.train_images_path, self.train_labels_path)

    def load_test(self):
        """Load the test (``t10k``) images and labels."""
        return self.load(self.test_images_path, self.test_labels_path)

In [3]:
class DifferentiableFunction:
    """Bundle a callable together with its derivative.

    The instance itself is callable and forwards its positional arguments to
    ``f``; the derivative is reachable through the ``deriv`` attribute.
    """

    def __init__(self, f, df):
        # f is the function, df its derivative (exposed as `deriv`).
        self.f, self.deriv = f, df

    def __call__(self, *args):
        """Evaluate the wrapped function on ``args``."""
        func = self.f
        return func(*args)
    
# Logistic function 1 / (1 + e^-z); NumPy applies it elementwise to arrays.
sigma = lambda z: 1.0/(1.0 + np.exp(-z))


def _sigma_prime(z):
    """Derivative of ``sigma``: s * (1 - s), evaluating the sigmoid only once."""
    s = sigma(z)
    return s * (1 - s)


# Half the squared Euclidean distance between prediction and target; the
# derivative with respect to yhat is simply (yhat - y).
squared_loss = DifferentiableFunction(lambda y, yhat: (1/2)*np.linalg.norm(yhat - y)**2, lambda y, yhat: yhat - y)
sigmoid = DifferentiableFunction(sigma, _sigma_prime)
# ReLU is max(0, z): the cutoff is 0, not 0.5. With the old 0.5 threshold
# inputs in (0, 0.5] were zeroed and received no gradient.
relu = DifferentiableFunction(lambda z: z * (z > 0), lambda z: 1 * (z > 0))

In [4]:
class NeuralNet:
    """Fully connected feed-forward network trained by minibatch gradient descent.

    Layer ``i`` maps a column vector of length ``sizes[i]`` to one of length
    ``sizes[i+1]`` via ``phis[i](weights[i] @ a + biases[i])``.
    """

    def __init__(self, sizes, phis, weights=None, biases=None):
        """Create the network.

        Args:
            sizes: layer widths, input layer first.
            phis: one activation per layer transition; each must be callable
                and expose a ``deriv`` callable.
            weights: optional list of initial weight matrices, one per layer
                transition with shape ``(sizes[i+1], sizes[i])``. Drawn from a
                standard normal distribution when omitted.
            biases: optional list of initial bias columns with shape
                ``(sizes[i+1], 1)``. Drawn from a standard normal distribution
                when omitted.

        Raises:
            ValueError: if ``phis``, ``weights`` or ``biases`` do not match
                the architecture described by ``sizes``.
        """
        self.sizes = sizes
        self.n = len(sizes) - 1
        # The forward pass zips over phis while the backward pass indexes them
        # from the end, so a length mismatch would pair layers with the wrong
        # activation derivative.
        if len(phis) != self.n:
            raise ValueError(f'expected {self.n} activation functions, got {len(phis)}')
        self.phis = phis

        # Honour caller-supplied parameters; they used to be silently ignored.
        # Random weights are still drawn before random biases, so a seeded
        # default construction consumes the RNG in the same order as before.
        if weights is None:
            self.weights = [np.random.randn(sizes[i+1], sizes[i]) for i in range(self.n)]
        else:
            self.weights = self._checked_params(
                weights, [(sizes[i+1], sizes[i]) for i in range(self.n)], 'weights')
        if biases is None:
            self.biases = [np.random.randn(sizes[i+1], 1) for i in range(self.n)]
        else:
            self.biases = self._checked_params(
                biases, [(sizes[i+1], 1) for i in range(self.n)], 'biases')

    @staticmethod
    def _checked_params(params, shapes, label):
        """Return float copies of ``params`` after checking their count and shapes."""
        # Copy so that the in-place updates made during training never write
        # into arrays owned by the caller.
        copies = [np.array(p, dtype=float) for p in params]
        if len(copies) != len(shapes):
            raise ValueError(f'expected {len(shapes)} {label} arrays, got {len(copies)}')
        for index, (param, shape) in enumerate(zip(copies, shapes)):
            if param.shape != shape:
                raise ValueError(
                    f'{label}[{index}] has shape {param.shape}, expected {shape}')
        return copies

    def backprop(self, x, y, loss):
        """Compute the loss gradients for a single sample.

        Args:
            x: input column vector.
            y: target column vector.
            loss: object whose ``deriv(y, yhat)`` gives the loss gradient with
                respect to the network output.

        Returns:
            ``(weight_deriv, bias_deriv)``: two lists with one gradient array
            per layer, ordered like ``self.weights`` and ``self.biases``.
        """
        zs = []
        a = x
        acts = [x]

        # forward pass: remember pre-activations and activations of every layer
        for w, b, phi in zip(self.weights, self.biases, self.phis):
            z = w @ a + b
            a = phi(z)
            zs.append(z)
            acts.append(a)

        # Every slot is assigned below, so no zero-filled arrays are needed.
        weight_deriv = [None] * self.n
        bias_deriv = [None] * self.n

        # backward pass: output layer first, then propagate delta towards the input
        delta = loss.deriv(y, acts[-1]) * self.phis[-1].deriv(zs[-1])
        bias_deriv[-1] = delta
        weight_deriv[-1] = delta @ acts[-2].T
        for l in range(2, len(self.sizes)):
            z = zs[-l]
            phi = self.phis[-l]
            delta = (self.weights[-l+1].T @ delta) * phi.deriv(z)
            bias_deriv[-l] = delta
            weight_deriv[-l] = delta @ acts[-l-1].T

        return weight_deriv, bias_deriv

    def learn(self, dataset, epochs=2, loss=squared_loss, eta=5, minibatch_size=10):
        """Train for ``epochs`` passes over ``dataset``.

        Args:
            dataset: iterable of ``(x, y)`` pairs. It is copied before
                shuffling, so the caller's sequence keeps its order.
            epochs: number of full passes.
            loss: loss object handed to ``backprop``.
            eta: learning rate.
            minibatch_size: samples per parameter update; must be at least 1.

        After each epoch the accuracy on ``dataset`` itself (the training
        data) is printed.

        Raises:
            ValueError: if ``minibatch_size`` is smaller than 1.
        """
        if minibatch_size < 1:
            raise ValueError(f'minibatch_size must be at least 1, got {minibatch_size}')

        # Shuffle a private copy: random.shuffle works in place and would
        # otherwise reorder the list owned by the caller.
        samples = list(dataset)
        N = len(samples)
        for i in range(epochs):
            random.shuffle(samples)
            for j in range(0, N, minibatch_size):
                minibatch = samples[j:j+minibatch_size]
                self.learn_minibatch(minibatch, loss, eta)
            print(f'Epoch {i}: accuracy = {self.test(samples)}')

    def learn_minibatch(self, minibatch, loss, eta):
        """Apply one gradient step using the mean gradient over ``minibatch``."""
        # No samples means no gradient; skipping also avoids dividing by zero.
        if not minibatch:
            return

        dw = [np.zeros(w.shape) for w in self.weights]
        db = [np.zeros(b.shape) for b in self.biases]

        for x, y in minibatch:
            weight_deriv, bias_deriv = self.backprop(x, y, loss)
            for l in range(self.n):
                dw[l] += weight_deriv[l]
                db[l] += bias_deriv[l]

        step = eta / len(minibatch)
        for l in range(self.n):
            self.weights[l] -= step * dw[l]
            self.biases[l] -= step * db[l]

    def predict(self, a):
        """Run the forward pass on input column ``a`` and return the output activation."""
        for w, b, phi in zip(self.weights, self.biases, self.phis):
            a = phi(w @ a + b)
        return a

    def test(self, test_data):
        """Return the fraction of ``(x, y)`` pairs whose predicted argmax equals ``argmax(y)``.

        ``test_data`` must support ``len()`` and be non-empty.
        """
        return sum(int(np.argmax(self.predict(x)) == np.argmax(y)) for x, y in test_data) / len(test_data)
        

In [6]:
# Seed both generators so a Restart & Run All reproduces the same run:
# NeuralNet draws its initial parameters with np.random.randn and learn()
# shuffles the samples with the stdlib `random` module.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

mnist = MNIST('mnist')
dataset = mnist.load_train()

nn = NeuralNet([784, 30, 10], [sigmoid, sigmoid])
# The accuracy printed after each epoch is measured on `dataset` itself,
# i.e. on the training data.
nn.learn(dataset, minibatch_size=10, epochs=30, eta=3)

Epoch 0: accuracy = 0.84025
Epoch 1: accuracy = 0.8541833333333333
Epoch 2: accuracy = 0.8598166666666667
Epoch 3: accuracy = 0.86225
Epoch 4: accuracy = 0.9445333333333333
Epoch 5: accuracy = 0.9479
Epoch 6: accuracy = 0.9502166666666667
Epoch 7: accuracy = 0.95365
Epoch 8: accuracy = 0.9559666666666666
Epoch 9: accuracy = 0.9551333333333333
Epoch 10: accuracy = 0.9585833333333333
Epoch 11: accuracy = 0.9596666666666667
Epoch 12: accuracy = 0.9618
Epoch 13: accuracy = 0.9640666666666666
Epoch 14: accuracy = 0.9629166666666666
Epoch 15: accuracy = 0.9651666666666666
Epoch 16: accuracy = 0.9665666666666667
Epoch 17: accuracy = 0.9668666666666667
Epoch 18: accuracy = 0.9676
Epoch 19: accuracy = 0.9681333333333333
Epoch 20: accuracy = 0.9676166666666667
Epoch 21: accuracy = 0.9680333333333333
Epoch 22: accuracy = 0.9696333333333333
Epoch 23: accuracy = 0.9712833333333334
Epoch 24: accuracy = 0.9705666666666667
Epoch 25: accuracy = 0.96905
Epoch 26: accuracy = 0.9713
Epoch 27: accuracy = 0

In [7]:
# Score the trained network on the separate t10k test files.
test_set = mnist.load_test()
nn.test(test_set)

0.9531