Multilayer perceptron implementation from scratch.

In [148]:
import numpy as np
from tensorflow.keras.datasets import mnist

In [149]:
# Load the MNIST train/test splits through the Keras dataset helper.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Flatten every image into a 784-element row and divide by 255
# (presumably mapping 8-bit pixel intensities into [0, 1] -- confirm
# against the Keras `mnist.load_data` documentation).
X_train, X_test = (
    images.reshape(-1, 784) / 255.0 for images in (X_train, X_test)
)

# One-hot encode the labels: row k of the 10x10 identity matrix is the
# encoding of class k, so indexing by the label array encodes all of them.
y_train, y_test = (np.eye(10)[labels] for labels in (y_train, y_test))

In [150]:
layer_sizes = (28 * 28, 64, 32, 10)

network = {}

# Build the parameters of each layer. Layer i maps layer_sizes[i-1] inputs
# to layer_sizes[i] outputs and is stored under the keys 'W<i>' and 'B<i>'.
for i, (n_in, n_out) in enumerate(zip(layer_sizes, layer_sizes[1:]), start=1):
    # He initialization: standard-normal draws scaled by sqrt(2 / fan_in).
    # Weights are laid out as (outputs, inputs).
    he_scale = np.sqrt(2 / n_in)
    network[f'W{i}'] = np.random.randn(n_out, n_in) * he_scale
    # Biases start at zero.
    network[f'B{i}'] = np.zeros(n_out)

In [151]:
# Functions.

def forward_pass(X, network):
    """Run a forward pass through the three-layer network.

    Two ReLU hidden layers are followed by a softmax output layer. All
    intermediate results are cached in ``network`` (mutated in place) so
    that back-propagation can reuse them:

    * ``'A0'``          -- the input batch ``X``
    * ``'Z1'``..``'Z3'`` -- pre-activations of each layer
    * ``'A1'``, ``'A2'`` -- ReLU activations of the hidden layers
    * ``'A3'``          -- softmax probabilities (each row sums to 1)

    Args:
        X: 2-D array with one sample per row; the column count must match
            the second dimension of ``network['W1']``.
        network: dict holding ``'W1'``..``'W3'`` (laid out as
            (outputs, inputs)) and ``'B1'``..``'B3'``.

    Returns:
        None. Results are written into ``network``.
    """
    network['A0'] = X

    # Hidden layers: affine transform followed by ReLU.
    for layer in (1, 2):
        pre_activation = (
            network[f'A{layer - 1}'] @ network[f'W{layer}'].T
            + network[f'B{layer}']
        )
        network[f'Z{layer}'] = pre_activation
        network[f'A{layer}'] = np.where(pre_activation > 0, pre_activation, 0)

    # Output layer: affine transform followed by softmax.
    logits = network['A2'] @ network['W3'].T + network['B3']
    network['Z3'] = logits

    # Subtract the per-row maximum before exponentiating. Softmax is
    # invariant to this shift, and it keeps every exponent <= 0 so np.exp
    # cannot overflow to inf (which would turn the division into nan).
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_shifted = np.exp(shifted)
    network['A3'] = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)

def back_propagation(network, y):
    """Compute batch-averaged gradients for the three-layer network.

    Reads the activations and pre-activations stored in ``network``
    (keys ``'A0'``..``'A3'`` and ``'Z1'``, ``'Z2'``) together with the
    weights ``'W2'`` and ``'W3'``.

    Args:
        network: dict holding the weights and the cached forward-pass values.
        y: target array with the same shape as ``network['A3']``; its first
            dimension is used as the batch size.

    Returns:
        dict with the keys ``'Z<i>'``, ``'W<i>'``, ``'B<i>'`` for i = 1..3
        and ``'A1'``, ``'A2'``. The ``'W'`` and ``'B'`` entries are divided
        by the batch size; the ``'Z'`` and ``'A'`` entries are not.
    """
    batch_size = y.shape[0]
    grads = {}

    # Output-layer error term: softmax probabilities minus the targets.
    delta = network['A3'] - y

    # Walk from the output layer back to the first layer.
    for layer in (3, 2, 1):
        grads[f'Z{layer}'] = delta
        grads[f'W{layer}'] = delta.T @ network[f'A{layer - 1}'] / batch_size
        grads[f'B{layer}'] = np.sum(delta, axis=0) / batch_size

        if layer > 1:
            # Push the error through this layer's weights, then gate it by
            # the ReLU derivative of the layer below (1 where Z > 0, else 0).
            upstream = delta @ network[f'W{layer}']
            grads[f'A{layer - 1}'] = upstream
            delta = np.where(network[f'Z{layer - 1}'] > 0, upstream, 0)

    return grads

def update_network(network, gradients, lr):
    """Apply one gradient-descent step to every layer of ``network``.

    The layers are discovered from the ``network`` argument itself
    (consecutive keys ``'W1'``, ``'W2'``, ...) rather than from a
    module-level size tuple, so the function works for whichever network
    it is handed.

    Args:
        network: dict holding ``'W<i>'`` and ``'B<i>'`` arrays; they are
            updated in place.
        gradients: dict holding a gradient under the same ``'W<i>'`` and
            ``'B<i>'`` keys; any other keys are ignored.
        lr: learning rate multiplying each gradient.

    Returns:
        None.
    """
    layer = 1
    while f"W{layer}" in network:
        network[f"W{layer}"] -= lr * gradients[f"W{layer}"]
        network[f"B{layer}"] -= lr * gradients[f"B{layer}"]
        layer += 1

# Training loop.

def train_network(X_train, y_train, network, lr, epochs):
    """Train ``network`` with full-batch gradient descent.

    Each epoch runs one forward pass over the whole of ``X_train``,
    back-propagates against ``y_train`` and applies a single parameter
    update scaled by ``lr``.

    Args:
        X_train: training inputs, passed unchanged to ``forward_pass``.
        y_train: training targets, passed unchanged to ``back_propagation``.
        network: parameter dict, passed to the three helpers on every epoch.
        lr: learning rate handed to ``update_network``.
        epochs: number of forward/backward/update iterations to run.

    Returns:
        None.
    """
    for _ in range(epochs):
        forward_pass(X_train, network)
        update_network(network, back_propagation(network, y_train), lr)

def test_network(X_test, y_test, network):
    """Return the classification accuracy of ``network`` on a labelled set.

    Runs a forward pass (which overwrites the values cached in ``network``)
    and compares the arg-max of each output row with the arg-max of the
    matching row of ``y_test``.

    Args:
        X_test: inputs, one sample per row.
        y_test: targets, one row per sample; the position of the largest
            entry in a row is taken as that sample's label.
        network: parameter dict used by ``forward_pass``.

    Returns:
        Fraction of samples whose predicted label matches the target label.
    """
    forward_pass(X_test, network)
    predicted_labels = np.argmax(network['A3'], axis=1)
    true_labels = np.argmax(y_test, axis=1)
    return np.mean(predicted_labels == true_labels)

In [154]:
# Train on the full training set; keyword arguments name the hyper-parameters.
train_network(X_train, y_train, network, lr=0.1, epochs=50)


In [None]:
# Measure accuracy on the test split and report it.
accuracy = test_network(X_test=X_test, y_test=y_test, network=network)
print(f"Accuracy: {accuracy}")

In [None]:
# Show the first ten test images, each titled with the network's predicted
# digit and the true digit.

import matplotlib.pyplot as plt

for i in range(10):
    # Slice instead of indexing so the sample keeps its leading batch axis;
    # forward_pass normalises the softmax along axis 1.
    sample = X_test[i:i + 1]
    forward_pass(sample, network)
    prediction = np.argmax(network['A3'])

    plt.imshow(X_test[i].reshape(28, 28))
    caption = f"Prediction: {prediction} Actual: {np.argmax(y_test[i])}"
    plt.title(caption)
    plt.show()