### MLP from scratch
This notebook implements multilayer perceptrons from scratch using NumPy. The networks are tested on the MNIST digit dataset and achieve 99% accuracy.

### Importing libraries

In [None]:
import numpy as np

### Setting up datasets

In [None]:
import keras

# Keras is used only to obtain the MNIST arrays. load_data() returns a
# (train, test) pair, each of which is an (images, labels) tuple.
_train_split, _test_split = keras.datasets.mnist.load_data()
x_train, y_train = _train_split
x_test, y_test = _test_split

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [None]:
# Flatten every image into a 784-element row and divide by 255 to rescale
# the pixel values before they are fed to the network.
x_train_encoded = x_train.reshape((len(x_train), 784)) / 255
x_test_encoded = x_test.reshape((len(x_test), 784)) / 255

In [None]:
# One-hot encode the digit labels. Row k of the 10x10 identity matrix is the
# one-hot vector for class k, so indexing the matrix with the label array
# yields one encoded row per sample.
y_train_encoded = np.eye(10)[y_train]
y_test_encoded = np.eye(10)[y_test]

### Building the neural network

In [None]:
class Linear:
  """Fully connected layer: ``x @ weights + biases``, trained by gradient descent.

  The layer caches its most recent input in ``self.input`` during ``forward``
  and applies its own parameter update inside ``backward``.
  """

  def __init__(self, in_dim: int, out_dim: int, lr: float) -> None:
    """Sample the initial parameters and store the learning rate.

    Weights have shape (in_dim, out_dim) and biases have shape (1, out_dim).
    Both are drawn from a zero-mean normal distribution whose standard
    deviation is sqrt(2 / in_dim).
    """
    init_std = np.sqrt(2 / in_dim)
    # Weights are drawn before biases; the order matters for seeded runs.
    self.weights = np.random.normal(0, init_std, (in_dim, out_dim))
    self.biases = np.random.normal(0, init_std, (1, out_dim))
    self.lr = lr

  def forward(self, x: np.ndarray) -> np.ndarray:
    """Return the affine transform of ``x`` and cache ``x`` for ``backward``."""
    self.input = x
    return x @ self.weights + self.biases

  def backward(self, grad: np.ndarray) -> np.ndarray:
    """Update the parameters in place and return the gradient w.r.t. the input.

    ``grad`` is the gradient with respect to this layer's output. Each
    parameter step is the corresponding gradient multiplied by ``lr`` and
    divided by ``grad.shape[0]``.
    """
    rows = grad.shape[0]
    # The input gradient must use the weights as they were in the forward
    # pass, so it is computed before the in-place update below.
    upstream = grad @ self.weights.T
    self.weights -= (self.input.T @ grad) * self.lr / rows
    self.biases -= grad.sum(axis=0) * self.lr / rows
    return upstream

In [None]:
class ReLU:
  """Rectified linear activation: element-wise ``max(x, 0)``."""

  def forward(self, x: np.ndarray) -> np.ndarray:
    """Clamp negative entries of ``x`` to zero and cache ``x`` for ``backward``."""
    self.input = x
    # Compare against an explicit float64 zero array so the result dtype
    # follows the usual array-with-array promotion rules.
    floor = np.zeros_like(x, dtype=np.float64)
    return np.maximum(x, floor)

  def backward(self, grad: np.ndarray) -> np.ndarray:
    """Pass ``grad`` through where the cached input was strictly positive."""
    active = np.greater(self.input, 0).astype(np.float32)
    return active * grad

In [None]:
class Softmax:
  """Softmax activation applied along the last axis, with its backward pass."""

  def forward(self, x: np.ndarray) -> np.ndarray:
    """Return the softmax probabilities of ``x`` and cache them in ``self.probs``.

    The maximum along the last axis is subtracted before exponentiating so
    large logits do not overflow; the shift leaves the result unchanged.
    """
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    self.probs = e / np.sum(e, axis=-1, keepdims=True)
    return self.probs

  def backward(self, grad: np.ndarray) -> np.ndarray:
    """Return the gradient w.r.t. the softmax input.

    ``grad`` is the gradient with respect to the probabilities returned by
    the last ``forward`` call. For one sample with probabilities ``p`` the
    Jacobian is ``diag(p) - outer(p, p)``, so

        grad @ (diag(p) - outer(p, p)) == p * (grad - sum(grad * p))

    The right-hand side is evaluated for the whole batch at once, which
    avoids building a classes-by-classes matrix per sample. The result is
    also stored in ``self.backprop``.
    """
    weighted_sum = np.sum(grad * self.probs, axis=-1, keepdims=True)
    self.backprop = self.probs * (grad - weighted_sum)
    return self.backprop

In [None]:
class CrossEntropyLoss:
  """Cross-entropy between predicted probabilities and target probabilities."""

  def __init__(self):
    # Added to the predictions so that log() and the division in backward()
    # never see an exact zero.
    self.EPSILON = 1e-15

  def forward(self, pred_prob, true_prob) -> np.ndarray:
    """Return the element-wise loss terms, scaled by ``-1 / pred_prob.shape[0]``.

    The result is not reduced: it has the broadcast shape of the two inputs,
    and summing it is left to the caller.
    """
    scale = -1 / pred_prob.shape[0]
    log_pred = np.log(pred_prob + self.EPSILON)
    return scale * true_prob * log_pred

  def backward(self, pred_prob, true_prob) -> np.ndarray:
    """Return the derivative of the ``forward`` terms w.r.t. ``pred_prob``."""
    scale = -1 / pred_prob.shape[0]
    safe_pred = pred_prob + self.EPSILON
    return scale * true_prob / safe_pred

### Training the neural network

In [None]:
# Hyperparameters.
lr = 1e-3
batch_size = 3
num_epochs = 100
verbose = False  # set to True to print the shape of every intermediate array

# Network: 784 -> 100 -> 50 -> 10, ReLU after the first two linear layers and
# a softmax on the output. The linear layers are constructed in this order so
# that seeded runs draw the same initial parameters.
layer1 = Linear(784, 100, lr)
layer2 = Linear(100, 50, lr)
layer3 = Linear(50, 10, lr)
softmax = Softmax()
relu1 = ReLU()
relu2 = ReLU()
ce = CrossEntropyLoss()

# Layers in forward order, each paired with the label used in verbose output.
network = [
  ("layer 1", layer1),
  ("relu 1", relu1),
  ("layer 2", layer2),
  ("relu 2", relu2),
  ("layer 3", layer3),
  ("softmax", softmax),
]

num_train = x_train_encoded.shape[0]

for epoch in range(num_epochs):
  correct = 0
  total_loss = 0.0

  for start in range(0, num_train, batch_size):
    x = x_train_encoded[start:start + batch_size]
    if verbose: print(f"x shape: {x.shape}")
    y = y_train_encoded[start:start + batch_size]
    if verbose: print(f"y shape: {y.shape}")

    # Forward pass.
    for label, layer in network:
      x = layer.forward(x)
      if verbose: print(f"{label} output: {x.shape}")

    y_pred = np.argmax(x, axis=1)
    if verbose: print(f"pred argmax output: {y_pred.shape}")
    y_true = np.argmax(y, axis=1)
    if verbose: print(f"true argmax output: {y_true.shape}")
    correct += np.sum(y_pred == y_true)

    # ce.forward already divides by the number of rows in the batch. Multiply
    # that factor back so total_loss accumulates the summed per-sample loss;
    # dividing by num_train below then gives a true per-sample average.
    total_loss += np.sum(ce.forward(x, y)) * x.shape[0]

    # Backward pass: each layer's backward() also applies its own update.
    grad = ce.backward(x, y)
    if verbose: print(f"ce grad: {grad.shape}")
    for label, layer in reversed(network):
      grad = layer.backward(grad)
      if verbose: print(f"{label} grad: {grad.shape}")

  print(f"average loss: {total_loss/num_train}")
  print(f"accuracy: {correct/num_train}")

In [None]:
# Evaluate the trained network on the test set. Only forward() is called, so
# no parameters are updated here.
eval_batch_size = 1000
num_test = x_test_encoded.shape[0]
correct = 0
total_loss = 0.0

for start in range(0, num_test, eval_batch_size):
  # Slicing keeps the inputs two-dimensional (rows = samples), which is the
  # layout the layers' axis-wise operations are written for.
  x = x_test_encoded[start:start + eval_batch_size]
  y = y_test_encoded[start:start + eval_batch_size]

  for layer in (layer1, relu1, layer2, relu2, layer3, softmax):
    x = layer.forward(x)

  y_pred = np.argmax(x, axis=1)
  y_true = np.argmax(y, axis=1)
  correct += np.sum(y_pred == y_true)

  # ce.forward divides by the number of rows in the batch; multiply it back
  # so total_loss is the summed per-sample loss over the whole test set.
  total_loss += np.sum(ce.forward(x, y)) * x.shape[0]

print(f"accuracy: {correct/num_test}")
print(f"total loss: {total_loss}")