In [1]:
from collections import namedtuple
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import morknn.matplotlib_themes
from morknn.layers import Layer
import morknn.activators
from morknn.datasets import mnist_dataset

%config InlineBackend.figure_formats = ['svg']

In [2]:
# Load the dataset through morknn's `mnist_dataset` helper (quiet mode, with
# transpose_x=False) and unpack its four return values.
# Presumably these are training/validation images and labels — verify against the helper.
# NOTE(review): a later cell records train_x.shape as (39900, 784, 1) and
# prints train_x[0].shape as (784, 1) — confirm on a fresh kernel.
train_x, train_y, val_x, val_y = mnist_dataset(verbose=False, transpose_x=False)

## Multiply 2 matrices

$$
\large{
\mathbf{C}_{q, p} = \mathbf{A}_{q, n} \cdot \mathbf{B}_{n, p}\,, \qquad \text{where}\,\qquad  c_{i, j} = \sum_{k=0}^{n-1}a_{i, k}b_{k, j}
}
$$

# Basic example (1 layer)

<img src="https://i2.wp.com/www.adeveloperdiary.com/wp-content/uploads/2019/04/Understand-and-Implement-the-Backpropagation-Algorithm-From-Scratch-In-Python-adeveloperdiary.com-3.jpg?w=600&ssl=1" width=450>

[reference doc](https://www.adeveloperdiary.com/data-science/machine-learning/understand-and-implement-the-backpropagation-algorithm-from-scratch-in-python/)

## the math

$$
\large{\begin{align}
\mathbf{\hat{Y}} &= \mathbf{\sigma}\left\{\mathbf{Z}\right\}\\
\mathbf{\hat{Y}} &= \mathbf{\sigma} \left\{ \mathbf{W} \cdot \mathbf{X} + \mathbf{B}\right\}\\
\end{align}}
$$

the shape of each element is

$$
\large{\begin{align}
    \mathbf{X}_{784, 1}&=\left(
        \begin{matrix}
            x_0 \\ x_1 \\ \vdots \\ x_{783}
        \end{matrix}\right)\\[10pt]
\mathbf{W}_{10, 784}&=\left( 
    \begin{matrix}
        w_{0, 0} & w_{0, 1} & \dots & w_{0, 783} \\
        w_{1, 0} & w_{1, 1} & \dots & w_{1, 783} \\ 
        \vdots   & \vdots   & \ddots& \vdots     \\ 
        w_{9, 0} & w_{9, 1} & \dots & w_{9, 783} \\ 
    \end{matrix}\right)\\[10pt]
    \mathbf{B}_{10, 1}&=\left(
        \begin{matrix}
            b_0 \\ b_1 \\ \vdots \\ b_{9}
        \end{matrix}\right)
\end{align}}
$$

To go from $\mathbf{X}$ (_input image data_) to $\mathbf{\hat{Y}}$ (_output label data_), perform the following _forward pass_.

$$
\large{\begin{align}
    \mathbf{\hat{Y}}&=\sigma\left\{ \mathbf{Z} \right\}\\
    \mathbf{\hat{Y}}&=\sigma\left\{ \mathbf{W} \cdot \mathbf{X} + \mathbf{B} \right\}\\
    \mathbf{\hat{Y}}_{10, 1}&=\sigma \bigg\{\, \underbrace{\mathbf{W}_{10, 784} \cdot \mathbf{X}_{784, 1}}_{\mathbf{R}_{10, 1}} + \mathbf{B}_{10, 1} \bigg\}\\
    \mathbf{\hat{Y}}_{10, 1}&=\sigma\left\{ \mathbf{R}_{10, 1} + \mathbf{B}_{10, 1} \right\}\\
    \mathbf{\hat{Y}}_{10, 1}&=\sigma\left\{ \mathbf{Z}_{10, 1} \right\}\\
    \mathbf{\hat{Y}}_{10, 1}&=\mathbf{A}_{10, 1}
\end{align}}
$$

### Define the loss function

$$
\large{\text{Loss} \rightarrow \mathcal{L}}
$$

Write the partial derivatives of $\mathcal{L}$ with respect to $\mathbf{W}$ and $\mathbf{B}$:

$$
\large{\begin{alignat}{2}
    \partial_W\mathcal{L}&=\frac{d\mathcal{L}}{dA}\mathbf{A} &&\partial_B\mathcal{L}=\frac{d\mathcal{L}}{dA}\mathbf{A}\\
    \partial_W\mathcal{L}&=\frac{d\mathcal{L}}{dA}\left(\frac{dA}{dZ}\mathbf{Z} \right) &&\partial_B\mathcal{L}=\frac{d\mathcal{L}}{dA}\left(\frac{dA}{dZ}\mathbf{Z}\right)\\
    \partial_W\mathcal{L}&=\frac{d\mathcal{L}}{dA}\left(\frac{dA}{dZ}\left(\frac{dZ}{dW}\mathbf{Z}\right)\right)\qquad &&\partial_B\mathcal{L}=\frac{d\mathcal{L}}{dA}\left(\frac{dA}{dZ}\left(\frac{dZ}{dB}\mathbf{Z}\right)\right)
\end{alignat}}
$$

The weights and biases are then updated as follows:

$$
\large{\begin{align}
    \mathbf{W}_{new} &= \mathbf{W}_{prev} + \alpha \partial_W \mathcal{L}\\[10pt]
    \mathbf{B}_{new} &= \mathbf{B}_{prev} + \alpha \partial_B \mathcal{L}
\end{align}}
$$

Loss function (mean squared error)

$$
\large{\begin{align}
    \mathcal{L} &= -\frac{1}{n} \bigg(Y - \mathbf{A}\bigg)^2 \\[10pt]
\end{align}}
$$

$$
\large{\begin{align}
    \partial_W\left\{\mathcal{L}\right\} &= -\frac{1}{n} \partial_W\left\{\partial_Z\left\{\partial_A\left\{\bigg(Y - \mathbf{A}\bigg)^2\right\}\right\}\right\} \\[10pt]
\end{align}}
$$

$$
\large{\begin{align}
\partial_W\left\{\mathcal{L}\right\} &= -\frac{1}{n} \partial_W\bigg\{\partial_Z\big\{ -2A\big\}\bigg\}\\[10pt]
\partial_W\left\{\mathcal{L}\right\} &= \frac{2}{n} \partial_W\bigg\{\partial_Z\big\{ \sigma\big( \mathbf{Z} \big) \big\}\bigg\}\\[10pt]
\end{align}}
$$

$$
\large{\begin{align}
\partial_W\left\{\mathcal{L}\right\} &= \frac{2}{n} \partial_W\bigg\{ d\sigma \big(\mathbf{Z}\big) \bigg\}\\[10pt]
% \partial_W\left\{\mathcal{L}\right\} &= \frac{1}{n} \partial_W\bigg\{\partial_Z\big\{ 2 \sigma\big( \mathbf{W} \cdot \mathbf{X} + \mathbf{B} \big) \big\}\bigg\}\\[10pt]
\end{align}}
$$

In [3]:
def one_hot(data_y, n_classes=None):
    """One-hot encode integer class labels, one column per sample.

    Parameters
    ----------
    data_y : array-like
        Class labels. Any shape is accepted and flattened first, so a column
        vector of shape ``(n, 1)`` encodes exactly like a flat ``(n,)`` vector.
        Float-valued labels are truncated to ``int``.
    n_classes : int, optional
        Number of rows (classes) in the result. When omitted it is inferred as
        ``max(label) + 1``, which is what the function always did; pass it
        explicitly when a batch might not contain the highest class.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_classes, n_labels)`` holding a single ``1``
        in each column, at the row equal to that sample's label.

    Raises
    ------
    ValueError
        If ``data_y`` is empty and ``n_classes`` is not given, or if any label
        lies outside ``[0, n_classes)``.
    """
    # Flatten and cast: fancy indexing needs a 1-D integer index. A (n, 1)
    # column would otherwise broadcast against arange(n) and set wrong cells,
    # and a float array would be rejected as an index.
    labels = np.asarray(data_y).astype(int).ravel()
    n_labels = labels.shape[0]

    if n_classes is None:
        if n_labels == 0:
            raise ValueError('cannot infer the number of classes from empty labels')
        n_classes = int(labels.max()) + 1

    # Negative labels would silently wrap around to the last columns.
    if n_labels and (labels.min() < 0 or labels.max() >= n_classes):
        raise ValueError(f'labels must lie in [0, {n_classes})')

    one_hoty = np.zeros((n_labels, n_classes))
    one_hoty[np.arange(n_labels), labels] = 1
    return one_hoty.transpose()

def _forward_prop(data_x):
    for layer in layers:
        print(layer.weights.shape, data_x.shape)
        data_z = matmul(layer.weights, data_x) + layer.bias
        data_x = layer.activation._act()(data_z)
    return data_z, data_x

## Define the model in code

In [21]:
# Size the single layer from the data: one input per pixel, `model_outputs` outputs.
model_outputs = 10
n_images, n_pixels_per_image, n_image_dims = train_x.shape
# 39900, 784, 1

image_len = train_x.shape[1]

# Keep the layer's hyper-parameters together in a small record ...
layer_args = namedtuple('layer', ['input_dim', 'output_dim', 'activation', 'name'])
layer_1 = layer_args(n_pixels_per_image, model_outputs, 'sigmoid', 'layer1')

# ... and build the 1 layer model straight from that record; the field names
# are exactly the keyword arguments passed to Layer.
model = Layer(**layer_1._asdict())

# Report the shapes that take part in W . X + B
print('model shapes:')
print(f' - shape of X = {train_x[0].shape}')
print(f' - shape of W = {model.weights.shape}')
print(f' - shape of B = {model.bias.shape}')
print(f' - activation = {model.activation.name}')

model shapes:
 - shape of X = (784, 1)
 - shape of W = (10, 784)
 - shape of B = (10, 1)
 - activation = sigmoid


In [30]:
# Forward pass on the first training image: Z = W . X + B, then A = sigma(Z)

sigma = model.activation._act()
z = model.weights @ train_x[0] + model.bias
a = sigma(z)

# Backward

