<a href="https://colab.research.google.com/github/mahausmani/deep_learning/blob/main/digit-recognition/mlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Introduction:**

This is a from-scratch implementation of a multilayer perceptron that classifies handwritten digits. The architecture contains an input layer with 784 (28 x 28) neurons, 1 hidden layer with 16 neurons, and an output layer with 10 neurons, one for each of the ten digits (0–9). Note that the training cells below instantiate the hidden layer with 512 neurons rather than 16.

<img src="images/1.png" style="width:650px;height:400px;">
<img src="images/2.png" style="width:650px;height:400px;">
<img src="images/2.png" style="width:650px;height:400px;">

## Imports

In [None]:
!pip install tensor-sensor

In [24]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tsensor import explain as exp

In [25]:
!git clone https://github.com/mahausmani/deep_learning.git

fatal: destination path 'deep_learning' already exists and is not an empty directory.


## Constants

In [152]:
# Training hyperparameters.
epochs = 20      # number of passes made by the training loops
lr = 0.01        # step size read by update_weights
batch_size = 64
batck_size = batch_size  # original (misspelled) name kept as an alias

# X_train is only built in the "Preparing Dataset" section further down, so on
# a fresh kernel it does not exist yet when this cell runs. Fall back to None
# instead of raising NameError; re-run this cell after the dataset cells to
# get the real sample count.
try:
    m = len(X_train)
except NameError:
    m = None

## Data Loading and Preprocessing

In [26]:
!unzip "/content/deep_learning/digit-recognition/data/mnist_data.zip"

Archive:  /content/deep_learning/digit-recognition/data/mnist_data.zip
replace mnist_test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace mnist_train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: mnist_train.csv         


In [27]:
# Read the two extracted CSV files into DataFrames.
train_csv_path = "/content/mnist_train.csv"
test_csv_path = "/content/mnist_test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [28]:
def rename_columns(df):
    """Rename every column after the first to 1, 2, 3, ... in place.

    The first column is left untouched (the notebook later reads it as the
    label via ``iloc[:, 0]``). The mapping is built from ``df``'s own columns,
    so the function no longer depends on the global ``test`` frame and works
    for any number of columns, not only 784.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.

    Returns
    -------
    None
    """
    feature_cols = df.columns[1:]
    # old name -> 1-based position among the non-label columns
    mapping = {old: new for new, old in enumerate(feature_cols, start=1)}
    df.rename(columns=mapping, inplace=True)


In [30]:
# Give both frames the same integer column names (train first, then test).
for frame in (train, test):
    rename_columns(frame)

## Preparing Dataset

In [31]:
# Split each frame into a feature matrix (every column but the first) and a
# label vector (the first column), each as a fresh NumPy array.
# NOTE(review): the feature values are used as read from the CSV; no scaling
# is applied in this cell.
X_train = train.iloc[:, 1:].to_numpy(copy=True)
Y_train = train.iloc[:, 0].to_numpy(copy=True)

X_test = test.iloc[:, 1:].to_numpy(copy=True)
Y_test = test.iloc[:, 0].to_numpy(copy=True)

In [90]:
# One-hot encode the training labels: row i gets a 1 in column Y_train[i].
# The ndim guard makes the cell safe to re-run: once Y_train has already been
# replaced by its 2-D one-hot form, running the cell again leaves it unchanged
# instead of failing while indexing with a whole row.
# The result is an integer ndarray of shape (n_samples, 10); it supports the
# same Y_train[i] / np.array(Y_train) accesses the training loops use.
labels = np.asarray(Y_train)
if labels.ndim == 1:
    Y_train2 = np.zeros((labels.size, 10), dtype=int)
    Y_train2[np.arange(labels.size), labels] = 1
    Y_train = Y_train2

## Helper functions


In [147]:
def sigmoid(z):
    """Element-wise logistic function 1 / (1 + exp(-z)), computed stably.

    The textbook form evaluates ``exp(-z)``, which overflows (with a
    RuntimeWarning) for large negative ``z``. Here only ``exp(-|z|)`` is
    evaluated, which always lies in (0, 1]:

    * z >= 0:  1 / (1 + exp(-z))
    * z <  0:  exp(z) / (1 + exp(z))

    Parameters
    ----------
    z : scalar or array-like

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray shaped like ``z``.
    """
    z = np.asarray(z)
    e = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))
    # `[()]` turns a 0-d result back into a scalar and is a no-op view for
    # arrays with one or more dimensions.
    return out[()]

In [None]:
def initialize_weights(input_units, output_units):
    """Sample a random weight matrix of shape (output_units, input_units).

    Entries are drawn from a zero-mean normal distribution whose standard
    deviation is sqrt(2 / (input_units + output_units)), using NumPy's global
    random state.
    """
    fan_total = input_units + output_units
    std = np.sqrt(2 / fan_total)
    shape = (output_units, input_units)
    return np.random.normal(loc=0.0, scale=std, size=shape)

def initialize_bias(output_units):
    """Return an all-zero bias array with `output_units` entries."""
    return np.zeros(shape=output_units)

Unvectorized implementation

In [None]:
def forward(W1, W2, X):
    """Forward pass of the two-layer network (no bias terms).

    Computes z2 = W1·X, a2 = sigmoid(z2), z3 = W2·a2, a3 = sigmoid(z3) and
    returns all four under the keys "z2", "a2", "z3" and "a3".
    """
    # Hidden layer: pre-activation, then activation.
    hidden_pre = np.dot(W1, X)
    hidden_act = sigmoid(hidden_pre)

    # Output layer: pre-activation, then activation.
    output_pre = np.dot(W2, hidden_act)
    output_act = sigmoid(output_pre)

    return {
        "a3": output_act,
        "z3": output_pre,
        "a2": hidden_act,
        "z2": hidden_pre,
    }

def backward(x, z2, a2, z3, a3, y, w2):
    """Gradients of the squared-error loss for ONE sample.

    For L = 1/2 * sum((a3 - y)**2) with sigmoid activations in both layers,
    returns dL/dW1 and dL/dW2. The error term is ``a3 - y`` (not ``y - a3``):
    the returned values are true gradients, so the ``w -= lr * dw`` step in
    ``update_weights`` descends the loss instead of climbing it.

    Parameters
    ----------
    x  : input vector, length n_in
    z2 : hidden pre-activation (unused; kept for interface compatibility)
    a2 : hidden activation, length n_hidden
    z3 : output pre-activation (unused; kept for interface compatibility)
    a3 : output activation, length n_out
    y  : target vector, length n_out (list or array)
    w2 : hidden-to-output weights, shape [n_out x n_hidden]

    Returns
    -------
    (dw1, dw2) with shapes [n_hidden x n_in] and [n_out x n_hidden].
    """
    x = np.ravel(x)
    a2 = np.ravel(a2)
    a3 = np.ravel(a3)
    y = np.ravel(y)

    # Output layer: dL/dz3 = (a3 - y) * sigmoid'(z3), with sigmoid' = a * (1 - a).
    delta3 = (a3 - y) * ((1 - a3) * a3)                  # [n_out]
    dw2 = np.outer(delta3, a2)                           # [n_out x n_hidden]

    # Hidden layer: push delta3 back through w2, then through the sigmoid.
    delta2 = (w2.T @ delta3) * ((1 - a2) * a2)           # [n_hidden]
    dw1 = np.outer(delta2, x)                            # [n_hidden x n_in]
    return dw1, dw2


Vectorized Implementation

In [127]:
def forward_vectorized(W1, W2, X):
    """Forward pass over a whole batch at once (no bias terms).

    Shapes the matrix products expect for 2-D inputs:
    W1 [n_hidden x n_in], W2 [n_out x n_hidden], X [n_in x n_samples].
    Returns a dict with keys "z2", "a2" (both [n_hidden x n_samples]) and
    "z3", "a3" (both [n_out x n_samples]).
    """
    pre_hidden = np.dot(W1, X)                 # [n_hidden x n_samples]
    act_hidden = sigmoid(pre_hidden)

    pre_out = np.dot(W2, act_hidden)           # [n_out x n_samples]
    act_out = sigmoid(pre_out)

    return {
        "a3": act_out,
        "z3": pre_out,
        "a2": act_hidden,
        "z2": pre_hidden,
    }

def backward_vectorized(x, z2, a2, z3, a3, y, w2):
    """Gradients of the squared-error loss for a whole batch.

    For L = 1/2 * sum((a3 - y)**2) with sigmoid activations in both layers,
    returns dL/dW1 and dL/dW2. The error term is ``a3 - y`` (not ``y - a3``):
    the returned values are true gradients, so the ``w -= lr * dw`` step in
    ``update_weights`` descends the loss. The matrix products sum the
    per-sample contributions over the batch; they are not averaged.

    Parameters
    ----------
    x  : inputs, [n_in x n_samples]
    z2 : hidden pre-activations (unused; kept for interface compatibility)
    a2 : hidden activations, [n_hidden x n_samples]
    z3 : output pre-activations (unused; kept for interface compatibility)
    a3 : output activations, [n_out x n_samples]
    y  : targets, [n_out x n_samples]
    w2 : hidden-to-output weights, [n_out x n_hidden]

    Returns
    -------
    (dw1, dw2) with shapes [n_hidden x n_in] and [n_out x n_hidden].
    """
    y = np.asarray(y)

    # Output layer: dL/dz3 = (a3 - y) * sigmoid'(z3), with sigmoid' = a * (1 - a).
    delta3 = (a3 - y) * ((1 - a3) * a3)                  # [n_out x n_samples]
    dw2 = delta3 @ a2.T                                  # [n_out x n_hidden]

    # Hidden layer: push delta3 back through w2, then through the sigmoid.
    delta2 = (w2.T @ delta3) * ((1 - a2) * a2)           # [n_hidden x n_samples]
    dw1 = delta2 @ x.T                                   # [n_hidden x n_in]
    return dw1, dw2


In [93]:
def update_weights(w1, w2, dw1, dw2, learning_rate=None):
    """Apply one gradient-descent step to both weight matrices, in place.

    Parameters
    ----------
    w1, w2 : weight arrays; modified in place by ``-=``
    dw1, dw2 : gradients matching ``w1`` and ``w2``
    learning_rate : float, optional
        Step size. When omitted, the notebook-level ``lr`` constant is used,
        which is the original behaviour.

    Returns
    -------
    (w1, w2) : the same (updated) objects, so callers may either rely on the
        in-place update or rebind the returned values.
    """
    if learning_rate is None:
        learning_rate = lr  # notebook-level constant from the Constants cell
    w1 -= learning_rate * dw1
    w2 -= learning_rate * dw2
    return w1, w2

## Loss Function

Loss function unvectorized

In [142]:
def calculate_loss(a, y):
    """Mean squared error between activations ``a`` and targets ``y``.

    Accepts either a single sample (1-D, length n_out), as passed by the
    per-sample training loop, or a batch laid out as [n_out x n_samples].
    The squared error is averaged over the output units and then over the
    samples. The original indexed ``a.shape[1]`` unconditionally, which raises
    IndexError for 1-D input, and divided by a hard-coded 10.

    Parameters
    ----------
    a : array-like, [n_out] or [n_out x n_samples]
    y : array-like of the same shape as ``a``

    Returns
    -------
    float-like scalar
    """
    sq_err = (np.asarray(a) - np.asarray(y)) ** 2
    # Treat a single 1-D sample as a one-column batch.
    sq_err = sq_err.reshape(sq_err.shape[0], -1)
    n_outputs, n_samples = sq_err.shape
    mse = np.sum(np.sum(sq_err, axis=0) / n_outputs) / n_samples
    return mse

Loss function vectorized

In [None]:
def calculate_loss_vectorized(a, y):
    """Mean squared error over every element of a batch.

    The original divided the summed squared error by ``len(a)`` (the number of
    rows), so for a 2-D batch the value grew with the number of samples and
    was not a mean. Averaging over all elements makes it a true MSE. For 1-D
    input the result is unchanged.

    Parameters
    ----------
    a : array-like of activations
    y : array-like of targets, same shape as ``a``

    Returns
    -------
    float-like scalar
    """
    mse = np.mean((np.asarray(a) - np.asarray(y)) ** 2)
    return mse

## Training Loop SGD

In [None]:
# Per-sample training: one forward/backward/update step for every training
# image, repeated for `epochs` passes. Network size is 784 -> 512 -> 10.
W1 = initialize_weights(784,512)
W2 = initialize_weights(512,10)

for epoch in range(epochs):
    for i, (image, y) in enumerate(zip(X_train, Y_train)):
        cache = forward(W1, W2, image)
        z2, a2, z3, a3 = (cache[key] for key in ("z2", "a2", "z3", "a3"))
        loss = calculate_loss(a3, y)
        dw1, dw2 = backward(image, z2, a2, z3, a3, y, W2)
        # update_weights modifies W1 and W2 in place, so its return value is
        # not needed here.
        update_weights(W1, W2, dw1, dw2)
        # Report the current sample's loss every 5000 samples.
        if i % 5000 == 0:
            print(f"Epoch: {epoch}, Loss = {loss}")


## Training Loop Vectorized

In [None]:
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training set and applies a single weight update.
W1 = initialize_weights(784,512)
W2 = initialize_weights(512,10)

# The batch layout does not change between epochs, so the list-to-array
# conversion and the transposes are done once, outside the loop.
# Expected shapes for the MNIST training set:
image = X_train.T                            # [60000 x 784] -> [784 x 60000]
y = np.array(Y_train).T                      # [60000 x 10]  -> [10 x 60000]

for epoch in range(epochs):
    cache = forward_vectorized(W1, W2, image)
    z2 = cache["z2"]
    a2 = cache["a2"]
    z3 = cache["z3"]
    a3 = cache["a3"]
    loss = calculate_loss_vectorized(a3,y)
    dw1, dw2 = backward_vectorized(image, z2, a2, z3, a3, y, W2)
    W1, W2 = update_weights(W1, W2, dw1, dw2)
    print(f"Epoch: {epoch}, Loss = {loss}")


In [None]:
# Scratch demo: run a small matrix product inside tensor-sensor's `explain`
# context manager.
# NOTE(review): this import repeats the one in the Imports section, and the
# alias `exp` is easy to confuse with np.exp.
from tsensor import explain as exp
a = np.array([[0,1],[0,1],[0,1]])  # 3 x 2
b = np.array([[2],[1]])  # 2 x 1
# NOTE(review): `c` is first bound to whatever exp().__enter__() returns and is
# then immediately rebound to the product below; confirm the `as c` is intended.
with exp() as c:
    c = a @ b  # (3 x 2) @ (2 x 1) -> 3 x 1