# NumpyNet: Neural Network from Scratch
#### Garrett McCue

### Table of Contents
[1. Load Data](#load)

[2. Preprocessing ](#process)

[3. Initialize Parameters ](#init)

[4. Activation Functions ](#act)

[5. Forward Propagation ](#for)

[6. Back Propagation ](#back)

[7. Update Parameters](#update)

[8. Model](#model)

[9. Train Model](#tain)

[10. Test Model ](#test)

![ANN_URL](https://miro.medium.com/max/1160/0*u5-PcKYVfUE5s2by.gif)

In [1]:
# Import libraries
import tensorflow as tf
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import numpy as np

## Load Data  <a class="anchor" id="load"></a>

In [2]:
(X_train, Y_train),(X_test, Y_test) = tf.keras.datasets.mnist.load_data()

## Preprocess data <a class="anchor" id="process"></a>

In [3]:
#def normalize(x):
 #   return (x - np.min(x)) / (np.max(x) - np.min(x))

# Preprocessing image data
train_images = X_train.reshape((60000, 28*28)).T # reshape to feed into network
train_images = train_images.astype('float32') / 255 # normalize
#train_images = normalize(train_images)


test_images = X_test.reshape((10000, 28 * 28)).T # reshape to feed into network
test_images = test_images.astype('float32') / 225 # normalize
#test_images = normalize(test_images)

print("Shape of train_images: {}".format(train_images.shape))

Shape of train_images: (784, 60000)


In [4]:
# preprocess label data

# one hot encode the labels of shape (num examples, 1)
def one_hot(labels):
    # create matrix of correct sizing with zeros
    oh_labels = np.zeros((labels.size, labels.max() + 1))
    # specify for each row to place a 1 in the column corresponding to the correct label
    # i.e. if label = 2 ; [0 0 1 0 0 0 0 0 0 0]
    oh_labels[np.arange(labels.size), labels] = 1
    # flip so each column is an example
    oh_labels = oh_labels.T
    
    return oh_labels

train_labels = one_hot(Y_train)
test_labels = one_hot(Y_test)

print("Shape of train_labels: {}".format(train_labels.shape))

Shape of train_labels: (10, 60000)


## Initialize Parameters <a class="anchor" id="init"></a>


Network dimensions:  
$$ n^{[0]} = 784 $$
$$ n^{[1]} =  64 $$
$$ n^{[2]} = 10 $$

Initialize $W^{[l]}$ with He initialization and $b^{[l]}$ as zeros:
$$W^{[l]} = np.random.randn(n^{[l]}, n^{[l -1]}) \times \sqrt{\frac{2}{n^{[l-1]}}}$$
$$b^{[l]} = np.zeros(n^{[l]}, 1) $$

In [5]:
def initialize_params(seed):
    np.random.seed(seed)
    W1 = np.random.randn(64, 784) * np.sqrt(2./784)
    b1 = np.zeros((64, 1))
    W2 = np.random.randn(10, 64) * np.sqrt(2./64)
    b2 = np.zeros((10, 1))
   
    return W1, b1, W2, b2


## Activation Functions <a class="anchor" id="act"></a>
$ReLU$
$$
\mathrm{\sigma^{[l]}}(x) =  \begin{cases} 
    0 & if\;  x < 0 \\
    x & if\; x \geq 0
    \end{cases} \qquad \qquad
 \mathrm{\sigma^{\prime{[l]}}}(x) =  \begin{cases} 
    0 & if\;  x < 0 \\
    x & if\; x \geq 0
    \end{cases}$$
$Softmax$
$$ 
\sigma^{[L]}(x) = \frac{e^x}{\sum e^x} $$


In [6]:
# ReLU for hidden layers
def ReLU(Z):
    return np.maximum(Z, 0)

# for backprop through hidden layers
def ReLU_deriv(Z):
    return Z > 0

# Softmax for output layer
def softmax(Z):
    A = np.exp(Z)/ sum(np.exp(Z))
    return A

## Forward Propagation <a class="anchor" id="for"></a>
$$
Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}  
$$ 
$$
A^{[l]} = \sigma^{[l]}(Z^{[l]}) 
$$
$$ . . . $$
$$
A^{[L]} = \sigma^{[L]}(Z^{[L]}) = \hat{Y}
$$

In [7]:
def forward_prop(X, Y, W1, b1, W2, b2):
    m = X.shape[1]
    Z1 = np.dot(W1, X) + b1
    A1 = ReLU(Z1)
    Z2 = np.dot(W2, A1) + b2
    A2 = softmax(Z2) # make predictions
    
    cost = (-1/m) * np.sum(Y * np.log(A2+1e-9))

    return Z1, A1, Z2, A2, cost

## Back Propagation <a class="anchor" id="back"></a>
$$dZ^{[L]} = A^{[L]} - Y$$
$$ dW^{[L]} = \frac{1}{m}dZ^{[L]}A^{[L]^T}$$
$$ db^{[L]} = \frac{1}{m}\sum dZ^{[L]} $$
$$ dZ^{[L - 1]} = dW^{[L]^T}dZ^{[L]}\sigma^{\prime{[L]}}(Z^{[L-1]})$$
$$ ...$$
$$dZ^{[1]} = dW^{[1]^T}dZ^{[2]}\sigma^{\prime{[1]}}(Z^{[1]})$$
$$dW^{[1]} = \frac{1}{m}dZ^{1}A^{[1]^T}$$
$$db^{[1]} = \frac{1}{m}\sum dZ^{[1]}$$ 


In [8]:
def back_prop(X, Y, Z1, A1, Z2, A2, W1, W2): # Z2, W1 not needed
    # total sample count
    m = X.shape[1]

    # layer 2
    dZ2 = A2 - Y
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)

    # layer 1
    dZ1 = np.dot(W2.T, dZ2) * ReLU_deriv(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)

    return dW1, db1, dW2, db2

## Update Parameters <a class="anchor" id="update"></a>
$$W^{[l]} = W^{[l]} - \eta \, dW^{[l]} $$
$$b^{[l]} = b^{[l]} - \eta \, db^{[l]} $$

In [9]:
def update_parameters(W1, b1, W2, b2, dW1, dW2, db1, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2

    return W1, b1, W2, b2

## Evaluation functions

In [10]:
def get_labels(A2):
    return np.argmax(A2, axis=0)

def get_accuracy(A2, Y):
    return np.mean(A2.argmax(axis=0) == Y.argmax(axis=0))


## Model <a class="anchor" id="model"></a>

In [11]:
def nn(X, Y, alpha, iterations, seed=33):
    W1, b1, W2, b2 = initialize_params(seed)
    # loop through gradient descent based on the number of iterations and add accuracy to empty list
    acc = []
    costs = []
    for i in range(iterations):
        # forward prop
        Z1, A1, Z2, A2, cost = forward_prop(X, Y, W1, b1, W2, b2)
        costs.append(cost)
        # back prop
        dW1, db1, dW2, db2 = back_prop(X, Y, Z1, A1, Z2, A2, W1, W2)
        # update parameters
        W1, b1, W2, b2 = update_parameters(W1, b1, W2, b2, dW1, dW2, db1, db2, alpha)
        predictions = get_labels(A2)
        accuracy = get_accuracy(A2, Y)
        acc.append(accuracy)
        # display accuracy and predictions every 100 iterations of gradient descent
        if i % 100 == 0:
        
            print("Iteration: {} | Accuracy: {:.3f}% | Loss:{:.3f}".format(i, accuracy*100, costs[i]))
            
        
    return W1, b1, W2, b2, acc, costs

## Train Model <a class="anchor" id="train"></a>

In [12]:
W1, b1, W2, b2, accuracy, costs = nn(train_images, train_labels, 0.1, 500)
loss = pd.DataFrame(costs)
acc = pd.DataFrame(accuracy)
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Scatter(x=loss.index, y=loss[0], name='loss'),
    row=1, col=1
)      
fig.add_trace(
    go.Scatter(x=acc.index, y=acc[0], name='accuracy'),
    row=1, col=2
)
fig.update_layout(title='Training Results: {:.2f}%, learning rate=0.1'.format(acc[0].iat[-1]*100))
fig.write_image('figures/train.png')

Iteration: 0 | Accuracy: 13.878% | Loss:2.426
Iteration: 100 | Accuracy: 87.897% | Loss:0.467
Iteration: 200 | Accuracy: 89.910% | Loss:0.367
Iteration: 300 | Accuracy: 90.827% | Loss:0.327
Iteration: 400 | Accuracy: 91.552% | Loss:0.302


![](figures/train.png)

## Test model <a class="anchor" id="test"></a>

In [13]:
def nn_eval(X, Y, W1, b1, W2, b2):
    # forward pass with the weights from trained model
    Z1, A1, Z2, A2, loss = forward_prop(X, Y, W1, b1, W2, b2)
    preds = get_labels(A2)
    acc = get_accuracy(A2, Y)
   
    return preds, acc

In [14]:
pred_labels, accuracy = nn_eval(test_images, test_labels, W1, b1, W2, b2)
print("Accuracy: {:.2f} %".format(accuracy*100))

Accuracy: 92.18 %
