# Neural Network From Scratch

A simple implementation of a feed-forward neural network from scratch, using NumPy for the network itself and the MNIST digits as training data.

In [69]:
# Load some libraries
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from keras.datasets import mnist # cheating a little, loading easy mnist dataset from keras library
from keras.utils import np_utils
from sklearn.metrics import log_loss
from scipy.special import expit # more robust sigmoid

## Load and Preprocess our images

In [70]:
# Load the MNIST digits as (images, integer labels) pairs for train and test.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Normalize pixel values: cast to float32 and divide by 255.
X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

# Convert array of ints (digit values) to one-hot encoded categorical (10 classes).
y_train = np_utils.to_categorical(y_train, 10)
y_test = np_utils.to_categorical(y_test, 10)

# Flatten every image into a single row (28x28 -> 784). The row count comes
# from the array itself and -1 lets NumPy infer the feature count, so this
# no longer depends on the hard-coded 60000 / 10000 / 784 sizes.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

print("Input:", X_train.shape[1])
print("Output:", y_train.shape[1])

Input: 784
Output: 10


In [71]:
# Peek at the first training example: one flattened image row (784 values) after dividing by 255.
X_train[0]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [72]:
# Peek at the first training label: a one-hot vector with 10 entries (one per digit class).
y_train[0]

array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0.])

In [73]:
# Define some functions

# Activation Function
def activation(x, derivative=False):
    """Hidden-layer activation used throughout the notebook (currently ReLU).

    This is the single place to swap the hidden non-linearity: point it at
    `tanh` or `sigmoid` instead of `relu` to change the network.

    Args:
        x: Array to transform. When `derivative` is True, callers in this
            notebook pass the already-activated layer output.
        derivative: If True, return the derivative instead of the activation.

    Returns:
        Whatever the selected activation function returns for these arguments.
    """
    return relu(x, derivative=derivative)

def tanh(x, derivative=False):
    """Hyperbolic tangent activation, or its derivative.

    Args:
        x: Input array. For the derivative, `x` is treated as the *output*
            of tanh (i.e. a = tanh(z)), so the result is 1 - a**2.
        derivative: If True, return 1 - x**2 instead of tanh(x).

    Returns:
        Array of the same shape as `x`.
    """
    if not derivative:
        return np.tanh(x)
    squared = np.power(x, 2)
    return 1 - squared

def sigmoid(x, derivative=False):
    """Logistic sigmoid activation (via scipy's `expit`), or its derivative.

    Args:
        x: Input array. For the derivative, `x` is treated as the *output*
            of the sigmoid (a = sigmoid(z)), so the result is a * (1 - a).
        derivative: If True, return x * (1 - x) instead of expit(x).

    Returns:
        Array of the same shape as `x`.
    """
    if derivative:
        return x * (1 - x)
    return expit(x)

def relu(x, derivative=False):
    """Rectified linear unit, or its derivative.

    Args:
        x: NumPy array. For the derivative it may be either the
            pre-activation or the ReLU output; both give the same mask
            because ReLU preserves which entries are strictly positive.
        derivative: If True, return the 0/1 mask (x > 0) cast to x's dtype
            instead of max(x, 0).

    Returns:
        A new array with the same shape and dtype as `x`. The input is
        never modified.
    """
    if derivative:
        return (x > 0).astype(x.dtype)
    # The previous form, np.maximum(x, 0, x), passed `x` as the output buffer
    # and therefore overwrote the caller's array in place. Return a fresh
    # array so callers keep their original values.
    return np.maximum(x, 0)

def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    Vectorized replacement for applying `_softmax` to every row with
    `np.apply_along_axis`, which makes one Python-level call per row.

    Args:
        x: Array of shape (n_rows, n_classes); each row is normalized
            independently along axis 1.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    x = np.asanyarray(x)
    # Subtract each row's maximum before exponentiating so np.exp cannot overflow.
    exps = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exps / np.sum(exps, axis=1, keepdims=True)

def _softmax(x):
    """Softmax of a single 1-D score vector (kept for backward compatibility)."""
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)


def calculate_loss(model):
    """Compute the log loss of the network on the train and test sets.

    Reads the notebook-level arrays `X_train`, `y_train`, `X_test` and
    `y_test`, so each call runs a forward pass over both complete datasets.

    Args:
        model: Dict holding the parameters under the keys 'W1', 'b1',
            'W2' and 'b2'.

    Returns:
        Tuple `(train_loss, test_loss)` as returned by `log_loss`.
    """
    weights_1, bias_1, weights_2, bias_2 = (
        model[key] for key in ('W1', 'b1', 'W2', 'b2')
    )

    def forward(inputs):
        # Input -> Hidden 1: activation(X . W1 + b1)
        hidden = activation(inputs.dot(weights_1) + bias_1)
        # Hidden 1 -> Output: softmax class probabilities
        return softmax(hidden.dot(weights_2) + bias_2)

    # Forward propagation on the training data, then on the test data
    probs_train = forward(X_train)
    probs_test = forward(X_test)

    # Calculating the loss
    return log_loss(y_train, probs_train), log_loss(y_test, probs_test)


def get_mini_batches(X, y, batch_size):
    """Shuffle a dataset and cut it into consecutive mini-batches.

    Args:
        X: 2-D array of examples, one row per example.
        y: Array of targets aligned with the rows of `X`.
        batch_size: Maximum number of rows per batch; the final batch is
            smaller when len(y) is not a multiple of `batch_size`.

    Returns:
        List of `(X_batch, y_batch)` tuples that together cover every row
        exactly once, in an order drawn from NumPy's global RNG.
    """
    n_samples = len(y)
    # Sampling every index without replacement is a random permutation.
    order = np.random.choice(n_samples, n_samples, replace=False)
    X_perm, y_perm = X[order, :], y[order]

    batches = []
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        batches.append((X_perm[start:stop, :], y_perm[start:stop]))
    return batches

In [74]:
# Network dimensions and training hyperparameters

# Layer sizes
num_examples = len(X_train)       # number of training examples (rows of X_train)
nn_input_dim = X_train.shape[1]   # input layer width: features per example
nn_hdim_1 = 15                    # hidden layer width
nn_output_dim = y_train.shape[1]  # output layer width: number of classes

# Gradient descent settings
epochs = 700           # number of full passes over the training set
epsilon = 1e-3         # learning rate
reg_lambda = 0         # regularization strength (0 disables the penalty term)
mu = 1e-3              # momentum coefficient, expected in [0, 1]
batch_size = 128       # examples per mini-batch
print_loss = 10        # compute/print the loss and check early stopping every N epochs
stop_threshold = 1e-5  # stop early once the training loss changes by at most this much

In [75]:
# Seed NumPy's global RNG so the initial weights below (and every later draw
# from np.random, e.g. mini-batch shuffling) are reproducible when the
# notebook is re-run from this cell onward.
np.random.seed(42)

# Input -> Hidden 1
W1 = np.random.randn(nn_input_dim, nn_hdim_1).astype(np.float32)  # standard-normal initial weights
v1 = np.zeros_like(W1)  # momentum "velocity" for W1, starts at zero
b1 = np.zeros((1, nn_hdim_1))  # one bias per hidden unit, starts at zero

# Hidden 1 -> Output
W2 = np.random.randn(nn_hdim_1, nn_output_dim).astype(np.float32)  # standard-normal initial weights
v2 = np.zeros_like(W2)  # momentum "velocity" for W2, starts at zero
b2 = np.zeros((1, nn_output_dim))  # one bias per output class, starts at zero

# One weight per (input feature, hidden unit) pair
W1.shape

(784, 15)

In [None]:
# Train the network with mini-batch gradient descent.
# Momentum is applied to the weight matrices only; biases use plain gradient steps.

model = {}
losses_log = []  # [epoch, train_loss, test_loss] entries, used later to plot loss over time
prev_loss = 10000  # last recorded training loss; large initial value so the first check never stops training

# One outer iteration = one epoch (a full pass over the reshuffled training set)
for i in range(0, epochs):

    mini_batches = get_mini_batches(X_train, y_train, batch_size)
    for mb in mini_batches:
        X_mb = mb[0]
        y_mb = mb[1]

        # Forward propagation
        l1 = activation(X_mb.dot(W1) + b1) # Input -> Hidden 1 || activation(X . W1 + b1)
        output = softmax(l1.dot(W2) + b2) # Hidden 1 -> Output || softmax probabilities

        # Backpropagation
        # For a softmax output trained with cross-entropy on one-hot targets, the
        # gradient w.r.t. the pre-softmax scores is simply (probabilities - targets).
        output_error = output - y_mb
        # Push the error back through W2 and scale by the activation derivative,
        # which is evaluated on the already-activated hidden output l1.
        l1_error = output_error.dot(W2.T) * activation(l1, True)

        # NOTE(review): the weight gradients are sums over the batch (np.dot) while
        # the bias gradients are batch averages (np.average), so the two are scaled
        # differently -- confirm this is intended.
        dW2 = np.dot(l1.T, output_error)
        db2 = np.average(output_error, axis=0)
        dW1 = np.dot(X_mb.T, l1_error)
        db1 = np.average(l1_error, axis=0)

        # Add the regularization term to the weight gradients (biases are not regularized)
        dW2 += reg_lambda * W2
        dW1 += reg_lambda * W1

        # Momentum: new velocity = mu * previous velocity + learning rate * gradient
        v1 = mu * v1 + epsilon * dW1
        v2 = mu * v2 + epsilon * dW2

        # Update parameters in place
        W1 -= v1
        b1 += -epsilon * db1
        W2 -= v2
        b2 += -epsilon * db2

    # Snapshot of the current parameters; the dict references the same arrays that are updated in place above
    model = { 'W1': W1, 'b1': b1, 'W2': W2, 'b2': b2}

    # Optionally print the loss.
    # This is expensive because it uses the whole dataset, so we don't want to do it too often.
    # In the future, we should use a validation set to detect overfitting.
    if i % print_loss == 0 or i == epochs-1:
        loss_train, loss_test = calculate_loss(model)
        losses_log.append([i,loss_train, loss_test])
        print("Loss after iteration %i: %f" %(i, loss_train))

        # Stop early when the training loss changed by no more than stop_threshold
        # since the previous check (in either direction).
        if np.absolute(loss_train - prev_loss) <= stop_threshold:
            print("Gradient Descent Stopped Early!")
            break
        else:
            prev_loss = loss_train


Loss after iteration 0: 1.163329
Loss after iteration 10: 0.452741
Loss after iteration 20: 0.384089
Loss after iteration 30: 0.345098
Loss after iteration 40: 0.309269
Loss after iteration 50: 0.292867
Loss after iteration 60: 0.271944
Loss after iteration 70: 0.259494
Loss after iteration 80: 0.260454
Loss after iteration 90: 0.249945
Loss after iteration 100: 0.241630
Loss after iteration 110: 0.229690
Loss after iteration 120: 0.229492
Loss after iteration 130: 0.210972
Loss after iteration 140: 0.207654
Loss after iteration 150: 0.207518
Loss after iteration 160: 0.208228
Loss after iteration 170: 0.191769
Loss after iteration 180: 0.190699
Loss after iteration 190: 0.186778
Loss after iteration 200: 0.184160
Loss after iteration 210: 0.177734
Loss after iteration 220: 0.178292
Loss after iteration 230: 0.171745
Loss after iteration 240: 0.168617
Loss after iteration 250: 0.166656
Loss after iteration 260: 0.172095
Loss after iteration 270: 0.160377
Loss after iteration 280: 0.159

In [None]:
# Plot the recorded training and test loss against the epoch at which each was measured.

# Each losses_log entry is [epoch, train_loss, test_loss]
iters = [entry[0] for entry in losses_log]
losses_train = [entry[1] for entry in losses_log]
losses_test = [entry[2] for entry in losses_log]

# Label both curves and the axes so the figure can be read on its own;
# previously the two lines could not be told apart.
fig, ax = plt.subplots()
ax.plot(iters, losses_train, label="train")
ax.plot(iters, losses_test, label="test")
ax.set_xlabel("Epoch")
ax.set_ylabel("Log loss")
ax.set_title("Training vs. test loss")
ax.legend()
plt.show()

In [None]:
# Evaluate classification accuracy of the trained network on the train and test sets.

# Forward propagate to get class probabilities on the training data
l1_train = activation(X_train.dot(W1) + b1) # Input -> Hidden 1 || activation(X . W1 + b1)
output_train = softmax(l1_train.dot(W2) + b2) # Hidden 1 -> Output || softmax probabilities

# Forward propagate to get class probabilities on the test data
l1_test = activation(X_test.dot(W1) + b1) # Input -> Hidden 1 || activation(X . W1 + b1)
output_test = softmax(l1_test.dot(W2) + b2) # Hidden 1 -> Output || softmax probabilities

# A prediction is correct when the most probable class matches the position of
# the 1 in the one-hot label. Comparing the row-wise argmax of both arrays in
# one vectorized step replaces the per-example Python loops.
correct_train = int(np.sum(np.argmax(output_train, axis=1) == np.argmax(y_train, axis=1)))
correct_test = int(np.sum(np.argmax(output_test, axis=1) == np.argmax(y_test, axis=1)))

train_accuracy = correct_train / y_train.shape[0]
test_accuracy = correct_test / y_test.shape[0]

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)