In [1]:
import math
import random
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from data import load_synth 
from data import load_mnist
from data import download_mnist
from data import save_mnist
from data import init
from data import load

In [3]:
# Activation functions
def sigmoid_array(z):
    """Element-wise logistic sigmoid, ``1 / (1 + exp(-z))``.

    The naive formula evaluates ``exp(-z)``, which overflows for large
    negative ``z``. This version only ever exponentiates ``-|z|`` and picks
    the algebraically equivalent branch for each sign, so no overflow occurs.

    Arguments:
    z - scalar or array-like of pre-activations

    Return: sigmoid of ``z`` with the same shape as ``z``
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
    sigmoid = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # arrays of any other rank untouched.
    return sigmoid[()]

def softmax_array(z):
    """Softmax over every entry of ``z`` (the whole array is one distribution).

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps ``exp`` from overflowing, which would
    otherwise turn the output into ``nan`` for large logits.

    Arguments:
    z - array-like of logits

    Return: array with the shape of ``z`` whose entries sum to 1
    """
    z = np.asarray(z)
    if z.size == 0:
        # Nothing to normalise (and np.max would raise on an empty array).
        return np.exp(z)
    exp_shifted = np.exp(z - np.max(z))
    output = exp_shifted / exp_shifted.sum()
    return output

In [4]:
# Unpack the train pair, the test pair and the class count returned by load_mnist().
(x_train, y_train), (x_test, y_test), num_cls = load_mnist()

In [5]:
def normalize_tensor(z):
    """Scale ``z`` so that the norm of the whole array equals 1.

    ``np.linalg.norm`` is called without ``ord``/``axis``, so a single
    global norm over all entries is used, not a per-row norm.

    Arguments:
    z - array-like to scale

    Return: ``z`` divided by its global norm; an all-zero (or empty) input
    is returned as zeros instead of producing ``nan`` from 0/0
    """
    z = np.asarray(z)
    factor = np.linalg.norm(z)
    if factor == 0:
        # Dividing by one instead of zero yields the same float dtype as the
        # regular path without the invalid-value warning.
        return z / 1.0
    normalized = z / factor

    return normalized

In [6]:
# Scale the training inputs with normalize_tensor before feeding them to the network.
normal_x_train = normalize_tensor(x_train)

In [7]:
def initialize_parameters_tensor(x_input, hidden_nodes=300, output_labels=10):
    """Draw standard-normal starting parameters for a two-layer network.

    Arguments:
    x_input - 2-D input array; only ``shape[1]`` (feature count) is read
    hidden_nodes - width of the hidden layer
    output_labels - number of output units

    Return: dict with
        'W' - (features, hidden_nodes) layer-1 weights
        'b' - (hidden_nodes, 1) layer-1 bias column
        'V' - (hidden_nodes, output_labels) layer-2 weights
        'c' - (output_labels, 1) layer-2 bias column
    """
    n_features = x_input.shape[1]

    # The draws happen in the order W, b, V, c so a seeded global generator
    # always yields the same parameters.
    layer1_weights = np.random.normal(0, 1, size=(n_features, hidden_nodes))
    layer1_bias = np.random.normal(0, 1, hidden_nodes)[:, np.newaxis]
    layer2_weights = np.random.normal(0, 1, size=(hidden_nodes, output_labels))
    layer2_bias = np.random.normal(0, 1, output_labels)[:, np.newaxis]

    return {
        'W': layer1_weights,
        'b': layer1_bias,
        'V': layer2_weights,
        'c': layer2_bias,
    }

In [21]:
def forward_prop_vector(x, W, b, V, c):
    """
    Forward pass of the two-layer network for a single input vector.

    Arguments:
    x - input vector; it is turned into a column before use
    W - weights, layer 1
    b - bias, layer 1
    V - weights, layer 2
    c - bias, layer 2

    Return: dict of intermediate values
        'k' - hidden pre-activation, W.T x + b
        'h' - hidden activation, sigmoid of k
        'o' - output pre-activation, V.T h + c
        'yhat' - softmax of o
    """
    column = np.expand_dims(x, axis=1)

    hidden_pre = W.T.dot(column) + b
    hidden_act = sigmoid_array(hidden_pre)

    output_pre = V.T.dot(hidden_act) + c
    probabilities = softmax_array(output_pre)

    return {
        'k': hidden_pre,
        'h': hidden_act,
        'o': output_pre,
        'yhat': probabilities,
    }

In [9]:
def compute_loss(ypred, y):
    """Negative log of the predicted value at index ``y``, scaled by length.

    Arguments:
    ypred - array of predictions, indexed along its first axis
    y - index of the true class

    Return: ``-log(ypred[y])`` divided by ``ypred.shape[0]``
    """
    scale = float(ypred.shape[0])
    negative_log = -(np.log(ypred[y]))
    return negative_log / scale

In [10]:
def vector_loss(yhat, y):
    """Cross-entropy of ``yhat`` against the one-hot target for class ``y``.

    Multiplying ``log(yhat)`` by a one-hot vector only keeps the entry at
    ``y``, so that entry is read directly. This removes the hard-coded
    10-class vector and avoids ``0 * log(0) = nan`` when a non-target
    probability is exactly zero.

    Arguments:
    yhat - array of predicted probabilities, indexed by class on axis 0
    y - index of the true class

    Return: ``-log(yhat[y])`` (summed if that entry is itself an array)
    divided by ``yhat.shape[0]``
    """
    num_classes = yhat.shape[0]
    loss = -np.sum(np.log(yhat[y]))

    return loss / float(num_classes)

In [11]:
def sigmoid_derivative(x):
    """Element-wise derivative of the sigmoid, ``s(x) * (1 - s(x))``.

    The product has to be element-wise: a matrix product of ``s.T`` with
    ``(1 - s)`` would collapse a column vector into a single summed value
    and give every unit the same slope.

    Arguments:
    x - array of pre-activations

    Return: array with the shape of ``x`` holding the sigmoid slope at
    each entry
    """
    sig = sigmoid_array(x)  # evaluate once and reuse
    sig_d = sig * (1.0 - sig)
    return sig_d

In [12]:
def back_prop_vector(x, W, b, V, c, y, parameters):
    """
    Backward pass of the two-layer network for a single training example.

    Arguments:
    x - input vector; it is turned into a column before use
    W, b - layer-1 weights and bias (not needed for the gradients, kept
           for a uniform call signature)
    V - layer-2 weights
    c - layer-2 bias (not needed for the gradients)
    y - index of the true class
    parameters - dict from the forward pass; 'h' (hidden activation) and
                 'yhat' (output probabilities) are read

    Return: dict with 'grad_o', 'grad_V', 'grad_c', 'grad_h', 'grad_W'
    and 'grad_b'
    """
    h = parameters['h']
    yhat = parameters['yhat']

    x = np.expand_dims(x, axis=1)

    # One-hot target sized from yhat, so any number of classes works.
    one_hot_vector = np.zeros_like(yhat, dtype=float)
    one_hot_vector[y] = 1.0

    # Softmax combined with cross-entropy: gradient w.r.t. the logits.
    grad_o = yhat - one_hot_vector
    grad_V = np.dot(h, grad_o.T)
    grad_c = grad_o
    grad_h = np.dot(V, grad_o)

    # Element-wise sigmoid slope taken from the cached activation; this
    # assumes h is the sigmoid of the hidden pre-activation, as produced by
    # forward_prop_vector.
    dh_dk = h * (1.0 - h)
    grad_k = grad_h * dh_dk

    grad_W = np.dot(x, grad_k.T)
    grad_b = grad_k
    gradients = {'grad_o': grad_o,
                 'grad_V': grad_V,
                 'grad_c': grad_c,
                 'grad_h': grad_h,
                 'grad_W': grad_W,
                 'grad_b': grad_b,
                }

    return gradients

In [13]:
def vector_propagation(x, W, b, V, c, y):
    """Run one forward and one backward pass for a single example.

    Arguments:
    x - input vector
    W, b - layer-1 weights and bias
    V, c - layer-2 weights and bias
    y - index of the true class

    Return: (cost, gradients), where cost comes from compute_loss on the
    predicted 'yhat' and gradients is the dict from back_prop_vector
    """
    forward_cache = forward_prop_vector(x, W, b, V, c)
    cost = compute_loss(forward_cache['yhat'], y)
    gradients = back_prop_vector(x, W, b, V, c, y, forward_cache)

    return cost, gradients

In [14]:
# Random starting parameters for a network with 300 hidden units and 10 outputs.
q5_weights = initialize_parameters_tensor(x_train, hidden_nodes=300, output_labels=10)

In [15]:
# Pull the four parameter arrays out of the dict, then report their shapes.
W1, b1 = q5_weights['W'], q5_weights['b']
W2, b2 = q5_weights['V'], q5_weights['c']

print('W1: ', W1.shape)
print('b1: ', b1.shape)
print('W2: ', W2.shape)
print('b2: ', b2.shape)

W1:  (784, 300)
b1:  (300, 1)
W2:  (300, 10)
b2:  (10, 1)


In [18]:
# Smoke test: one forward/backward pass on the first training example.
cost, grad = vector_propagation(normal_x_train[0], W1, b1, W2, b2, y_train[0])

x: (784, 1)
k: (300, 1)
h: (300, 1)
o: (10, 1)
yhat: (10, 1)


In [22]:
def gradient_descent(x_set, W, b, V, c, y_set, lr, epochs):
    """Train the two-layer network with per-example gradient descent.

    Arguments:
    x_set - iterable of input vectors
    W, b - initial layer-1 weights and bias
    V, c - initial layer-2 weights and bias
    y_set - iterable of true class indices, paired with x_set
    lr - learning rate
    epochs - number of passes over the data

    The initial parameters are copied, so the arrays passed in by the
    caller are left untouched; the trained values are in the returned dict.

    Return: tuple of
        weights - dict with the trained 'W', 'b', 'V', 'c'
        grads - gradients of the last processed example (None when no
                example was processed)
        loss_record - loss of every update, across all epochs
        avg_loss_per_epoch - mean loss of each epoch
        std_loss_per_epoch - standard deviation of the loss in each epoch
    """
    # Float copies: the in-place updates below must not leak into the
    # caller's arrays, otherwise re-running training silently continues
    # from already-trained weights.
    W = np.array(W, dtype=float)
    b = np.array(b, dtype=float)
    V = np.array(V, dtype=float)
    c = np.array(c, dtype=float)

    # Defined up front so the return is valid even with zero iterations.
    # The dict holds references, so it always reflects the latest updates.
    weights = {'W': W, 'b': b, 'V': V, 'c': c}
    grads = None

    loss_record = []
    avg_loss_per_epoch = []
    std_loss_per_epoch = []

    for _ in range(epochs):
        loss_per_epoch = []
        for x, y in zip(x_set, y_set):
            loss, grads = vector_propagation(x, W, b, V, c, y)

            # Updating weights:
            W -= lr * grads['grad_W']
            b -= lr * grads['grad_b']
            V -= lr * grads['grad_V']
            c -= lr * grads['grad_c']

            loss_record.append(loss)
            loss_per_epoch.append(loss)

        # Statistics of this epoch only, not of the cumulative history.
        if loss_per_epoch:
            avg_loss_per_epoch.append(np.mean(loss_per_epoch))
            std_loss_per_epoch.append(np.std(loss_per_epoch))
        else:
            avg_loss_per_epoch.append(float('nan'))
            std_loss_per_epoch.append(float('nan'))

    return weights, grads, loss_record, avg_loss_per_epoch, std_loss_per_epoch

In [23]:
# Train on the normalized training set: learning rate 0.01, 5 epochs.
weights1, grads1, loss1, avg_loss, std_loss = gradient_descent(normal_x_train, W1, b1, W2, b2, y_train, 0.01, 5)

In [None]:
# Display the average-loss list returned by gradient_descent.
avg_loss

In [None]:
# Display the loss standard-deviation list returned by gradient_descent.
std_loss

In [None]:
# Display the parameter dict returned by gradient_descent.
weights1