Neural Networks and Deep Learning
Cracow University of Technology

Lab Assignment 5:

The purpose of this laboratory is to implement a neural network for a classification task:



1.   The network is trained using minibatch stochastic gradient descent.
2.   You have images of handwritten digits from the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) and you should train the network to predict the value of the digit for images.

Network specification:

1.   Input layer - one hidden layer - output layer
2.   Activation functions: for hidden layer "ReLU" and for output layer "softmax"
3.   Loss function: categorical cross-entropy



In [1]:
import numpy as np

def sigmoid(x):
    x = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-x))


def der_sigmoid(x):
    return sigmoid(x) * (1 - sigmoid(x))


def relu(x):
    return x * (x > 0)


def der_relu(x):
    return 1. * (x > 0)


def softmax(x):
    b = x.max()
    y = np.exp(x - b)
    return y / y.sum()


def cross_entropy_loss(y, yHat):
    return -np.sum(y * np.log(yHat))


def mean_squared_error(y, yHat):
    return np.square(np.subtract(y, yHat)).mean()

Your code consists of at least five functions:

* Network initialization
* Forward pass
* Backward pass
* Train 
* Evaluate

You are free to add more functions for the sake of having better organization for your code.

Tune your network by changing hyperparametes of the network:
* Number of epochs
* Number of neurons in hidden layer
* Different learning rates
* Different minibatch sizes

Also, try the following changes to the network:
* Apply different optimziation algorithms: Momentum, Adagrad, RMSprop, and ADAM
* Apply L2 regularization techniques to the loss function

Please submit your code with report on the error rate. You can also compare your results with the MNIST performance results exists on the MNIST website.
Please also report the effect of different changes you made in the network.

In [2]:
import tensorflow as tf

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(path="mnist.npz")

In [3]:
input_size = len(x_train[0]) * len(x_train[0][0])
hidden_layer_nodes = 32
output_size = 10

weights = [
    np.random.randn(hidden_layer_nodes, input_size),
    np.random.randn(output_size, hidden_layer_nodes)
]

biases = [np.zeros(hidden_layer_nodes), np.zeros(output_size)]

In [4]:
def one_hot_encoder(x, output_num):
    result = np.zeros(output_num)
    result[x] = 1
    return result

In [5]:
def calc_activation(x, weights):
    activations = []
    activation = x.flatten()
    for idx, w in enumerate(weights):
        z = np.dot(w, activation) + biases[idx]
        if idx < len(weights) - 1:
            activation = sigmoid(z)
        else:
            activation = softmax(z)
        activations.append(activation)
    return activations

In [6]:
def feed_forward_sample(x, y, l1=0.01, l2=0.02):
    activations_arr = calc_activation(x, weights)
    activation = activations_arr[-1]
    one_hot_y = one_hot_encoder(y, output_size)
    loss = cross_entropy_loss(one_hot_y, activation)
    # apply penalties
    for w, b in zip(weights, biases):
        loss += l1 * (np.sum(abs(w)) + np.sum(abs(b))) + l2 * (np.sum((w) ** 2) + np.sum((b) ** 2))

    one_hot_prediction = np.zeros_like(activation)
    one_hot_prediction[np.argmax(activation)] = 1
    return loss, one_hot_prediction

In [7]:
def evaluate_dataset(x, y):
    loss_arr = np.empty(x.shape[0])
    one_hot_predictions = np.empty((x.shape[0], output_size))

    for i in range(x.shape[0]):
        loss_arr[i], one_hot_predictions[i] = feed_forward_sample(x[i], y[i])

    print("Average loss=", np.round(np.average(loss_arr), decimals=2))

    y_one_hot = np.zeros((y.size, output_size))
    y_one_hot[np.arange(y.size), y] = 1

    correct_predictions = np.sum(y_one_hot * one_hot_predictions)
    correct_pred_percent = format((correct_predictions / y.shape[0]) * 100, ".2f")
    print("Accuracy (% of correct predictions):", correct_predictions, "/", y.shape[0], "[", correct_pred_percent, "%]")

In [8]:
def train_one_sample(x, y, lr_rate=0.001, l1=0.01, l2=0.02, momentum=0.9):
    sample = x.flatten()
    activations = calc_activation(sample, weights)
    a = activations[-1]
    one_hot_y = one_hot_encoder(y, output_size)
    # loss = cross_entropy_loss(one_hot_y, sample)
    # # apply penalties
    # for w, b in zip(weights, biases):
    #     loss += l1 * (np.sum(abs(w)) + np.sum(abs(b))) + l2 * (np.sum((w) ** 2) + np.sum((b) ** 2))

    one_hot_guess = np.zeros_like(a)
    one_hot_guess[np.argmax(a)] = 1

    weight_gradients = [None] * len(weights)
    bias_gradients = [None] * len(weights)
    activation_gradients = [None] * (len(weights) - 1)
    # change_w = np.zeros(len(weights))
    # change_b = np.zeros(len(biases))

    for i in reversed(range(len(weights))):

        if i == len(weights) - 1:
            y = one_hot_y.reshape(-1, 1)
            a = activations[i].reshape(-1, 1)
            a_prev = activations[i - 1].reshape(-1, 1)

            weight_gradients[i] = np.dot((a - y), a_prev.T)
            bias_gradients[i] = a - y
        else:
            w_next = weights[i + 1]
            a_next = activations[i + 1].reshape(-1, 1)
            y = one_hot_y.reshape(-1, 1)
            a = activations[i].reshape(-1, 1)
            if i > 0:
                a_prev = activations[i - 1].reshape(-1, 1)
            else:
                a_prev = x.flatten().reshape(-1, 1)

            if i == len(weights) - 2:
                dA = np.dot(w_next.T, (a_next - y))
                activation_gradients[i] = dA
            else:
                dA_next = activation_gradients[i + 1]
                dA = np.dot(w_next.T, (der_sigmoid(a_next) * dA_next))
                activation_gradients[i] = dA

            z = der_sigmoid(a) * dA
            weight_gradients[i] = np.dot(z, a_prev.T)
            bias_gradients[i] = z

        #implementing Nesterov Momentum
        # change_w[i] = -weight_gradients[i] * lr_rate + (momentum * change_w[i])
        # change_w[i] = -weight_gradients[i] * lr_rate + (momentum * change_w[i])
        weights[i] -= weight_gradients[i] * lr_rate
        biases[i] -= bias_gradients[i].flatten() * lr_rate

In [9]:
def train_dataset(learning_rate=0.0001):
    for idx in range(x_train.shape[0]):
        train_one_sample(x_train[idx], y_train[idx], learning_rate)

In [10]:
def train_test_epoch():
    train_dataset()
    evaluate_dataset(x_test, y_test)

In [11]:
epochs = 5

for e in range(epochs):
    print("\nEpoch {} / {}".format(e + 1, epochs))
    train_test_epoch()


Epoch 1 / 5
Average loss= 1360.59
Accuracy (% of correct predictions): 8371.0 / 10000 [ 83.71 %]

Epoch 2 / 5
Average loss= 2311.15
Accuracy (% of correct predictions): 8520.0 / 10000 [ 85.20 %]

Epoch 3 / 5
Average loss= 3458.39
Accuracy (% of correct predictions): 8627.0 / 10000 [ 86.27 %]

Epoch 4 / 5
Average loss= 4764.76
Accuracy (% of correct predictions): 8736.0 / 10000 [ 87.36 %]

Epoch 5 / 5
Average loss= 6229.29
Accuracy (% of correct predictions): 8715.0 / 10000 [ 87.15 %]
