**Numpy neural network**

In [81]:
import numpy as np
import time
np.random.seed(1)

In [82]:
def relu(x):
    return (x > 0) * x 

def relu_grad(x):
    return x > 0

In [83]:
streetlights = np.array([[1,0,1], 
                         [0,1,1], 
                         [0,0,1], 
                         [1,1,1], 
                         [0,1,1], 
                         [1,0,1]])

In [84]:
walk_vs_stop = np.array([[0], [1], [0], [1], [1], [0]])

In [85]:
X,y = streetlights, walk_vs_stop

In [86]:
hidden_nodes = 8

In [87]:
epochs = 100  # number of iterations to go through the network

lr = 0.01      # how much we change the weights of the network each iteration

In [88]:
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5

In [89]:
correct_predictions = 0

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = relu(layer_in.dot(ws_1))
        layer_out = layer_1.dot(ws_2)
        
        #calc error/distance (how far are we from goal)
        delta_2 = layer_out - y[i:i+1]
        
        # Update weights
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))

        #calc the the error each node in prev layer contributed
        delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))
        
        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    if epoch % 10 == 0:
        accuracy = correct_predictions / X.shape[0]
        error = delta_2 ** 2
        print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 0.019496, Accuracy = 50.00%
Epoch 10: Error = 0.046335, Accuracy = 83.33%
Epoch 20: Error = 0.063009, Accuracy = 100.00%
Epoch 30: Error = 0.062283, Accuracy = 100.00%
Epoch 40: Error = 0.052676, Accuracy = 100.00%
Epoch 50: Error = 0.041539, Accuracy = 100.00%
Epoch 60: Error = 0.031829, Accuracy = 100.00%
Epoch 70: Error = 0.0242, Accuracy = 100.00%
Epoch 80: Error = 0.018454, Accuracy = 100.00%
Epoch 90: Error = 0.015026, Accuracy = 100.00%


**Replacing ReLU activation function with Sigmoid activation function**

In [90]:
def sigmoid(x):
    return 1/(1+np.exp(-x))

def sigmoid_grad(x):
    return sigmoid(x)*(1-sigmoid(x))

In [91]:
np.random.seed(1)
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5

correct_predictions = 0

for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = sigmoid(layer_in.dot(ws_1))
        layer_out = layer_1.dot(ws_2)
        
        #calc error/distance (how far are we from goal)
        delta_2 = layer_out - y[i:i+1]
        
        # Update weights
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        delta_1 = delta_2.dot(ws_2.T) * sigmoid_grad(layer_1)
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))
        
        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    if epoch % 10 == 0:
        accuracy = correct_predictions / X.shape[0]
        error = delta_2 ** 2
        print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 1e-05, Accuracy = 50.00%
Epoch 10: Error = 0.118293, Accuracy = 50.00%
Epoch 20: Error = 0.195965, Accuracy = 66.67%
Epoch 30: Error = 0.217885, Accuracy = 100.00%
Epoch 40: Error = 0.21912, Accuracy = 100.00%
Epoch 50: Error = 0.21389, Accuracy = 100.00%
Epoch 60: Error = 0.206761, Accuracy = 100.00%
Epoch 70: Error = 0.199111, Accuracy = 100.00%
Epoch 80: Error = 0.191344, Accuracy = 100.00%
Epoch 90: Error = 0.183578, Accuracy = 100.00%


As one can see, when using the ReLU function, the accuracy is 50%  in the beginning, before it gradually increases and quickly converges towards 100%. On the other hand, the model seems to require more epochs to converge when using the Sigmoid function - both converge towards 100%

There could be several reasons for the observation described above to occur. One reason could be that the sigmoid activation function is much more sensitive to the initial weights and learning rate than the ReLU function is. Also, one can experience the vanishing gradient problem where the perceptron training rule does little to change the "update term" being extremely small - the sigmoid activation function is susceptible to this problem due to the gradient never going higher than 0.25 as seen in the graph below, thus requiring more epochs to converge.

<img src="https://miro.medium.com/v2/resize:fit:1400/1*6A3A_rt4YmumHusvTvVTxw.png"></img>

Image source: https://miro.medium.com/v2/resize:fit:1400/1*6A3A_rt4YmumHusvTvVTxw.png

**Learning rates and epochs**

In [92]:
learning_rates = [0.001, 0.01, 0.1, 1, 10]
epochs = 120

for learning_rate in learning_rates:
    print(f"Learning rate: {learning_rate}")
    np.random.seed(1)
    ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
    ws_2 = np.random.rand(hidden_nodes, y.shape[1]) - 0.5

    correct_predictions = 0

    for epoch in range(epochs):
        for i in range(X.shape[0]):
            layer_in = X[i:i+1]
            
            # Forward pass/prediction
            layer_1 = relu(layer_in.dot(ws_1))
            layer_out = layer_1.dot(ws_2)
            
            #calc error/distance (how far are we from goal)
            delta_2 = layer_out - y[i:i+1]
            
            # Update weights
            ws_2 -= learning_rate * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
            delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
            ws_1 -= learning_rate * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))
            
            if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
                correct_predictions += 1

        accuracy = correct_predictions / X.shape[0]
        error = delta_2 ** 2
        if accuracy == 1.0:
            print("Reached 100% accuracy at epoch", epoch, "\n")
            break
        if epoch == epochs - 1:
            print(f"Does not converge after {epochs} epochs\n")

        correct_predictions = 0

Learning rate: 0.001
Reached 100% accuracy at epoch 116 

Learning rate: 0.01
Reached 100% accuracy at epoch 12 

Learning rate: 0.1
Reached 100% accuracy at epoch 1 

Learning rate: 1
Does not converge after 120 epochs

Learning rate: 10
Does not converge after 120 epochs



  error = delta_2 ** 2


From experimenting with the necessary epochs to find the best learning rate, it was found that among the learning rates that enable the model to converge, the learning rate 0.001 required the most epochs (116), therefore setting epochs to 120 seems like a reasonable choice.

The largest learning rates (1 and 10) does not converge at all. In most cases a learning rate greater than 1 is not advisable, due to the risk of overshooting, fluctuation and divergence - as demonstrated in this case.

The best learning rate (0.1) reaches 100% accuracy already at 1 epoch and is picked as the preferred learning rate proceeding forward.

It is worth noting that the observations above would be different if we proceeded with using the Sigmoid activation function instead of the ReLU function, here I have chosen to go back to the ReLU function as it seems to increase the performance.

**Adding another hidden layer**

Here, the learning rate has been set to the optimal value found previously and epochs has been set to 5 as this is more than enough to illustrate further improvements of the model.

In [93]:
np.random.seed(1)
hidden_nodes_2 = 6
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
lr = 0.1
epochs = 5

correct_predictions = 0
for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = relu(layer_in.dot(ws_1))
        layer_2 = relu(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)
        
        #calc error/distance (how far are we from goal)
        delta_3 = layer_out - y[i:i+1]
        delta_2 = delta_3.dot(ws_3.T) * relu_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
        
        # Update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))
        
        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    accuracy = correct_predictions / X.shape[0]
    error = delta_3 ** 2
    print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 0.003139, Accuracy = 50.00%
Epoch 1: Error = 0.016075, Accuracy = 50.00%
Epoch 2: Error = 0.050922, Accuracy = 50.00%
Epoch 3: Error = 0.08964, Accuracy = 100.00%
Epoch 4: Error = 0.096032, Accuracy = 100.00%


Adding another hidden layer seems to make the network less performant. This could be due to the fact that the problem that is attempted to be modelled, is a very simple one, hence increasing the model complexity might cause overfitting.

**Understanding the effect of the activation function**

(a)

In [94]:
np.random.seed(1)
hidden_nodes_2 = 6
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
lr = 0.1
epochs = 5

correct_predictions = 0
for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = relu(layer_in.dot(ws_1))
        layer_2 = layer_1.dot(ws_2)
        layer_out = layer_2.dot(ws_3)
        
        #calc error/distance (how far are we from goal)
        delta_3 = layer_out - y[i:i+1]
        delta_2 = delta_3.dot(ws_3.T) 
        delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
        
        # Update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))
        
        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    accuracy = correct_predictions / X.shape[0]
    error = delta_3 ** 2
    print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 0.014366, Accuracy = 50.00%
Epoch 1: Error = 0.050342, Accuracy = 50.00%
Epoch 2: Error = 0.091095, Accuracy = 100.00%
Epoch 3: Error = 0.093618, Accuracy = 100.00%
Epoch 4: Error = 0.075225, Accuracy = 100.00%


Adding the activation function in the first hidden layer gives the model a degree of non-linearity, which in turn would be beneficial if the data that is to be modelled was complex. In this case it does not seem necessary as demonstrated in (b).

(b)

In [95]:
np.random.seed(1)
hidden_nodes_2 = 6
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
lr = 0.1
epochs = 5

correct_predictions = 0
for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = layer_in.dot(ws_1)
        layer_2 = relu(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)
        
        #calc error/distance (how far are we from goal)
        delta_3 = layer_out - y[i:i+1]
        delta_2 = delta_3.dot(ws_3.T) * relu_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) 
        
        # Update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    accuracy = correct_predictions / X.shape[0]
    error = delta_3 ** 2
    print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 0.00423, Accuracy = 50.00%
Epoch 1: Error = 0.064511, Accuracy = 83.33%
Epoch 2: Error = 0.07154, Accuracy = 100.00%
Epoch 3: Error = 0.043882, Accuracy = 100.00%
Epoch 4: Error = 0.022942, Accuracy = 100.00%


Due to the problem at hand being relatively simple without the raw input sporting a lot of complex relationships, it seems that keeping the first layer linear is okay due to the second layer introducing sufficient non-linearity. Out of the three alternatives, (b) gives the best results.

(c)

In [96]:
np.random.seed(1)
hidden_nodes_2 = 6
ws_1 = np.random.rand(X.shape[1], hidden_nodes) - 0.5
ws_2 = np.random.rand(hidden_nodes, hidden_nodes_2) - 0.5
ws_3 = np.random.rand(hidden_nodes_2, y.shape[1]) - 0.5
lr = 0.1
epochs = 5

correct_predictions = 0
for epoch in range(epochs):
    for i in range(X.shape[0]):
        layer_in = X[i:i+1]
        
        # Forward pass/prediction
        layer_1 = relu(layer_in.dot(ws_1))
        layer_2 = relu(layer_1.dot(ws_2))
        layer_out = layer_2.dot(ws_3)
        
        #calc error/distance (how far are we from goal)
        delta_3 = layer_out - y[i:i+1]
        delta_2 = delta_3.dot(ws_3.T) * relu_grad(layer_2)
        delta_1 = delta_2.dot(ws_2.T) * relu_grad(layer_1)
        
        # Update weights
        ws_3 -= lr * (layer_2.T.reshape(hidden_nodes_2, 1).dot(delta_3))
        ws_2 -= lr * (layer_1.T.reshape(hidden_nodes, 1).dot(delta_2))
        ws_1 -= lr * (layer_in.T.reshape(X.shape[1], 1).dot(delta_1))

        if abs(layer_out[0][0] - y[i:i+1][0][0]) < 0.5:
            correct_predictions += 1

    accuracy = correct_predictions / X.shape[0]
    error = delta_3 ** 2
    print(f"Epoch {epoch}: Error = {round(error[0][0], 6)}, Accuracy = {accuracy * 100:.2f}%")
    correct_predictions = 0

Epoch 0: Error = 0.003139, Accuracy = 50.00%
Epoch 1: Error = 0.016075, Accuracy = 50.00%
Epoch 2: Error = 0.050922, Accuracy = 50.00%
Epoch 3: Error = 0.08964, Accuracy = 100.00%
Epoch 4: Error = 0.096032, Accuracy = 100.00%


Option (c) gives the highest level of non-linearity which causes the model to be able to capture complex relationships in the data. In this case however, it leads to decreased performance due to the problem at hand being relatively simple.

In the models above, one could potentially reduce the number of weight parameters by reducing the amount of neurons in each of the hidden layers (as long as it is not done excessively). One could also implement weight sharing for the first or second layer.