#### Forward prop for a 3 layer NN

<img src="../Screenshots/Screenshot (712).png">

In [8]:
import numpy as np

In [26]:
# Layer widths of the network: 3 inputs -> 4 hidden -> 4 hidden -> 1 output.
N_IN, N_HIDDEN, N_OUT = 3, 4, 1

# Input: one column vector of standard-normal samples.
x = np.random.randn(N_IN, 1)  # (3, 1)

# Weight matrices, uniform in [-1, 1). Each row belongs to one neuron of the
# layer, each column to one incoming activation: (neurons, inputs).
W1 = np.random.random((N_HIDDEN, N_IN)) * 2 - 1      # (4, 3)
W2 = np.random.random((N_HIDDEN, N_HIDDEN)) * 2 - 1  # (4, 4)
W3 = np.random.random((N_OUT, N_HIDDEN)) * 2 - 1     # (1, 4)

# Biases, uniform in [0, 1): one value per neuron, stored as a column.
b1 = np.random.random((N_HIDDEN, 1))  # (4, 1)
b2 = np.random.random((N_HIDDEN, 1))  # (4, 1)
b3 = np.random.random((N_OUT, 1))     # (1, 1)

In [27]:
def f(x):
    """Sigmoid activation, applied element-wise: 1 / (1 + e^(-x))."""
    return 1.0 / (1.0 + np.exp(-x))


# Hidden layer 1
# Each of the 4 neurons owns one row of W1, shape (1, 3), and computes
# row . x -> (1, 1) before the sigmoid. Stacking the 4 rows gives W1 with
# shape (4, 3), so the layer output is (4, 1).
# A bias belongs to a neuron (weights belong to edges), which is why every
# bias vector has shape (number of neurons, 1).
h1 = f(W1.dot(x) + b1)  # (4, 1)

# Hidden layer 2
# Same idea with h1 as the input: each neuron owns one row of W2, shape
# (1, 4), and computes row . h1 -> (1, 1). Four neurons give W2 with shape
# (4, 4) and a layer output of (4, 1).
h2 = f(W2.dot(h1) + b2)  # (4, 1)

# Output layer
# The single output neuron computes W3 . h2, i.e. (1, 4) . (4, 1) -> (1, 1),
# plus its bias. No activation is applied to the output.
out = W3.dot(h2) + b3

print(out)

[[-1.10223226]]


#### Backprop for a 2 layer NN with sigmoid activation

Reading: [A Neural Network in 11 lines of Python](https://iamtrask.github.io/2015/07/12/basic-python-network/)

Input - 3 neurons
Hidden layer 1 - 4 neurons
Output - 1 neuron

In [53]:
import numpy as np

In [88]:
# Fix the RNG seed so every run starts from the same weights and the
# printed numbers are reproducible.
np.random.seed(1)

# Training inputs: four samples stored as columns, three features each.
X = np.array([[0, 0, 1],
              [0, 1, 1],
              [1, 0, 1],
              [1, 1, 1]]).T  # (3, 4)
# Target for each sample (one per column): 1 exactly when the first two
# features differ.
y = np.array([[0, 1, 1, 0]])  # (1, 4)

# Weights drawn uniformly from [-1, 1), i.e. centred on 0.
# This network has no separate bias terms.
W1 = np.random.random((4, 3)) * 2 - 1  # input  -> hidden, (4, 3)
W2 = np.random.random((1, 4)) * 2 - 1  # hidden -> output, (1, 4)

# Full-batch gradient descent; the update uses the raw gradient
# (an implicit step size of 1).
for j in range(60000):
    # ---- forward pass (sigmoid at both layers) ----
    l1 = 1 / (1 + np.exp(-W1.dot(X)))   # hidden activations, (4, 4)
    l2 = 1 / (1 + np.exp(-W2.dot(l1)))  # network output,     (1, 4)

    # ---- backward pass: output layer ----
    # Gradient arriving from the loss: how far each prediction is from its target.
    l2_forward_grad = l2 - y
    # Derivative of the sigmoid, expressed through its own output.
    l2_local_grad = (1 - l2) * l2
    # Chain rule: the error scaled by how sensitive the output neuron is. (1, 4)
    l2_delta = l2_local_grad * l2_forward_grad

    # ---- backward pass: hidden layer ----
    # Send the output delta back through W2 to find how much each hidden
    # activation contributed to the output error.
    l1_forward_grad = np.dot(W2.T, l2_delta)
    l1_local_grad = (1 - l1) * l1
    l1_delta = l1_local_grad * l1_forward_grad  # (4, 4)

    # ---- weight update ----
    # Both deltas were computed above from the pre-update weights.
    W1 -= np.dot(l1_delta, X.T)   # (4, 3)
    W2 -= np.dot(l2_delta, l1.T)  # (1, 4)

    # Report the mean squared error of this iteration's forward pass.
    if not j % 20000:
        error = np.mean((l2 - y) ** 2)
        print("Iteration {} - MSE: {}".format(j, error))

print("Predicted: \n {}, \n Actual: \n {}".format(l2, y))

Iteration 0 - MSE: 0.2521887279676235
Iteration 20000 - MSE: 4.62903354203041e-05
Iteration 40000 - MSE: 2.209679630566262e-05
Predicted: 
 [[0.0040161  0.99558957 0.99692832 0.00356169]], 
 Actual: 
 [[0 1 1 0]]
