In [36]:
#Reference
#https://medium.com/towards-artificial-intelligence/nothing-but-numpy-understanding-creating-neural-networks-with-computational-graphs-from-scratch-6299901091b0

## [1] Basic Neural network

In [37]:
import numpy as np
import pandas as pd

<img src="images/img_01_nn.png">

In [38]:
#https://medium.com/towards-artificial-intelligence/nothing-but-numpy-understanding-creating-neural-networks-with-computational-graphs-from-scratch-6299901091b0

In [39]:
data = np.array([[0, 0, 0], [0, 1, 1], [1, 0, 1], [1, 1, 1]])
data

array([[0, 0, 0],
       [0, 1, 1],
       [1, 0, 1],
       [1, 1, 1]])

In [40]:
"""
weights: Contains weight whose value neural network would learn.
"""
weights = np.array([0.1, 0.6])

print(weights.shape)
weights

(2,)


array([0.1, 0.6])

In [41]:
"""
Initially set bias to Zero
"""
bias = 0

In [42]:
"""
Just extracting input features from data points
"""
X_O = data[:, :2]
print(X_O.shape)
X_O

(4, 2)


array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

In [43]:
"""
As we need to perform dot product in next step, therefore to define valid dimesions
we have to take transpose of X_O
"""
X = X_O.T
print(X.shape)
X

(2, 4)


array([[0, 0, 1, 1],
       [0, 1, 0, 1]])

In [44]:
"""
Extracting output values corresponding to input features
"""
Y =  data[:, -1]
Y

array([0, 1, 1, 1])

### [1.1] Forward propagation

In [45]:
"""
Here we are doing linear computations of all examples (On example contains available all input features) in one training
dataset simulataneously.
         

Where X : array([[0, 0, 0],
                 [0, 1, 1],
                 [1, 0, 1],
                 [1, 1, 1]])

And

weights: array([0.1, 0.6])
"""
Z = np.add(bias, np.dot(weights, X))
print(Z.shape)
Z

(4,)


array([0. , 0.6, 0.1, 0.7])

In [46]:
def compute_sigmoid(z):
    """
    We are here trying to squash output to range (0, 1)
    Note: Parenthesis implies exclusive boundary values.
    
    Here we input linear computation to sigmoid function and gets output range (0, 1)
    
    z: Linear equation.
    For this neural network it would be
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]    
    
    sigmoid output
    """
    
    sig = np.divide(1, np.add(1, np.exp(-z)))
    
    return sig

In [47]:
def compute_cost(y, y_hat, m, const = 2):
    """
    As we're considering all example simulataneously therefore we're using
    cost function instead loss function.
    
    We have to sum up all the loos due to each example and compute average.
    """
    
    total_cost = np.sum(np.divide(np.subtract(y, y_hat) ** 2, const))
    
    avg_cost = np.divide(total_cost, m)

    return avg_cost

In [48]:
y_hat = compute_sigmoid(Z)
y_hat

array([0.5       , 0.64565631, 0.52497919, 0.66818777])

In [49]:
avg_cost = compute_cost(Y, y_hat, len(y_hat))
avg_cost

0.08891294752305501

### [1.2] Backward propagation

In [50]:
def del_cost__by__del_y_hat(y, y_hat, m):
    """
    here we'are computing gradient of cost function w.r.t y_hat
    as we know cost function is (y - y_hat)^2 / ( 2 * m )
    Therefore it's gardien would be
    
    (-1 / m)(y - y_hat)
    
    avg_grad: vector of m length. Each element contains loss corresponding to each example.
    """
    
    grad = -np.subtract(y, y_hat)
    
    avg_grad = np.divide(grad, m)
    
    return avg_grad

In [51]:
local__del_cost__by__del_y_hat = del_cost__by__del_y_hat(Y, y_hat, len(y_hat))
local__del_cost__by__del_y_hat

array([ 0.125     , -0.08858592, -0.1187552 , -0.08295306])

In [52]:
def compute__del_y_hat__by__del_z(y_hat):
    """
        del_y_hat__by__del_z [$(y_hat)/$(z)] 
            As we know
            y_hat = sigmoid(z), therefore,
            $(sigmoid(z)) / $(z) : sigmoid(z)[1 - sigmoid(z)] OR [y_hat * (1 - y_hat)] 
    """

    one_sub_sigma = np.subtract(1, y_hat)
    
    grad = np.multiply(y_hat, one_sub_sigma)

    return grad

In [53]:
"""
Local gradient at z node of 'y_hat' w.r.t 'z'
"""
local__del_y_hat__by__del_z = compute__del_y_hat__by__del_z(y_hat)
local__del_y_hat__by__del_z

array([0.25      , 0.22878424, 0.24937604, 0.22171287])

In [54]:
def final_grad_at__z(local__del_cost__by__del_y_hat, local__del_y_hat__by__del_z):
    """
    Here we're computing final gradient at Z node i.e. multiplication of
    local gradient at Z node and incming gradient from cost.
    ( $cost/$y_hat * $y_hat/$z )
    
    local__del_cost__by__del_y_hat : Gradient of cost function i.e. ($cost/$y_hat)
    grad_y_hat__by__del_z: Local gradient at node Z ($y_hat/$z)
    
    grad_at_node__z: final gradient at node z
    """
    grad_at_node__z = np.multiply(local__del_cost__by__del_y_hat, local__del_y_hat__by__del_z)
    
    return grad_at_node__z

In [55]:
final__del_cost__by__del_z = final_grad_at__z(local__del_cost__by__del_y_hat, local__del_y_hat__by__del_z)
final__del_cost__by__del_z

array([ 0.03125   , -0.02026706, -0.0296147 , -0.01839176])

##### Compute z/$w

In [56]:
"""
1. w is a vector i.e. np.array([0.1, 0.6])

2. We do have 4 examples in training dataset

array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

jacob__del_z__by__del_w matrix would contains

 __       __               __                    __               __                    __
|           |             |                        |             |                        |
|   $z1/$w  |             |   $z1/$w1    $z1/$w2   |             |   x[0][0]    x[0][1]   |
|           |             |                        |             |                        |
|   $z2/$w  |             |   $z2/$w1    $z2/$w2   |             |   x[1][0]    x[1][1]   |
|           |    ===>     |                        |    ===>     |                        | 
|   $z3/$w  |             |   $z3/$w1    $z3/$w2   |             |   x[2][0]    x[2][1]   |
|           |             |                        |             |                        |
|   $z4/$w  |             |   $z4/$w1    $z4/$w2   |             |   x[3][0]    x[3][1]   |
|__       __|             |__                    __|             |__                    __|

In above matrix: 

    z1 = weight[0] * X[0][0] + weight[1] * X[0][1] + b
    z2 = weight[0] * X[1][0] + weight[1] * X[1][1] + b
    z3 = weight[0] * X[2][0] + weight[1] * X[2][1] + b
    z4 = weight[0] * X[3][0] + weight[1] * X[3][1] + b
    
    AND 
    
    w1, w2 = weights
"""
local__del_z__by__del_w = X_O
local__del_z__by__del_w

array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

In [57]:
"""
def del_z__by__del_weight0(X):
    
    del_z__by__del_weight0 OR del_z__by__del_w0
    
    As we know that at Z node linear equation would be
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]
    
    Therefore it's differentiation w.r.t weight[0] would be 
        [X[0][0]        X[0][1]        X[0][2]        X[0][3]]

    

    return X[0, :]

grad_z__by__del_weight0 = del_z__by__del_weight0(X)
grad_z__by__del_weight0
"""

"\ndef del_z__by__del_weight0(X):\n    \n    del_z__by__del_weight0 OR del_z__by__del_w0\n    \n    As we know that at Z node linear equation would be\n    [\n        weight[0] * X[0][0] + weight[1] * X[1][0] + b\n        weight[0] * X[0][1] + weight[1] * X[1][1] + b\n        weight[0] * X[0][2] + weight[1] * X[1][2] + b\n        weight[0] * X[0][3] + weight[1] * X[1][3] + b\n    ]\n    \n    Therefore it's differentiation w.r.t weight[0] would be \n        [X[0][0]        X[0][1]        X[0][2]        X[0][3]]\n\n    \n\n    return X[0, :]\n\ngrad_z__by__del_weight0 = del_z__by__del_weight0(X)\ngrad_z__by__del_weight0\n"

In [58]:
"""
def del_z__by__del_weight1(X):

    del_z__by__del_weight0 OR del_z__by__del_w1
    
    As we know that at Z node linear equation would be
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]
    
    Therefore it's differentiation w.r.t weight[1] would be 
        [X[1][0]        X[1][1]        X[1][2]        X[1][3]]



    return X[1, :]

grad_z__by__del_weight1 = del_z__by__del_weight1(X)
grad_z__by__del_weight1
"""

"\ndef del_z__by__del_weight1(X):\n\n    del_z__by__del_weight0 OR del_z__by__del_w1\n    \n    As we know that at Z node linear equation would be\n    [\n        weight[0] * X[0][0] + weight[1] * X[1][0] + b\n        weight[0] * X[0][1] + weight[1] * X[1][1] + b\n        weight[0] * X[0][2] + weight[1] * X[1][2] + b\n        weight[0] * X[0][3] + weight[1] * X[1][3] + b\n    ]\n    \n    Therefore it's differentiation w.r.t weight[1] would be \n        [X[1][0]        X[1][1]        X[1][2]        X[1][3]]\n\n\n\n    return X[1, :]\n\ngrad_z__by__del_weight1 = del_z__by__del_weight1(X)\ngrad_z__by__del_weight1\n"

In [59]:
def compute__del_z__by__del_bias():
    """
    compute__del_z__by__del_bias OR del_z__by__del_b
    
    As we know that at Z node linear equation would be (In general)
    z = np.mdot(weight, X) + b 
    
    Therefore it's differentiation w.r.t bias would be 1

    """

    return np.ones(1)

In [60]:
local__del_z__by__del_bias = compute__del_z__by__del_bias()
local__del_z__by__del_bias

array([1.])

In [61]:

"""
grad_cost__by__del_weight = np.dot( final__del_cost__by__del_z, X_O)
grad_cost__by__del_weight
"""
grad_cost__by__del_weight = local__del_z__by__del_w

In [62]:
"""
    Other way to compute :

    grad_cost__by__del_weight0
    and
    grad_cost__by__del_weight1
"""

"""
grad_cost__by__del_weight0 = np.sum(np.multiply(final__del_cost__by__del_z, grad_z__by__del_weight0))
print(grad_cost__by__del_weight0)

grad_cost__by__del_weight1 = np.sum(np.multiply(final__del_cost__by__del_z, grad_z__by__del_weight1))
print(grad_cost__by__del_weight1)
"""

'\ngrad_cost__by__del_weight0 = np.sum(np.multiply(final__del_cost__by__del_z, grad_z__by__del_weight0))\nprint(grad_cost__by__del_weight0)\n\ngrad_cost__by__del_weight1 = np.sum(np.multiply(final__del_cost__by__del_z, grad_z__by__del_weight1))\nprint(grad_cost__by__del_weight1)\n'

### [1.3] Update weights

In [63]:
print(weights)

[0.1 0.6]


In [64]:
print(local__del_z__by__del_w)

[[0 0]
 [0 1]
 [1 0]
 [1 1]]


In [65]:
"""
lr: Learning rate
"""
lr = 1

In [70]:
lr__mul__weights = np.multiply(local__del_z__by__del_w, 1)
print(lr__mul__weights)

[[0 0]
 [0 1]
 [1 0]
 [1 1]]


In [67]:
updated_weights = np.subtract(weights, lr__mul__weights)
updated_weights

array([[ 0.1,  0.6],
       [ 0.1, -0.4],
       [-0.9,  0.6],
       [-0.9, -0.4]])

### [1.4] Update bias

In [68]:
"""
final__del_cost__by__del_bias ($cost/$b)
$cost/$b: Change in cost function w.r.t bias
"""
final__del_cost__by__del_bias = np.sum(np.multiply( final__del_cost__by__del_z, local__del_z__by__del_bias))
final__del_cost__by__del_bias

-0.03702352612570836

In [69]:
updated_bias = bias - np.multiply(lr, final__del_cost__by__del_bias)
updated_bias

0.03702352612570836