In [32]:
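# Reference: "Nothing but NumPy: Understanding & Creating Neural Networks with Computational Graphs from Scratch"
# https://medium.com/towards-artificial-intelligence/nothing-but-numpy-understanding-creating-neural-networks-with-computational-graphs-from-scratch-6299901091b0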

## [1] Basic Neural network

In [1]:
import numpy as np
import pandas as pd

<img src="images/img_01_nn.png">

In [33]:
#https://medium.com/towards-artificial-intelligence/nothing-but-numpy-understanding-creating-neural-networks-with-computational-graphs-from-scratch-6299901091b0

In [2]:
# Truth table of a logical OR: columns are input x1, input x2, expected output.
data = np.array(
    [
        [0, 0, 0],
        [0, 1, 1],
        [1, 0, 1],
        [1, 1, 1],
    ]
)

In [35]:
"""
weights: Contains weight whose value neural network would learn.
"""
# Two entries: one weight per input feature column.
weights = np.array([0.1, 0.6])

print(weights.shape)
weights

In [None]:
"""
Initially set bias to Zero
"""
# A single scalar bias; it is updated later from the bias gradient.
bias = 0

In [4]:
"""
Just extracting input features from data points
"""
# All rows, first two columns; the last column is left out because it holds the expected output.
X_O = data[:, :2]
print(X_O.shape)
X_O

(4, 2)


array([[0, 0],
       [0, 1],
       [1, 0],
       [1, 1]])

In [5]:
"""
The next step performs a dot product with the weights, so the dimensions must line up:
we therefore take the transpose of X_O.
"""
# (4, 2) -> (2, 4): one row per feature, one column per example.
X = X_O.T
print(X.shape)
X

(2, 4)


array([[0, 0, 1, 1],
       [0, 1, 0, 1]])

In [6]:
"""
Extracting output values corresponding to input features
"""
# The last column of every row is the expected output for that example.
Y =  data[:, -1]
Y

array([0, 1, 1, 1])

### [1.1] Forward propagation

In [36]:
"""
Here we compute the linear combination for all examples of the training dataset simultaneously
(one example contains all of its input features).
"""
# Z = w . X + b for all examples at once: (2,) dot (2, 4) -> (4,).
# The bias must be part of the forward pass; otherwise a later bias update
# could never change the predictions. With bias = 0 the values are unchanged.
Z = np.dot(weights, X) + bias
print(Z.shape)
Z

(4,)


array([0. , 0.6, 0.1, 0.7])

In [8]:
def compute_sigmoid(z):
    """
    Squash the linear output z element-wise into the range (0, 1).
    Note: Parentheses imply exclusive boundary values (mathematically; for
    extreme inputs the floating-point result saturates to 0.0 or 1.0).

    z: Linear equation (scalar or array).
    For this neural network it would be
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]

    Returns: sigmoid(z) = 1 / (1 + exp(-z)), with the same shape as z
    (a NumPy scalar when z is a scalar).

    The naive formula evaluates exp(-z), which overflows for large negative z
    and makes NumPy emit a RuntimeWarning. The form used here only ever
    evaluates exp(-|z|), which cannot overflow.
    """

    z = np.asarray(z, dtype=float)

    # exp(-|z|) always lies in (0, 1].
    exp_neg_abs = np.exp(-np.abs(z))

    # z >= 0:  1 / (1 + exp(-z))          (identical to the naive formula)
    # z <  0:  exp(z) / (1 + exp(z))      (same value, algebraically rewritten)
    numerator = np.where(z >= 0, 1.0, exp_neg_abs)
    sig = numerator / (1.0 + exp_neg_abs)

    # Indexing with () yields a NumPy scalar for scalar input and leaves arrays as arrays.
    return sig[()]

In [9]:
def compute_cost(y, y_hat, m, const = 2):
    """
    Average squared-error cost over all examples.

    All examples are evaluated simultaneously, so this is a cost function
    (aggregate over the batch) rather than a per-example loss function.

    y:     expected outputs.
    y_hat: predicted outputs.
    m:     value the summed cost is divided by (callers pass the number of examples).
    const: divisor applied to every squared error (default 2).

    Returns: sum((y - y_hat)^2 / const) / m
    """

    squared_error = np.square(np.subtract(y, y_hat))
    per_example_cost = squared_error / const

    return np.sum(per_example_cost) / m

In [40]:
def del_cost__by__del_y_hat(y, y_hat, m):
    """
    Gradient of the cost function w.r.t. y_hat.

    The cost function is (y - y_hat)^2 / (2 * m), therefore its gradient is

    (-1 / m)(y - y_hat)

    Returns: one gradient value per example (same shape as y - y_hat).
    """

    residual = np.subtract(y, y_hat)

    return np.negative(residual) / m

In [11]:
# Forward pass, activation step: one prediction per example.
y_hat = compute_sigmoid(Z)
y_hat

array([0.5       , 0.64565631, 0.52497919, 0.66818777])

In [12]:
# m is passed explicitly as the number of predictions.
avg_cost = compute_cost(Y, y_hat, len(y_hat))
avg_cost

0.08891294752305501

### [1.2] Backward propagation

In [41]:
# First link of the chain rule: d(cost)/d(y_hat), one value per example.
grad_cost = del_cost__by__del_y_hat(Y, y_hat, len(y_hat))
grad_cost

array([ 0.125     , -0.08858592, -0.1187552 , -0.08295306])

In [14]:
def del_y_hat__by__del_z(y_hat):
    """
    Local gradient of the sigmoid node: d(y_hat)/d(z).

    Since y_hat = sigmoid(z),
        d(sigmoid(z))/d(z) = sigmoid(z) * (1 - sigmoid(z)) = y_hat * (1 - y_hat)

    y_hat: sigmoid output(s).

    Returns: y_hat * (1 - y_hat), element-wise.
    """

    return np.multiply(y_hat, np.subtract(1, y_hat))

# Local gradient at the Z node, evaluated on the current predictions.
grad_y_hat__by__del_z = del_y_hat__by__del_z(y_hat)

In [15]:
def final_grad_at__z(grad_cost, grad_y_hat__by__del_z):
    """
    Final gradient at the Z node (chain rule): the incoming gradient from the
    cost multiplied element-wise by the local gradient at Z.

    grad_cost:             gradient of the cost function w.r.t. y_hat.
    grad_y_hat__by__del_z: local gradient at node Z, d(y_hat)/d(z).

    Returns: element-wise product of the two gradients.
    """
    return np.multiply(grad_cost, grad_y_hat__by__del_z)

In [16]:
# Chain rule: d(cost)/d(z) = d(cost)/d(y_hat) * d(y_hat)/d(z), element-wise per example.
grad_cost__by__del_z = final_grad_at__z(grad_cost, grad_y_hat__by__del_z)
grad_cost__by__del_z

array([ 0.03125   , -0.02026706, -0.0296147 , -0.01839176])

In [17]:
def del_z__by__del_weight0(X):
    """
    d(z)/d(weight[0]), a.k.a. del_z__by__del_w0.

    At the Z node the linear equations are
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]

    Differentiating each of them w.r.t. weight[0] leaves only its multiplier:
        [X[0][0]        X[0][1]        X[0][2]        X[0][3]]

    X: 2-D array indexed as X[feature, example].

    Returns: row 0 of X.
    """

    first_feature_row = X[0, :]
    return first_feature_row

In [18]:
# d(z)/d(weight[0]) is the first feature row of X.
grad_z__by__del_weight0 = del_z__by__del_weight0(X)
grad_z__by__del_weight0

array([0, 0, 1, 1])

In [19]:
def del_z__by__del_weight1(X):
    """
    d(z)/d(weight[1]), a.k.a. del_z__by__del_w1.

    At the Z node the linear equations are
    [
        weight[0] * X[0][0] + weight[1] * X[1][0] + b
        weight[0] * X[0][1] + weight[1] * X[1][1] + b
        weight[0] * X[0][2] + weight[1] * X[1][2] + b
        weight[0] * X[0][3] + weight[1] * X[1][3] + b
    ]

    Differentiating each of them w.r.t. weight[1] leaves only its multiplier:
        [X[1][0]        X[1][1]        X[1][2]        X[1][3]]

    X: 2-D array indexed as X[feature, example].

    Returns: row 1 of X.
    """

    second_feature_row = X[1, :]
    return second_feature_row

In [20]:
# d(z)/d(weight[1]) is the second feature row of X.
grad_z__by__del_weight1 = del_z__by__del_weight1(X)
grad_z__by__del_weight1

array([0, 1, 0, 1])

In [21]:
def del_z__by__del_bias():
    """
    d(z)/d(bias), a.k.a. del_z__by__del_b.

    At the Z node the linear equation is
    [np.dot(weight, X) + b]

    The bias enters additively, so the derivative w.r.t. the bias is 1.

    Returns: a float array of shape (1,) holding 1.0.
    """

    return np.array([1.0])

In [22]:
# d(z)/d(bias): a single 1.0 that is later multiplied with the per-example gradients.
grad_z__by__del_bias = del_z__by__del_bias()
grad_z__by__del_bias

array([1.])

In [23]:
# Sums d(cost)/d(z) * feature over the examples for both weights at once:
# (4,) dot (4, 2) -> (2,). Note that the untransposed X_O is used here.
grad_cost__by__del_weight = np.dot( grad_cost__by__del_z, X_O)
grad_cost__by__del_weight

array([-0.04800646, -0.03865882])

In [24]:
"""
    Alternative, per-weight way to compute (disabled: the code is kept inside the string literal below):

    grad_cost__by__del_weight0
    and
    grad_cost__by__del_weight1
"""

"""
grad_cost__by__del_weight0 = np.sum(np.multiply(grad_cost__by__del_z, grad_z__by__del_weight0))
print(grad_cost__by__del_weight0)

grad_cost__by__del_weight1 = np.sum(np.multiply(grad_cost__by__del_z, grad_z__by__del_weight1))
print(grad_cost__by__del_weight1)
"""

'\ngrad_cost__by__del_weight0 = np.sum(np.multiply(grad_cost__by__del_z, grad_z__by__del_weight0))\nprint(grad_cost__by__del_weight0)\n\ngrad_cost__by__del_weight1 = np.sum(np.multiply(grad_cost__by__del_z, grad_z__by__del_weight1))\nprint(grad_cost__by__del_weight1)\n'

### [1.3] Update weights

In [25]:
# Current weights, before the update.
print(weights)

[0.1 0.6]


In [26]:
# Gradient of the cost w.r.t. each weight.
print(grad_cost__by__del_weight)

[-0.04800646 -0.03865882]


In [None]:
"""
lr: Learning rate
"""
# Multiplies the gradient in the update step; 1 means the raw gradient is subtracted unchanged.
lr = 1

In [27]:
# Scale the weight gradient by the learning rate. This used to multiply by a
# hard-coded 1, so changing `lr` had no effect on the weight update.
weights__lr = np.multiply(grad_cost__by__del_weight, lr)
print(weights__lr)

[-0.04800646 -0.03865882]


In [28]:
# Gradient-descent step: move the weights against the (scaled) gradient.
updated_weights = np.subtract(weights, weights__lr)
updated_weights

array([0.14800646, 0.63865882])

### [1.4] Update bias

In [29]:
"""
$cost/$b: Change in cost function w.r.t bias
"""
# d(z)/d(b) is 1 for every example, so this reduces to the sum of d(cost)/d(z) over the examples.
grad_cost__by__del_bias = np.sum(np.multiply( grad_cost__by__del_z, grad_z__by__del_bias))
grad_cost__by__del_bias

-0.03702352612570836

In [30]:
# Gradient-descent step for the scalar bias: subtract lr times its gradient.
updated_bias = bias - np.multiply(lr, grad_cost__by__del_bias)
updated_bias

0.03702352612570836