# Implementation of ANN from scratch using only Numpy...

In [1]:
# Only thing we need is Numpy...
import numpy as np

In [2]:
# Generate a Random seed...
# np.andom.seed() provides an essential input that enables NumPy
# to generate pseudo-random numbers for random processes.
np.random.seed(1)

In [3]:
# Sigmoidal Activation Function...

def sigmoid(x):
    return 1 / (1 + np.exp( -1 * x))

# To simplify this, we will represent that differential equation as d_sigmoid. 

def d_sigmoid(x):
    return sigmoid(x) * (1- sigmoid(x))

In [4]:
# Some sample data for our ANN.
# We take input value 'x' and
# target value 'y'

x = np.array([
    [0,0,1],
    [0,1,1],
    [1,0,1],
    [1,1,1]
])

y = np.array([
    [0],
    [0],
    [0],
    [1]
])

In [5]:
# Declare Hyper Parameters...

# Initialize weight(s)...
w1 = np.random.randn(3,1)

# Set the number of epoch
numer_of_epoch = 100

# Set the learning rate
learning_rate = 1

In [6]:
# Here is the set of training opertions...
for iter in range(numer_of_epoch):

    # 1. Make the Dot Product operation
    layer_1 = x.dot(w1)
    layer_1_act = sigmoid(layer_1)

    # loss function - (Mean Square Error)MSE 0.5
    loss = np.square(layer_1_act - y) / (len(layer_1_act)  * 2)

    print ("Current Epoch : ",iter ," current loss :", loss.sum())

    # Stocastic Gradient Descent(SGD) - BATCH
    # this is the training algorithm here...
    grad_1_part_1 = (layer_1_act - y) / len(layer_1_act)
    grad_1_part_2 = d_sigmoid(layer_1)
    grad_1_part_3 = x
    grad_1 = grad_1_part_3.T.dot(grad_1_part_1 * grad_1_part_2)

    # Weight Update
    w1 -=  learning_rate * grad_1

Current Epoch :  0  current loss : 0.11293240452690462
Current Epoch :  1  current loss : 0.11061045469556602
Current Epoch :  2  current loss : 0.10841998634013196
Current Epoch :  3  current loss : 0.10634746444246652
Current Epoch :  4  current loss : 0.10438119645547413
Current Epoch :  5  current loss : 0.10251107800511905
Current Epoch :  6  current loss : 0.1007283585804454
Current Epoch :  7  current loss : 0.09902543244860601
Current Epoch :  8  current loss : 0.09739565671979347
Current Epoch :  9  current loss : 0.09583319624216735
Current Epoch :  10  current loss : 0.09433289358769031
Current Epoch :  11  current loss : 0.09289016158456455
Current Epoch :  12  current loss : 0.09150089548643037
Current Epoch :  13  current loss : 0.0901614018051529
Current Epoch :  14  current loss : 0.08886834096772975
Current Epoch :  15  current loss : 0.08761868121025616
Current Epoch :  16  current loss : 0.08640966143605569
Current Epoch :  17  current loss : 0.08523876110085096
Curr

In [7]:
# Here we declare a single layer NN...

layer_1 = x.dot(w1)
layer_1_act = sigmoid(layer_1)

In [8]:
# what we get?
print ("\n\nFinal : " ,layer_1_act[:,-1])
print ("Final Round: " ,np.round(layer_1_act[:,-1]))
print ("Ground Truth : ",y[:,-1])
print ("W1 : ",w1[:,-1])



Final :  [0.09554338 0.24983353 0.3376184  0.61640766]
Final Round:  [0. 0. 0. 1.]
Ground Truth :  [0 0 0 1]
W1 :  [ 1.57382846  1.1482537  -2.24775401]


-----------------------That's it for this cycle-----------------------

## Now what if we update the weights while training?

In [9]:
# Declare Hyper Parameters...

# Initialize weight(s)...
w1 = np.random.randn(3,1)

# Set the number of epoches
numer_of_epoch = 100

# Set the learning rate
learning_rate = 10

In [10]:
# Here is the set of training opertions with weight updation included...

for iter in range(numer_of_epoch):

    # 1. Make the Dot Product operation
    layer_1 = x.dot(w1)
    layer_1_act = sigmoid(layer_1)

    # loss function - Mean Square Error(MSE) 0.5
    loss = np.square(layer_1_act - y) / (len(layer_1_act)  * 2)

    print ("\nCurrent Epoch : ",iter ," Current Accuracy : ",1- loss.sum()," current loss :", loss.sum()," Current Learning Rate: ",learning_rate)

    # Stocastic Gradient Descent(SGD) - BATCH
    # this is the training algorithm here...
    grad_1_part_1 = (layer_1_act - y) / len(layer_1_act)
    grad_1_part_2 = d_sigmoid(layer_1)
    grad_1_part_3 = x
    grad_1 = grad_1_part_3.T.dot(grad_1_part_1 * grad_1_part_2)

    # Weight Update
    w1 -=  learning_rate * grad_1
    
    # To keep things easy to understand, we have hardcoaded
    # the steps where the learning rate is changed...
    # Traditionally it is to be updated automatically if no change is detected in
    # Accuracy and loss over a number of epoch...
    if iter == 50 :
        learning_rate = 1
        
    if iter == 70 :
        learning_rate = 0.1


Current Epoch :  0  Current Accuracy :  0.8873114091385832  current loss : 0.11268859086141686  Current Learning Rate:  10

Current Epoch :  1  Current Accuracy :  0.891453031946667  current loss : 0.10854696805333297  Current Learning Rate:  10

Current Epoch :  2  Current Accuracy :  0.8981444126739027  current loss : 0.10185558732609733  Current Learning Rate:  10

Current Epoch :  3  Current Accuracy :  0.909008927315297  current loss : 0.09099107268470297  Current Learning Rate:  10

Current Epoch :  4  Current Accuracy :  0.9241317597903226  current loss : 0.07586824020967738  Current Learning Rate:  10

Current Epoch :  5  Current Accuracy :  0.937033494263932  current loss : 0.06296650573606796  Current Learning Rate:  10

Current Epoch :  6  Current Accuracy :  0.943871370195538  current loss : 0.05612862980446194  Current Learning Rate:  10

Current Epoch :  7  Current Accuracy :  0.9487263982900559  current loss : 0.051273601709944105  Current Learning Rate:  10

Current Ep

In [11]:
# Here we declare a single layer NN...

layer_1 = x.dot(w1)
layer_1_act = sigmoid(layer_1)

In [12]:
# what we get this time?
print ("\n\nFinal : " ,layer_1_act[:,-1])
print ("Final Round: " ,np.round(layer_1_act[:,-1]))
print ("Ground Truth : ",y[:,-1])
print ("W1 : ",w1[:,-1])



Final :  [0.00906602 0.16232736 0.1620407  0.80376285]
Final Round:  [0. 0. 0. 1.]
Ground Truth :  [0 0 0 1]
W1 :  [ 3.05099269  3.05310233 -4.69411465]


### Does updating weights and learning rates made a difference?
### Let's compare the outputs of both cycles...

-----------------------That's it for this cycle-----------------------

## But what if we extend the same approach to 2 layers...
## Only one way to find out...

In [13]:
# Generate a new Random seed...
np.random.seed(1234)

In [14]:
# Lets use 'Tan-hyperbolic' activation function this time...

def tanh(x):
    return np.tanh(x)

def d_tanh(x):
    return 1 - tanh(x) ** 2

In [15]:
# Initialize the weights for layers 'l1' & 'l2'...

w1 = np.random.randn(3,5)
w2 = np.random.randn(5,1)

print('w1 = ', w1)
print('\nw2 = ', w2)

w1 =  [[ 0.47143516 -1.19097569  1.43270697 -0.3126519  -0.72058873]
 [ 0.88716294  0.85958841 -0.6365235   0.01569637 -2.24268495]
 [ 1.15003572  0.99194602  0.95332413 -2.02125482 -0.33407737]]

w2 =  [[ 0.00211836]
 [ 0.40545341]
 [ 0.28909194]
 [ 1.32115819]
 [-1.54690555]]


In [16]:
# Declare Hyper Parameters...

# Set the number of epoches
numer_of_epoch = 250

# Set the learning rate
learning_rate = 10

In [17]:
# Training the network...

for iter in range(numer_of_epoch):

    layer_1 = x.dot(w1)
    layer_1_act = tanh(layer_1)

    layer_2 = layer_1_act.dot(w2)
    layer_2_act = tanh(layer_2)

    cost = np.square(layer_2_act - y).sum() / len(x)

    grad_2_part_1 = (2/len(x)) * (layer_2_act - y)
    grad_2_part_2 = d_tanh(layer_2)
    grad_2_part_3 = layer_1_act
    grad_2 =   grad_2_part_3.T.dot(grad_2_part_1 * grad_2_part_2) 

    grad_1_part_1 = (grad_2_part_1 * grad_2_part_2).dot(w2.T)
    grad_1_part_2 = d_tanh(layer_1)
    grad_1_part_3 = x
    grad_1 =   grad_1_part_3.T.dot(grad_1_part_1 * grad_1_part_2)
    
    # Update the weights...

    w1 -= 0.01*grad_1
    w2 -= 0.1*grad_2

In [18]:
# This is the Architecture...

layer_2 = layer_1_act.dot(w2)
layer_2_act = tanh(layer_2)
print(layer_2_act)


[[0.03024685]
 [0.05881942]
 [0.08153515]
 [0.71023473]]
