## Import Python Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt

## Prepare the data

In [None]:
# Toy data set: a single scalar input and the scalar target it should map to.
X_train = np.array([1.0])
Y_train = np.array([-2.0])

## Build the artificial neural-network

In [None]:
# One entry per layer, in forward order: 1 input -> 2 units -> 2 units -> 1 output.
# "relu" layers are rectified; "none" means the layer output is left linear.
ANN_ARCHITECTURE = [
    dict(input_dim=1, output_dim=2, activation="relu"),
    dict(input_dim=2, output_dim=2, activation="relu"),
    dict(input_dim=2, output_dim=1, activation="none"),
]

In [None]:
# Hand-picked starting weights ("W<l>") and biases ("b<l>") for each layer l,
# fixed so that every run of the notebook produces the same numbers.
# Each W<l> has shape (output_dim, input_dim); each b<l> has shape (output_dim, 1).
PSEUDO_RANDOM_PARAM_VALUES = {
    "W1": np.array([[0.01], [-0.03]]),
    "b1": np.array([[0.02], [-0.04]]),
    "W2": np.array([[0.05, -0.06], [-0.07, 0.08]]),
    "b2": np.array([[0.09], [-0.10]]),
    "W3": np.array([[-0.11, -0.12]]),
    "b3": np.array([[-0.13]]),
}

In [None]:
def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and Z."""
    rectified = np.maximum(0, Z)
    return rectified

def relu_backward(dA, Z):
    """Propagate a gradient backwards through a ReLU.

    Returns a copy of dA in which every entry whose matching Z entry is
    non-positive has been set to 0; dA itself is left untouched.
    """
    grad = np.array(dA, copy=True)
    blocked = Z <= 0  # positions where the ReLU was inactive
    grad[blocked] = 0
    return grad

### Single layer forward propagation step

$$\boldsymbol{Z}^{[l]} = \boldsymbol{W}^{[l]} \cdot \boldsymbol{A}^{[l-1]} + \boldsymbol{b}^{[l]}$$

$$\boldsymbol{A}^{[l]} = g^{[l]}(\boldsymbol{Z}^{[l]})$$

In [None]:
def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation):
    """Run the forward step of one layer.

    Computes Z = W_curr . A_prev + b_curr and applies the requested
    activation ("relu", or "none" for a linear layer).

    Returns the pair (A, Z): the layer's activation and the pre-activation
    value. For "none" both elements are the same array.

    Raises Exception for any other activation name.
    """
    # affine part of the layer: input to the activation function
    pre_activation = np.dot(W_curr, A_prev) + b_curr

    # linear layer: the activation is the pre-activation itself
    if activation == "none":
        return pre_activation, pre_activation

    if activation != "relu":
        raise Exception('Non-supported activation function')

    return relu(pre_activation), pre_activation

In [None]:
def full_forward_propagation(X, params_values, ann_architecture):
    """Push X through every layer of the network.

    X is used as the activation of layer 0. Layers are numbered from 1;
    layer l reads its parameters from params_values["W<l>"] and
    params_values["b<l>"] and its activation name from
    ann_architecture[l - 1]["activation"].

    Returns (prediction, memory) where memory maps "A<l-1>" to the input of
    layer l and "Z<l>" to its pre-activation value, i.e. everything the
    backward pass needs.
    """
    cache = {}
    activation_out = X

    for number, spec in enumerate(ann_architecture, start=1):
        # the previous layer's output feeds the current layer
        layer_input = activation_out

        activation_name = spec["activation"]
        weights = params_values[f"W{number}"]
        biases = params_values[f"b{number}"]

        activation_out, pre_activation = single_layer_forward_propagation(
            layer_input, weights, biases, activation_name
        )

        # keep the intermediate values for the backward step
        cache[f"A{number - 1}"] = layer_input
        cache[f"Z{number}"] = pre_activation

    return activation_out, cache

In [None]:
def get_cost_value(Ŷ, Y):
    """Return the signed error Ŷ - Y with all length-1 axes squeezed away.

    This cost function works for 1-dimension only.
    TODO: use a quadratic function instead.
    """
    return np.squeeze(Ŷ - Y)

![Network architecture](https://miro.medium.com/max/1022/1*fX0kutywUnSTlDVS-yKdZA.png)

**Figure**: The four fundamental equations of backpropagation (BP1–BP4), applied at each layer. For more detail, refer to http://neuralnetworksanddeeplearning.com/chap2.html

In [None]:
def single_layer_backward_propagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation, layer, debug=False):
    """Run the backward step of one layer (equations BP1/BP2, BP3 and BP4).

    Parameters
    ----------
    dA_curr : gradient of the cost with respect to this layer's activation.
    W_curr : weight matrix of this layer.
    b_curr : bias vector of this layer (not used in the computation; kept so
        the signature mirrors the forward step).
    Z_curr : pre-activation value stored during the forward pass.
    A_prev : input of this layer stored during the forward pass.
    activation : "relu", or "none" for a linear layer.
    layer : layer number, used only to label the debug output.
    debug : when True, print every intermediate value.

    Returns
    -------
    (dA_prev, dW_curr, db_curr): the gradient to hand to the previous layer,
    the gradient of the weights and the gradient of the biases.

    Raises
    ------
    Exception
        If `activation` is neither "none" nor "relu".
    """
    # end of BP1 or BP2
    if activation == "none":  # i.e. no σ in the layer
        dZ_curr = dA_curr

    else:  # i.e. σ in the layer
        if activation == "relu":
            backward_activation_func = relu_backward
        else:
            raise Exception('activation function not supported.')

        # calculation of the activation function derivative
        dZ_curr = backward_activation_func(dA_curr, Z_curr)
        if debug:
            print('Step_4: layer',layer,'dZ=', dZ_curr.tolist())

    # BP3: derivative of the matrix W
    dW_curr = np.dot(dZ_curr, A_prev.T)
    if debug:
        # tolist() allows printing a numpy array on a single debug line
        print('Step_4: layer',layer,'dW=dZ.A_prev.T=', dZ_curr.tolist(), '.', A_prev.T.tolist())
        print('                dW=', dW_curr.tolist())

    # BP4: derivative of the vector b (summed over the sample axis)
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True)
    if debug:
        print('Step_4: layer',layer,'db=', db_curr.tolist())

    # beginning of BP2: derivative of the matrix A_prev
    dA_prev = np.dot(W_curr.T, dZ_curr)
    if debug:
        print('Step_4: layer',layer,'dA_prev=W.T.dZ=', W_curr.T.tolist(), '.', dZ_curr.tolist())
        print('                dA_prev=', dA_prev.tolist())

    return dA_prev, dW_curr, db_curr

In [None]:
def full_backward_propagation(Ŷ, cost, memory, params_values, ann_architecture, debug=False):
    """Back-propagate the cost through every layer, last layer first.

    Parameters
    ----------
    Ŷ : network prediction; only its shape is used here.
    cost : error signal that seeds the backward pass; it is reshaped to
        Ŷ.shape and used as the gradient at the output layer.
    memory : dict holding "A<l-1>" and "Z<l>" for every layer l, as stored
        during the forward pass.
    params_values : dict holding "W<l>" and "b<l>" for every layer l.
    ann_architecture : list of layer descriptions (each with an
        "activation" entry), in forward order.
    debug : forwarded to the per-layer step to print intermediate values.

    Returns
    -------
    dict mapping "dW<l>" and "db<l>" to the gradients of each layer. The
    gradients are sums over the batch; no division by the batch size is
    done here.
    """
    grads_values = {}

    # initiation of gradient descent algorithm
    # i.e. gradient of the cost at the output (beginning of BP1)
    dA_prev = cost.reshape(Ŷ.shape)

    for layer_idx_prev, layer in reversed(list(enumerate(ann_architecture))):
        # we number network layers from 1
        layer_idx_curr = layer_idx_prev + 1

        # the gradient produced by the layer above is this layer's input
        dA_curr = dA_prev

        dA_prev, dW_curr, db_curr = single_layer_backward_propagation(
            dA_curr,
            params_values["W" + str(layer_idx_curr)],
            params_values["b" + str(layer_idx_curr)],
            memory["Z" + str(layer_idx_curr)],
            memory["A" + str(layer_idx_prev)],
            layer["activation"],
            layer_idx_curr,
            debug,
        )

        grads_values["dW" + str(layer_idx_curr)] = dW_curr
        grads_values["db" + str(layer_idx_curr)] = db_curr

    return grads_values

For each $l = L, L-1, \ldots, 2$:
* **update** the weights according to the rule $w^l \rightarrow w^l-\frac{\eta}{m} \sum_x \delta^{x,l} (a^{x,l-1})^T$

* **update** the biases according to the rule $b^l \rightarrow b^l-\frac{\eta}{m} \sum_x \delta^{x,l}$

In [None]:
def update(params_values, grads_values, ann_architecture, learning_rate, m):
    """Apply one gradient-descent step to every layer's weights and biases.

    For each layer l (numbered from 1) the entries "W<l>" and "b<l>" of
    params_values are decreased in place by learning_rate * gradient / m,
    the gradients being read from grads_values["dW<l>"] / ["db<l>"].

    Returns the same params_values dict that was passed in.
    """
    for number, _ in enumerate(ann_architecture, start=1):
        for kind in ("W", "b"):
            key = f"{kind}{number}"
            params_values[key] -= learning_rate * grads_values["d" + key] / m

    return params_values

In [None]:
def train(X, Y, ann_architecture, params_values, learning_rate, debug=False, callback=None):
    """Run one forward pass, one backward pass and one parameter update.

    Parameters
    ----------
    X : batch of inputs laid out as (features, samples), which is the layout
        the forward pass multiplies the weight matrices with.
    Y : targets for the batch, laid out like the network prediction.
    ann_architecture : list of layer descriptions, in forward order.
    params_values : dict of "W<l>" / "b<l>" arrays; updated by this call.
    learning_rate : step size of the gradient-descent update.
    debug : when True, print the intermediate values of every step.
    callback : currently unused; accepted for interface compatibility.

    Returns
    -------
    The updated params_values dict.
    """
    # step forward - prediction plus the values needed by the backward step
    Ŷ, memory = full_forward_propagation(X, params_values, ann_architecture)
    if debug:
        print('Step_2: memory=', memory)
        print('Step_2: Ŷ=', Ŷ)

    # error between prediction and target
    cost = get_cost_value(Ŷ, Y)
    if debug:
        # NOTE: the %.5f format needs a scalar cost, i.e. a single-sample batch
        print('Step_3: cost=%.5f' % cost)

    # step backward - calculating gradient
    grads_values = full_backward_propagation(Ŷ, cost, memory, params_values, ann_architecture, debug)

    # updating model state
    # samples are the columns of X, so the batch size is the second dimension
    m = X.shape[1]
    params_values = update(params_values, grads_values, ann_architecture, learning_rate, m)
    if debug:
        print('Step_5: params_values=', params_values)
    return params_values

In [None]:
# Turn the flat sample vectors into column matrices with one row per sample.
X_train = np.reshape(X_train, (X_train.shape[0], 1))
Y_train = np.reshape(Y_train, (Y_train.shape[0], 1))

## Train the artificial neural-network model

In [None]:
debug = True

# Training
ann_architecture = ANN_ARCHITECTURE
# Copy every array, not just the dict: the update step subtracts in place,
# so a shallow dict.copy() would let training overwrite the initial values
# stored in PSEUDO_RANDOM_PARAM_VALUES and make re-runs start elsewhere.
param_values = {name: value.copy() for name, value in PSEUDO_RANDOM_PARAM_VALUES.items()}

if debug:
    print('X_train:', X_train)
    print('Y_train:', Y_train)
    print('ann_architecture:', ANN_ARCHITECTURE)

# implementation of the stochastic gradient descent
EPOCHS = 2
samples_per_batch = 1
learning_rate = 0.01

for epoch in range(EPOCHS):

    if debug:
        print('##### EPOCH %d #####' % epoch)
        print('Step_0: param_values:', param_values)

    # only complete batches are processed; a trailing partial batch is dropped
    for i in range(X_train.shape[0] // samples_per_batch):
        si = i * samples_per_batch
        sj = (i + 1) * samples_per_batch

        if debug:
            print('Step_1: X_train[%d,%d]=%s' % (si, sj, X_train[si:sj]))

        # rows are samples in X_train/Y_train; train() expects them as columns
        param_values = train(
            np.transpose(X_train[si:sj]),
            np.transpose(Y_train[si:sj]),
            ann_architecture,
            param_values,
            learning_rate,
            debug)

X_train: [[1.]]
Y_train: [[-2.]]
ann_architecture: [{'input_dim': 1, 'output_dim': 2, 'activation': 'relu'}, {'input_dim': 2, 'output_dim': 2, 'activation': 'relu'}, {'input_dim': 2, 'output_dim': 1, 'activation': 'none'}]
##### EPOCH 0 #####
Step_0: param_values: {'W1': array([[ 0.01],
       [-0.03]]), 'b1': array([[ 0.02],
       [-0.04]]), 'W2': array([[ 0.05, -0.06],
       [-0.07,  0.08]]), 'b2': array([[ 0.09],
       [-0.1 ]]), 'W3': array([[-0.11, -0.12]]), 'b3': array([[-0.13]])}
Step_1: X_train[0,1]=[[1.]]
Step_2: memory=%s {'A0': array([[1.]]), 'Z1': array([[ 0.03],
       [-0.07]]), 'A1': array([[0.03],
       [0.  ]]), 'Z2': array([[ 0.0915],
       [-0.1021]]), 'A2': array([[0.0915],
       [0.    ]]), 'Z3': array([[-0.140065]])}
Step_2: Ŷ= [[-0.140065]]
Step_3: cost=1.85994
Step_4: layer 3 dW=dZ.A_prev.T= [[1.8599350000000001]] . [[0.0915, 0.0]]
                dW= [[0.17018405250000002, 0.0]]
Step_4: layer 3 db= [[1.8599350000000001]]
Step_4: layer 3 dA_prev=W.T.dZ= [[