In [1]:
import numpy as np
from IPython.display import Image

In [2]:
# Network topology, listed from the input layer to the output layer.
# Each layer's "input_dim" equals the previous layer's "output_dim"; the
# hidden layers use ReLU and the single-unit output layer uses a sigmoid.
NN_ARCHITECTURE = [
    dict(input_dim=2, output_dim=25, activation="relu"),
    dict(input_dim=25, output_dim=50, activation="relu"),
    dict(input_dim=50, output_dim=50, activation="relu"),
    dict(input_dim=50, output_dim=25, activation="relu"),
    dict(input_dim=25, output_dim=1, activation="sigmoid"),
]

In [3]:
# Function for initializing our NN weights and biases
def init_layers(nn_architecture, seed=101):
    """Create randomly initialised weights and biases for every layer.

    Args:
        nn_architecture: Sequence of layer descriptions; each entry is read
            for its "input_dim" and "output_dim" keys.
        seed: Value passed to ``np.random.seed`` so that repeated calls
            produce the same parameters. Note that this reseeds NumPy's
            global random state.

    Returns:
        dict mapping "W1", "b1", "W2", "b2", ... (layers are numbered from 1)
        to arrays. "W{l}" has shape (output_dim, input_dim) and "b{l}" has
        shape (output_dim, 1); all values are standard-normal draws scaled
        by 0.1.
    """
    np.random.seed(seed)
    params_values = {}

    for layer_idx, layer in enumerate(nn_architecture, start=1):
        layer_input_size = layer["input_dim"]
        layer_output_size = layer["output_dim"]

        params_values[f"W{layer_idx}"] = np.random.randn(layer_output_size, layer_input_size) * 0.1
        # Lower-case "b" is the key full_forward_propagation looks up, and the
        # column shape lets the bias broadcast across the example axis of
        # W . A_prev, which is (output_dim, n_examples).
        params_values[f"b{layer_idx}"] = np.random.randn(layer_output_size, 1) * 0.1

    return params_values
        

In [4]:
# Activation functions and their respective derivatives:

def sigmoid(Z):
    """Element-wise logistic function, 1 / (1 + exp(-Z)).

    Evaluated as exp(-log(1 + exp(-Z))) through ``np.logaddexp`` so that
    large negative inputs do not overflow ``np.exp``; the direct formula
    emits a RuntimeWarning for them.

    Args:
        Z: NumPy array or NumPy scalar of pre-activation values.

    Returns:
        Values in [0, 1] with the same shape as ``Z``.
    """
    # logaddexp(0, -Z) == log(exp(0) + exp(-Z)) == log(1 + exp(-Z)),
    # computed without ever forming the huge intermediate exp(-Z).
    return np.exp(-np.logaddexp(0, -Z))

def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``."""
    rectified = np.maximum(0, Z)
    return rectified

def sigmoid_gradient(dA, Z):
    """Backward pass through a sigmoid activation.

    Multiplies the upstream gradient ``dA`` by the sigmoid derivative
    s * (1 - s), where s = sigmoid(Z).
    """
    activation = sigmoid(Z)
    upstream_scaled = dA * activation
    return upstream_scaled * (1 - activation)

def relu_gradient(dA, Z):
    """Backward pass through a ReLU activation.

    Returns a copy of the upstream gradient ``dA`` in which every position
    where ``Z <= 0`` has been set to zero; ``dA`` itself is left untouched.
    """
    grad = np.array(dA, copy=True)
    inactive = Z <= 0
    grad[inactive] = 0
    return grad

Single Layer Forward Propagation:

$$\boldsymbol{Z}^{[l]} = \boldsymbol{W}^{[l]} \cdot \boldsymbol{A}^{[l-1]} + \boldsymbol{b}^{[l]}$$


$$\boldsymbol{A}^{[l]} = g^{[l]}(\boldsymbol{Z}^{[l]})$$

In [5]:
# Given the inputs from a previous layer, calculate the next layer

def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation="relu"):
    """Compute one layer's forward pass: Z = W . A_prev + b, then A = g(Z).

    Args:
        A_prev: Activations produced by the previous layer.
        W_curr: Weight matrix of this layer.
        b_curr: Bias of this layer; it must broadcast against the product
            of ``W_curr`` and ``A_prev``.
        activation: Name of the activation function, "relu" or "sigmoid".

    Returns:
        Tuple ``(A_curr, Z_curr)``: the activated output and the
        pre-activation value it was computed from.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """
    # np.dot takes its two operands as separate arguments; this is a matrix
    # product, not an element-wise multiplication.
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    # Compare by value: `is` tests object identity, which is not guaranteed
    # to hold for two equal strings.
    if activation == "relu":
        active_fn = relu
    elif activation == "sigmoid":
        active_fn = sigmoid
    else:
        raise ValueError("Non-supported activation function")

    return active_fn(Z_curr), Z_curr

In [6]:
def full_forward_propagation(X, params_values, nn_architecture):
    """Run the input through every layer of the network in order.

    Args:
        X: Network input, used as the activation of "layer 0".
        params_values: Mapping holding "W{l}" and "b{l}" for each layer l,
            with layers numbered from 1.
        nn_architecture: Sequence of layer descriptions; only each entry's
            "activation" key is read here.

    Returns:
        Tuple ``(A_last, memory)``: the output of the final layer and a dict
        of the intermediate values kept for the backward step, keyed
        "A{l-1}" (the input to layer l) and "Z{l}" (its pre-activation).
    """
    cache = {}
    activation_out = X

    for layer_number, layer_spec in enumerate(nn_architecture, start=1):
        input_number = layer_number - 1
        # The previous layer's output is this layer's input.
        layer_input = activation_out

        activation_name = layer_spec["activation"]
        weights = params_values[f"W{layer_number}"]
        bias = params_values[f"b{layer_number}"]
        activation_out, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name
        )

        # Z is cached rather than this layer's output A because the backward
        # step needs g'(Z); this layer's output is cached on the next
        # iteration, as that layer's input.
        cache[f"A{input_number}"] = layer_input
        cache[f"Z{layer_number}"] = pre_activation

    return activation_out, cache

Cross Entropy Cost:
![Cost Function](./assets/cost_function.gif)

In [75]:
def get_cross_entropy_cost(Y_hat, Y):
    """Binary cross-entropy cost averaged over the examples.

    Computes -(1/m) * sum(Y * log(Y_hat) + (1 - Y) * log(1 - Y_hat)),
    where m is the size of axis 1 of ``Y_hat``.

    Args:
        Y_hat: Predicted probabilities; axis 1 indexes the examples.
        Y: Ground-truth labels. Presumably 0/1 values with the same shape
            as ``Y_hat`` -- TODO confirm against the caller.

    Returns:
        The cost with all length-1 axes removed by ``np.squeeze``.
    """
    probs = np.asarray(Y_hat)
    if not np.issubdtype(probs.dtype, np.floating):
        probs = probs.astype(float)
    m = probs.shape[1]  # Number of examples

    # Keep probabilities strictly inside (0, 1). A saturated prediction of
    # exactly 0 or 1 would give log(0) = -inf, and 0 * -inf = nan, so even a
    # perfect prediction would produce a NaN cost.
    eps = np.finfo(probs.dtype).eps
    probs = np.clip(probs, eps, 1 - eps)

    # https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#id11
    # https://www.youtube.com/watch?v=mj5DpK5gGsY
    cost = (-1 / m) * (np.dot(Y, np.log(probs).T) + np.dot(1 - Y, np.log(1 - probs).T))
    return np.squeeze(cost)

In [89]:
def convert_prob_into_class(probs, threshold=0.5):
    """Turn predicted probabilities into hard 0.0 / 1.0 class labels.

    Args:
        probs: Array-like of probabilities, of any shape.
        threshold: Probabilities strictly greater than this value map to
            1.0 and all others to 0.0. The default of 0.5 is the midpoint
            of a sigmoid output; a cut-off of 0 would label every
            prediction as class 1, since a sigmoid is always positive.

    Returns:
        Float array with the same shape as ``probs``.
    """
    # Vectorised comparison so that 2-D inputs such as (1, n_examples)
    # work; converting each row with float() would fail for them.
    return (np.asarray(probs) > threshold).astype(float)

In [90]:
def get_accuracy_value(Y_hat, Y):
    """Fraction of examples whose predicted classes match ``Y``.

    ``Y_hat`` is first converted to class labels with
    ``convert_prob_into_class``. A column (axis 0 reduced) counts as correct
    only if every entry in it matches, and the result is the mean over
    columns.
    """
    predicted_classes = convert_prob_into_class(Y_hat)
    entry_matches = Y == predicted_classes
    column_correct = entry_matches.all(axis=0)
    return column_correct.mean()

Sadly, many inexperienced deep learning enthusiasts regard backward propagation as an algorithm that is intimidating and difficult to understand. The combination of differential calculus and linear algebra very often deters people who do not have solid mathematical training.

People often confuse backward propagation with gradient descent, but these are in fact two separate matters. The purpose of the first is to calculate the gradient efficiently, whereas the purpose of the second is to use the calculated gradient to optimize. In a neural network, we calculate the gradient of the cost function (discussed earlier) with respect to the parameters, but backpropagation can be used to calculate the derivatives of any function. The essence of this algorithm is the recursive use of the chain rule known from differential calculus: we calculate the derivative of a function created by composing other functions whose derivatives we already know. This process - for one network layer - is described by the following formulas. Unfortunately, because this article focuses mainly on practical implementation, I'll omit the derivation. Looking at the formulas, it becomes obvious why we decided to remember the values of the A and Z matrices for intermediate layers in the forward step.

$$\boldsymbol{dW}^{[l]} = \frac{\partial L }{\partial \boldsymbol{W}^{[l]}} = \frac{1}{m} \boldsymbol{dZ}^{[l]} \boldsymbol{A}^{[l-1] T}$$

$$\boldsymbol{db}^{[l]} = \frac{\partial L }{\partial \boldsymbol{b}^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} \boldsymbol{dZ}^{[l](i)}$$

$$\boldsymbol{dA}^{[l-1]} = \frac{\partial L }{\partial \boldsymbol{A}^{[l-1]}} = \boldsymbol{W}^{[l] T} \boldsymbol{dZ}^{[l]}$$

$$\boldsymbol{dZ}^{[l]} = \boldsymbol{dA}^{[l]} * g'(\boldsymbol{Z}^{[l]})$$

