In [1]:
import numpy as np
from IPython.display import Image

In [2]:
# Layer-by-layer description of the network: each entry gives the number of
# inputs the layer receives, the number of units it outputs, and the name of
# its activation function. Consecutive layers must agree on their sizes.
NN_ARCHITECTURE = [
    dict(input_dim=2, output_dim=25, activation="relu"),
    dict(input_dim=25, output_dim=50, activation="relu"),
    dict(input_dim=50, output_dim=50, activation="relu"),
    dict(input_dim=50, output_dim=25, activation="relu"),
    dict(input_dim=25, output_dim=1, activation="sigmoid"),
]

In [3]:
# Function for initializing our NN weights and biases
def init_layers(nn_architecture, seed=101):
    """Create small random starting weights and biases for every layer.

    Seeds NumPy's global random generator with ``seed`` and then, for each
    layer in order, draws the weight matrix followed by the bias vector from
    a standard normal distribution scaled by 0.1.

    Args:
        nn_architecture: Iterable of dicts, each providing integer
            ``"input_dim"`` and ``"output_dim"`` entries.
        seed: Value passed to ``np.random.seed``.

    Returns:
        dict mapping ``"W<i>"`` to an array of shape (output_dim, input_dim)
        and ``"b<i>"`` to an array of shape (output_dim,), with ``i``
        counting layers from 1.

    NOTE(review): the biases are 1-D, whereas single_layer_backpropagation
    produces bias gradients of shape (output_dim, 1) via ``keepdims=True`` --
    confirm whether column-vector biases were intended before training on
    batches.
    """
    np.random.seed(seed)
    params_values = {}

    for layer_number, layer in enumerate(nn_architecture, start=1):
        fan_in = layer["input_dim"]
        fan_out = layer["output_dim"]

        # Draw order (weights first, then bias) determines the values obtained
        # for a given seed, so it must not be swapped.
        params_values[f"W{layer_number}"] = np.random.randn(fan_out, fan_in) * 0.1
        params_values[f"b{layer_number}"] = np.random.randn(fan_out) * 0.1

    return params_values
        

In [4]:
# Activation functions and their respective derivatives:

def sigmoid(Z):
    """Element-wise logistic function ``1 / (1 + exp(-Z))``, computed stably.

    Args:
        Z: Scalar or array of pre-activation values.

    Returns:
        Values in [0, 1] with the same shape as ``Z`` (a NumPy scalar for
        scalar input).
    """
    Z = np.asarray(Z)
    # exp(-|Z|) always lies in (0, 1], so it cannot overflow the way exp(-Z)
    # does for large negative Z.
    decay = np.exp(-np.abs(Z))
    # For Z >= 0 this is exactly 1 / (1 + exp(-Z)); for Z < 0 it is the
    # algebraically equivalent exp(Z) / (1 + exp(Z)).
    result = np.where(Z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Empty-tuple indexing leaves arrays as arrays and unwraps a 0-d result
    # into a NumPy scalar, matching what scalar input produced before.
    return result[()]

def relu(Z):
    """Rectified linear unit: element-wise maximum of 0 and ``Z``.

    Negative entries become 0; non-negative entries pass through unchanged.
    """
    floor = 0
    return np.maximum(floor, Z)

def sigmoid_gradient(dA, Z):
    """Back-propagate a gradient through the sigmoid activation.

    Args:
        dA: Upstream gradient, i.e. the gradient with respect to the
            sigmoid's output.
        Z: Pre-activation values that were fed to the sigmoid.

    Returns:
        ``dA`` multiplied element-wise by the sigmoid derivative
        ``s * (1 - s)``, where ``s = sigmoid(Z)``.
    """
    activation = sigmoid(Z)
    complement = 1 - activation
    return dA * activation * complement

def relu_gradient(dA, Z):
    """Back-propagate a gradient through the ReLU activation.

    Args:
        dA: Upstream gradient, i.e. the gradient with respect to the ReLU's
            output.
        Z: Pre-activation values that were fed to the ReLU.

    Returns:
        A copy of ``dA`` in which every position where ``Z <= 0`` is zeroed.
    """
    # np.array copies by default, so the caller's dA is left untouched.
    grad = np.array(dA)
    inactive = Z <= 0
    grad[inactive] = 0
    return grad

Single Layer Forward Propagation:

$$\boldsymbol{Z}^{[l]} = \boldsymbol{W}^{[l]} \cdot \boldsymbol{A}^{[l-1]} + \boldsymbol{b}^{[l]}$$


$$\boldsymbol{A}^{[l]} = g^{[l]}(\boldsymbol{Z}^{[l]})$$

In [5]:
# Given the inputs from a previous layer, calculate the next layer

def single_layer_forward_propagation(A_prev, W_curr, b_curr, activation):
    """Compute one layer's pre-activation ``Z`` and activation ``A``.

    Args:
        A_prev: Activations coming from the previous layer.
        W_curr: Weight matrix of the current layer.
        b_curr: Bias of the current layer.
        activation: Name of the activation function, ``"relu"`` or
            ``"sigmoid"``.

    Returns:
        Tuple ``(A_curr, Z_curr)``. ``Z_curr`` is returned alongside the
        activation because back-propagation later needs the pre-activation
        values to evaluate the activation's derivative.

    Raises:
        Exception: If ``activation`` is neither ``"relu"`` nor ``"sigmoid"``.
    """
    Z_curr = np.dot(W_curr, A_prev) + b_curr

    # Compare by value: an identity check (`is`) only succeeds when the caller
    # happens to pass the very same interned string object.
    if activation == "relu":
        active_fn = relu
    elif activation == "sigmoid":
        active_fn = sigmoid
    else:
        raise Exception("Non-supported activation function")

    return active_fn(Z_curr), Z_curr

In [6]:
def full_forward_propagation(X, params_values, nn_architecture):
    """Run the input through every layer and record what backprop will need.

    Args:
        X: Network input, treated as the activation of layer 0.
        params_values: dict holding ``"W<i>"`` and ``"b<i>"`` for each layer,
            with ``i`` counting from 1.
        nn_architecture: Sequence of layer dicts; only the ``"activation"``
            entry is read here.

    Returns:
        Tuple ``(A_last, memory)`` where ``A_last`` is the final layer's
        activation and ``memory`` maps ``"A<i-1>"`` to the input of layer
        ``i`` and ``"Z<i>"`` to its pre-activation. The final activation is
        not placed in ``memory``; it is only returned.
    """
    cache = {}
    activation = X

    for layer_number, spec in enumerate(nn_architecture, start=1):
        layer_input = activation
        activation_name = spec["activation"]
        weights = params_values[f"W{layer_number}"]
        bias = params_values[f"b{layer_number}"]

        activation, pre_activation = single_layer_forward_propagation(
            layer_input, weights, bias, activation_name
        )

        # Cache this layer's input and pre-activation for the backward pass.
        cache[f"A{layer_number - 1}"] = layer_input
        cache[f"Z{layer_number}"] = pre_activation

    return activation, cache

Cross Entropy Cost:
![Cost Function](./assets/cost_function.gif)

In [7]:
def get_cross_entropy_cost(Y_hat, Y):
    """Mean binary cross-entropy between predictions and labels.

    Args:
        Y_hat: 2-D array of predicted probabilities, one column per example.
        Y: Array of 0/1 labels with the same shape as ``Y_hat``.

    Returns:
        The cost as a 0-d NumPy array (result of ``np.squeeze``).

    References:
        https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#id11
        https://www.youtube.com/watch?v=mj5DpK5gGsY
    """
    probs = np.asarray(Y_hat)
    if not np.issubdtype(probs.dtype, np.floating):
        probs = probs.astype(np.float64)

    m = probs.shape[1]  # Number of examples

    # Predictions of exactly 0 or 1 would make log() return -inf and turn the
    # cost into nan/inf, so keep them one machine epsilon inside (0, 1).
    # Values already inside that interval are left untouched.
    eps = np.finfo(probs.dtype).eps
    probs = np.clip(probs, eps, 1 - eps)

    cost = (-1 / m) * (np.dot(Y, np.log(probs).T) + np.dot(1 - Y, np.log(1 - probs).T))
    return np.squeeze(cost)

In [8]:
# Not used by NN, just for reporting
def convert_prob_into_class(probs, threshold=0.5):
    """Turn predicted probabilities into hard 0.0 / 1.0 class labels.

    Args:
        probs: Scalar or array of probabilities.
        threshold: Probabilities strictly greater than this value map to
            1.0; everything else maps to 0.0.

    Returns:
        Float array with the same shape as ``probs``.
    """
    # A cut-off of 0 would label every positive probability as class 1, which
    # is every output a sigmoid can produce; 0.5 is the decision boundary.
    return (np.asarray(probs) > threshold).astype(float)

In [9]:
# Not used by NN, just for reporting
def get_accuracy_value(Y_hat, Y):
    """Fraction of examples whose predicted classes match the labels.

    Predictions are first converted to classes with
    ``convert_prob_into_class``. An example (column) counts as correct only
    if every entry along axis 0 matches.
    """
    predicted = convert_prob_into_class(Y_hat)
    matches = Y == predicted
    per_example_correct = matches.all(axis=0)
    return per_example_correct.mean()

In a neural network, we calculate the gradient of the cost function with respect to the parameters, but backpropagation can be used to calculate the derivatives of any function. The essence of the algorithm is the recursive use of the chain rule from differential calculus: we compute the derivative of a function built by composing other functions whose derivatives we already know. For a single network layer, this process is described by the following formulas:

$$\boldsymbol{dW}^{[l]} = \frac{\partial L }{\partial \boldsymbol{W}^{[l]}} = \frac{1}{m} \boldsymbol{dZ}^{[l]} \boldsymbol{A}^{[l-1] T}$$

$$\boldsymbol{db}^{[l]} = \frac{\partial L }{\partial \boldsymbol{b}^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} \boldsymbol{dZ}^{[l](i)}$$

$$\boldsymbol{dA}^{[l-1]} = \frac{\partial L }{\partial \boldsymbol{A}^{[l-1]}} = \boldsymbol{W}^{[l] T} \boldsymbol{dZ}^{[l]}$$

$$\boldsymbol{dZ}^{[l]} = \boldsymbol{dA}^{[l]} * g'(\boldsymbol{Z}^{[l]})$$



In [10]:
def single_layer_backpropagation(dA_curr, W_curr, b_curr, Z_curr, A_prev, activation):
    """Back-propagate the loss gradient through a single layer.

    Args:
        dA_curr: Gradient of the loss with respect to this layer's activation.
        W_curr: Weight matrix of this layer.
        b_curr: Bias of this layer. Accepted as part of the signature but not
            used in the computation.
        Z_curr: Pre-activation values recorded during the forward pass.
        A_prev: 2-D array of activations that were fed into this layer, one
            column per example.
        activation: Name of this layer's activation, ``"relu"`` or
            ``"sigmoid"``.

    Returns:
        Tuple ``(dA_prev, dW_curr, db_curr)``: the gradients with respect to
        the previous layer's activation, this layer's weights, and this
        layer's bias (kept as a column via ``keepdims=True``).

    Raises:
        Exception: If ``activation`` is neither ``"relu"`` nor ``"sigmoid"``.
    """
    # Number of examples:
    m = A_prev.shape[1]

    # Compare by value; an identity check (`is`) depends on string interning.
    if activation == "relu":
        activation_fn = relu_gradient
    elif activation == "sigmoid":
        activation_fn = sigmoid_gradient
    else:
        raise Exception('Non-supported activation function')

    # Gradient of the loss with respect to the pre-activation Z
    dZ_curr = activation_fn(dA_curr, Z_curr)

    # Gradient of the loss with respect to the weight matrix W
    dW_curr = np.dot(dZ_curr, A_prev.T) / m
    # Gradient of the loss with respect to the bias vector b
    db_curr = np.sum(dZ_curr, axis=1, keepdims=True) / m
    # Gradient of the loss with respect to the previous layer's activation
    dA_prev = np.dot(W_curr.T, dZ_curr)

    return dA_prev, dW_curr, db_curr

We start by calculating the derivative of the cost function with respect to the prediction vector, i.e. the result of forward propagation. This is straightforward, as it only requires implementing the formula below. We then iterate through the layers of the network, starting from the last one, and calculate the derivatives with respect to all parameters. Finally, the function returns a Python dictionary containing the gradients we are looking for:

$$\frac{\partial L }{\partial \boldsymbol{\hat{Y}}} = -(\frac{\boldsymbol{Y}}{\boldsymbol{\hat{Y}}}- \frac{1-\boldsymbol{Y}}{1-\boldsymbol{\hat{Y}}})$$

In [11]:
def full_backpropagation(Y_hat, Y, memory, params_values, nn_architecture):
    """Compute the gradients of the cross-entropy cost for every layer.

    Args:
        Y_hat: Array of predictions produced by the forward pass.
        Y: Array of labels; reshaped to ``Y_hat.shape`` before use.
        memory: dict from the forward pass with ``"A<i-1>"`` (input of layer
            ``i``) and ``"Z<i>"`` (pre-activation of layer ``i``) entries.
        params_values: dict holding ``"W<i>"`` and ``"b<i>"`` for each layer.
        nn_architecture: Sequence of layer dicts; only the ``"activation"``
            entry is read here.

    Returns:
        dict mapping ``"dW<i>"`` and ``"db<i>"`` to the gradients of layer
        ``i`` (1-based), filled in from the last layer to the first.
    """
    # Store our gradients in a dictionary for later use
    grads_values = {}

    # Ensure the labels have the same shape as the prediction vector
    Y = Y.reshape(Y_hat.shape)

    # Predictions of exactly 0 or 1 would divide by zero below, so keep them
    # one machine epsilon inside (0, 1). Values already inside are unchanged.
    float_dtype = Y_hat.dtype if np.issubdtype(Y_hat.dtype, np.floating) else np.float64
    eps = np.finfo(float_dtype).eps
    Y_hat_safe = np.clip(Y_hat.astype(float_dtype, copy=False), eps, 1 - eps)

    # Derivative of the cost function with respect to the prediction vector
    dA_prev = -1 * (np.divide(Y, Y_hat_safe) - np.divide((1 - Y), (1 - Y_hat_safe)))

    # Walk the layers from last to first, feeding each layer's dA_prev into
    # the layer before it.
    for prev_layer_idx, layer in reversed(list(enumerate(nn_architecture))):
        curr_layer_idx = prev_layer_idx + 1
        curr_activation_fn = layer['activation']

        dA_curr = dA_prev

        A_prev = memory[f"A{prev_layer_idx}"]
        Z_curr = memory[f"Z{curr_layer_idx}"]
        W_curr = params_values[f"W{curr_layer_idx}"]
        b_curr = params_values[f"b{curr_layer_idx}"]

        dA_prev, dW_curr, db_curr = single_layer_backpropagation(
            dA_curr,
            W_curr,
            b_curr,
            Z_curr,
            A_prev,
            curr_activation_fn
        )

        # Store our gradients in a dictionary for later updates
        grads_values[f"dW{curr_layer_idx}"] = dW_curr
        grads_values[f"db{curr_layer_idx}"] = db_curr
    return grads_values

In [12]:
# Initialise the weights and biases of every layer using init_layers' default seed.
params_values = init_layers(NN_ARCHITECTURE)

In [13]:
# Display the freshly initialised parameter dictionary.
params_values

{'W1': array([[ 2.70684984e-01,  6.28132709e-02],
        [ 9.07969446e-02,  5.03825754e-02],
        [ 6.51117948e-02, -3.19318045e-02],
        [-8.48076983e-02,  6.05965349e-02],
        [-2.01816824e-01,  7.40122057e-02],
        [ 5.28813494e-02, -5.89000533e-02],
        [ 1.88695309e-02, -7.58872056e-02],
        [-9.33237216e-02,  9.55056509e-02],
        [ 1.90794322e-02,  1.97875732e-01],
        [ 2.60596728e-01,  6.83508886e-02],
        [ 3.02665449e-02,  1.69372293e-01],
        [-1.70608593e-01, -1.15911942e-01],
        [-1.34840721e-02,  3.90527843e-02],
        [ 1.66904636e-02,  1.84501859e-02],
        [ 8.07705914e-02,  7.29596753e-03],
        [ 6.38787013e-02,  3.29646299e-02],
        [-4.97104023e-02, -7.54069701e-02],
        [-9.43406403e-02,  4.84751647e-02],
        [-1.16773316e-02,  1.90175480e-01],
        [ 2.38126959e-02,  1.99665229e-01],
        [-9.93263500e-02,  1.96799505e-02],
        [-1.13664459e-01,  3.66479606e-05],
        [ 1.02598415e-01, 

In [14]:
# Run one forward pass on the input [1, 1]; `memory` receives the per-layer A/Z cache.
# NOTE(review): the input is 1-D here, while get_cross_entropy_cost and
# single_layer_backpropagation read `.shape[1]` -- confirm whether a (2, 1)
# column was intended before using this result for training.
A_curr, memory = full_forward_propagation(np.array([1,1]), params_values, NN_ARCHITECTURE)

In [15]:
# Display the cached layer inputs (A) and pre-activations (Z) from the forward pass.
memory

{'A0': array([1, 1]),
 'Z1': array([ 0.25796572,  0.10653767,  0.04788267, -0.07215597, -0.07192768,
         0.09646232, -0.1496051 ,  0.18846834,  0.10357345,  0.38999541,
         0.23824187, -0.07811868, -0.01208316,  0.05817428,  0.15618749,
         0.20035584, -0.12823342,  0.14812776,  0.07797946,  0.14929895,
        -0.06093395, -0.18691233, -0.05135339,  0.21007422,  0.25060457]),
 'A1': array([0.25796572, 0.10653767, 0.04788267, 0.        , 0.        ,
        0.09646232, 0.        , 0.18846834, 0.10357345, 0.38999541,
        0.23824187, 0.        , 0.        , 0.05817428, 0.15618749,
        0.20035584, 0.        , 0.14812776, 0.07797946, 0.14929895,
        0.        , 0.        , 0.        , 0.21007422, 0.25060457]),
 'Z2': array([-0.04077838, -0.01644601, -0.01304237, -0.00952487, -0.16221483,
         0.08872989,  0.03257746, -0.03374251, -0.0765148 ,  0.21248075,
         0.18171438, -0.17162753,  0.04408912,  0.05183725,  0.20597302,
        -0.07470464,  0.00444537

The final layer's activation (`A5`) isn't stored in `memory`, because `full_forward_propagation` returns it directly as `A_curr`.

In [20]:
# Check that the returned activation equals the sigmoid of the last cached
# pre-activation (layer 5, the sigmoid output layer in NN_ARCHITECTURE).
A_curr == sigmoid(memory['Z5'])

array([ True])