<b>Import dependencies</b>

In [1]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from dnn_utils import sigmoid, relu, sigmoid_backward, relu_backward
# Seed NumPy's global RNG so the random draws in the cells below are repeatable.
np.random.seed(1)

<b>Initialization</b>

In [2]:
# init two-layer model parameters

def init_parameters(n_x, n_h, n_y):
    '''
    Build the parameters of a two-layer network.

    Weights are standard-normal draws scaled by 0.01 and biases are zeros.
    NumPy's global RNG is reseeded with 1 on every call, so the same sizes
    always give the same weights.

    Args:
    n_x -- size of input layer
    n_h -- neurons of hidden layer
    n_y -- neurons of output layer

    Return:
    parameters -- dict with 'W1' (n_h, n_x), 'b1' (n_h, 1),
                  'W2' (n_y, n_h) and 'b2' (n_y, 1)
    '''
    np.random.seed(1)

    # Draw order is part of the contract: hidden weights first, then output weights.
    hidden_weights = 0.01 * np.random.randn(n_h, n_x)
    output_weights = 0.01 * np.random.randn(n_y, n_h)

    return {
        'W1': hidden_weights,
        'b1': np.zeros((n_h, 1)),
        'W2': output_weights,
        'b2': np.zeros((n_y, 1)),
    }

In [3]:
# test init function

parameters = init_parameters(3, 2, 1)
# Show every tensor with its shape, in layer order.
for key in ('W1', 'b1', 'W2', 'b2'):
    tensor = parameters[key]
    print(f'{key}:\n {tensor} and shape {tensor.shape}')

W1:
 [[ 0.01624345 -0.00611756 -0.00528172]
 [-0.01072969  0.00865408 -0.02301539]] and shape (2, 3)
b1:
 [[0.]
 [0.]] and shape (2, 1)
W2:
 [[ 0.01744812 -0.00761207]] and shape (1, 2)
b2:
 [[0.]] and shape (1, 1)


In [4]:
# init l-layers model parameters

def init_l_layer_parameters(layer_dims):
    '''
    Build the parameters of a network with an arbitrary number of layers.

    For each pair of consecutive sizes in layer_dims, one weight matrix
    (standard-normal draws scaled by 0.01) and one zero bias column are
    created. NumPy's global RNG is reseeded with 1 on every call.

    Args:
    layer_dims -- a list contains layer dimensions, input layer first

    Return:
    parameters -- dict with 'W1', 'b1', ..., 'WL', 'bL' where Wi has shape
                  (layer_dims[i], layer_dims[i-1]) and bi has shape
                  (layer_dims[i], 1)
    '''
    np.random.seed(1)

    parameters = {}
    # Walk adjacent (previous size, current size) pairs; layers are numbered from 1.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        parameters[f'W{layer}'] = 0.01 * np.random.randn(n_curr, n_prev)
        parameters[f'b{layer}'] = np.zeros((n_curr, 1))

    return parameters

In [26]:
# test init function

parameters = init_l_layer_parameters([5, 4, 3, 2])
# Entries come back in insertion order: W1, b1, W2, b2, ...
for name, tensor in parameters.items():
    print(f'{name} has {tensor} and shape {tensor.shape}')

W1 has [[ 0.01624345 -0.00611756 -0.00528172 -0.01072969  0.00865408]
 [-0.02301539  0.01744812 -0.00761207  0.00319039 -0.0024937 ]
 [ 0.01462108 -0.02060141 -0.00322417 -0.00384054  0.01133769]
 [-0.01099891 -0.00172428 -0.00877858  0.00042214  0.00582815]] and shape (4, 5)
b1 has [[0.]
 [0.]
 [0.]
 [0.]] and shape (4, 1)
W2 has [[-0.01100619  0.01144724  0.00901591  0.00502494]
 [ 0.00900856 -0.00683728 -0.0012289  -0.00935769]
 [-0.00267888  0.00530355 -0.00691661 -0.00396754]] and shape (3, 4)
b2 has [[0.]
 [0.]
 [0.]] and shape (3, 1)
W3 has [[-0.00687173 -0.00845206 -0.00671246]
 [-0.00012665 -0.0111731   0.00234416]] and shape (2, 3)
b3 has [[0.]
 [0.]] and shape (2, 1)


<b>Forward propagation</b>

In [27]:
# linear_forward

def linear_forward(A, W, b):
    '''
    Compute the affine part of a layer: W . A + b.

    Args:
    A -- previous layer input
    W -- current layer weights
    b -- current layer bias (added to the product with NumPy broadcasting)

    Return:
    Z -- linear forward output
    linear_cache -- the tuple (A, W, b), kept for backward propagation
    '''
    # The cache holds the very same objects that were passed in (no copies).
    linear_cache = (A, W, b)
    weighted_input = W.dot(A)

    return weighted_input + b, linear_cache

In [28]:
# test linear_forward

# Draw order is kept (A first, then W) so the seeded RNG yields the same values.
A_test = np.random.randn(3, 2)
W_test = 0.01 * np.random.randn(1, 3)
b_test = np.zeros((1, 1))
Z_test, linear_cache = linear_forward(A=A_test, W=W_test, b=b_test)

print(f'linear forward output:\n {Z_test} and shape {Z_test.shape}')

linear forward output:
 [[0.00063885 0.00926233]] and shape (1, 2)


In [29]:
# linear_activation_forward

def linear_activation_forward(A_prev, W, b, activation):
    '''
    perform linear forward propagation followed by a non-linear activation

    Args:
    A_prev -- previous layer input
    W -- current layer weights
    b -- current layer bias
    activation -- type of activation function, either 'sigmoid' or 'relu'

    Return:
    A -- linear_activation output
    linear_activation_cache -- tuple (linear_cache, activation_cache) kept for
                               backward propagation; activation_cache is whatever
                               the chosen activation helper returns as its second value

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    '''
    # Reject unknown activations up front; otherwise neither branch below would
    # bind A / activation_cache and the function would die with UnboundLocalError.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            f"unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    linear_activation_cache = (linear_cache, activation_cache)
    return A, linear_activation_cache

In [30]:
# test linear_activation forward function

# Same inputs through both supported activations.
forward_inputs = (A_test, W_test, b_test)
sigmoid_output, sigmoid_linear_activation_cache = linear_activation_forward(*forward_inputs, 'sigmoid')
relu_output, relu_linear_activation_cache = linear_activation_forward(*forward_inputs, 'relu')

for label, activations in (('sigmoid', sigmoid_output), ('relu', relu_output)):
    print(f'{label} output:\n {activations} and shape {activations.shape}')

sigmoid output:
 [[0.50015971 0.50231557]] and shape (1, 2)
relu output:
 [[0.00063885 0.00926233]] and shape (1, 2)


In [31]:
# l-layer linear_activation_forward

def l_layer_linear_activation_forward(X, parameters):
    '''
    Run the full forward pass: relu on every layer but the last, sigmoid on the last.

    Args:
    X -- original input
    parameters -- a dictionary contains parameters ('W1', 'b1', ..., 'WL', 'bL');
                  the layer count is taken as len(parameters) // 2

    Return:
    A -- output of the final (sigmoid) layer
    cache -- list with one linear_activation cache per layer, first layer first
    '''
    num_layers = len(parameters) // 2
    caches = []
    activations = X

    # Hidden layers 1 .. L-1 all use relu.
    for layer in range(1, num_layers):
        activations, layer_cache = linear_activation_forward(
            activations, parameters[f'W{layer}'], parameters[f'b{layer}'], 'relu'
        )
        caches.append(layer_cache)

    # Layer L is the sigmoid output layer.
    network_output, layer_cache = linear_activation_forward(
        activations, parameters[f'W{num_layers}'], parameters[f'b{num_layers}'], 'sigmoid'
    )
    caches.append(layer_cache)

    return network_output, caches

In [33]:
# test l-layer linear_activation_forward

# Three random examples with five features each, matching the 5-unit input layer.
X_test = np.random.randn(5, 3)
output, cache = l_layer_linear_activation_forward(X_test, parameters)
print(f'the network output:\n {output} and has shape {output.shape}')

the network output:
 [[0.4999999  0.49999877 0.49999914]
 [0.49999989 0.50000011 0.50000007]] and has shape (2, 3)


<b>Cost function</b>

In [34]:
def compute_cost(Y, Y_predict, eps=1e-15):
    '''
    compute the binary cross-entropy cost of the model

    The sum of the element-wise cross-entropy terms is divided by the number of
    columns of Y (Y.shape[1]).

    Args:
    Y -- ground-truth labels
    Y_predict -- model output
    eps -- predictions are clipped into [eps, 1 - eps] before taking logs, so
           saturated outputs (exactly 0 or 1) give a finite cost instead of
           nan/inf; predictions already inside that range are not changed

    Return:
    cost -- model cost (result of np.squeeze on the scalar)
    '''
    m = Y.shape[1]

    # Without the clip, a prediction of exactly 0 or 1 evaluates 0 * log(0) = nan,
    # even when that prediction is correct.
    clipped = np.clip(Y_predict, eps, 1 - eps)

    cost = -np.sum(Y * np.log(clipped) + (1 - Y) * np.log(1 - clipped)) / m
    cost = np.squeeze(cost)
    return cost

In [35]:
# test cost

# Use all-zero labels with the same shape as the prediction. The earlier (1, 3)
# label array only worked against the (2, 3) output through silent broadcasting.
cost = compute_cost(np.zeros_like(output), output)
print(f'model cost: {cost}')

model cost: 1.3862929573092737


<b>Back propagation</b>

In [36]:
# linear_backward

def linear_backward(dZ, linear_cache):
    '''
    Backward pass through the affine part of one layer.

    Args:
    dZ -- gradient for Z
    linear_cache -- the (A, W, b) tuple stored during the forward pass

    Return:
    linear_grads -- dict with
                    'dW': (1/m) * dZ . A^T
                    'db': (1/m) * row-wise sum of dZ (kept as a column)
                    'dA_prev': W^T . dZ
                    where m is the number of columns of A
    '''
    # The bias value itself is not needed for any of the gradients.
    A_prev, W, _ = linear_cache
    inv_m = 1 / A_prev.shape[1]

    return {
        'dW': inv_m * np.dot(dZ, A_prev.T),
        'db': inv_m * np.sum(dZ, axis=1, keepdims=True),
        'dA_prev': np.dot(W.T, dZ),
    }

In [37]:
# test linear_backward

linear_grads = linear_backward(1 - Z_test, linear_cache)

# (label used in the message, key in the gradients dict)
report = (('weights', 'dW'), ('bias', 'db'), ('previous layer output', 'dA_prev'))
for label, key in report:
    gradient = linear_grads[key]
    print(f'the gradients for {label}:\n {gradient} and has shape {gradient.shape}')

the gradients for weights:
 [[ 1.19695646 -0.53556022  0.46504878]] and has shape (1, 3)
the gradients for bias:
 [[0.99504941]] and has shape (1, 1)
the gradients for previous layer output:
 [[ 0.00050775  0.00050337]
 [-0.00636589 -0.00631096]
 [ 0.00190794  0.00189147]] and has shape (3, 2)


In [38]:
# linear_activation_backward

def linear_activation_backward(dA, linear_activation_cache, activation):
    '''
    perform backward propagation through one activation + linear layer

    Args:
    dA -- gradient for A
    linear_activation_cache -- tuple (linear_cache, activation_cache) saved by the
                               forward pass of this layer
    activation -- which activation the layer used, either 'relu' or 'sigmoid'

    Return:
    linear_activation_grads -- gradients, as returned by linear_backward

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    '''
    # Reject unknown activations up front; otherwise no branch would bind the
    # result and the function would die with UnboundLocalError.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            f"unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    linear_cache, activation_cache = linear_activation_cache

    # Undo the non-linearity first, then the affine step (shared by both cases).
    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
    else:
        dZ = relu_backward(dA, activation_cache)

    return linear_backward(dZ, linear_cache)

In [39]:
# test linear_activation_backward

# Smoke test: the forward outputs are reused as stand-in upstream gradients.
sigmoid_grads = linear_activation_backward(
    sigmoid_output, sigmoid_linear_activation_cache, activation='sigmoid'
)
relu_grads = linear_activation_backward(
    relu_output, relu_linear_activation_cache, activation='relu'
)

for label, layer_grads in (('sigmoid', sigmoid_grads), ('relu', relu_grads)):
    print(f'{label} gradients:\n {layer_grads}')

sigmoid gradients:
 {'dW': array([[ 0.1503623 , -0.06772609,  0.0595537 ]]), 'db': array([[0.12530806]]), 'dA_prev': array([[ 6.35299736e-05,  6.38024469e-05],
       [-7.96498818e-04, -7.99914916e-04],
       [ 2.38720561e-04,  2.39744408e-04]])}
relu gradients:
 {'dW': array([[ 0.00396671, -0.00417203,  0.00759938]]), 'db': array([[0.00495059]]), 'dA_prev': array([[ 3.24586083e-07,  4.70598280e-06],
       [-4.06945598e-06, -5.90006500e-05],
       [ 1.21966636e-06,  1.76832255e-05]])}


In [40]:
# l-layer linear_activation_backward

def l_layer_linear_activation_backward(Y_predict, Y, cache):
    '''
    perform the full backward pass: sigmoid output layer, then relu layers

    Args:
    Y_predict -- forward pass output
    Y -- ground-truth label (reshaped to Y_predict's shape)
    cache -- list of per-layer linear_activation caches from the forward pass,
             first layer first

    Return:
    grads -- A dictionary with the gradients: 'dW<l>' and 'db<l>' for each
             layer l = 1..L, and 'dA<l>' for l = 0..L-1
    '''

    grads = {}
    L = len(cache)
    Y = Y.reshape(Y_predict.shape)

    # Keep predictions strictly inside (0, 1): the cross-entropy derivative divides
    # by Y_predict and 1 - Y_predict, so a saturated output of exactly 0 or 1 would
    # otherwise inject inf/nan into every gradient. Values already inside
    # [eps, 1 - eps] are left unchanged.
    eps = 1e-15
    Y_safe = np.clip(Y_predict, eps, 1 - eps)

    # Derivative of the cross-entropy loss with respect to the network output.
    dloss = - (np.divide(Y, Y_safe) - np.divide(1 - Y, 1 - Y_safe))

    # Output layer (sigmoid).
    layer_grads = linear_activation_backward(dloss, cache[L-1], 'sigmoid')
    grads['dW' + str(L)] = layer_grads['dW']
    grads['db' + str(L)] = layer_grads['db']
    grads['dA' + str(L-1)] = layer_grads['dA_prev']

    # Remaining layers (relu), from layer L-1 down to layer 1. cache[i] belongs to
    # layer i+1, whose upstream gradient was stored as 'dA<i+1>' one step earlier.
    for i in reversed(range(L-1)):
        layer_grads = linear_activation_backward(grads['dA' + str(i+1)], cache[i], 'relu')
        grads['dW' + str(i+1)] = layer_grads['dW']
        grads['db' + str(i+1)] = layer_grads['db']
        grads['dA' + str(i)] = layer_grads['dA_prev']

    return grads

In [42]:
# test l-layer linear_activation_backward

# One label row per output unit, one column per example.
Y_test = np.array([[1, 0, 1],
                   [1, 0, 0]])
grads = l_layer_linear_activation_backward(output, Y_test, cache)
print(f'grads {grads}')

grads {'dW3': array([[ 2.21461081e-05, -6.55843207e-06,  1.23840881e-05],
       [ 1.41430411e-04, -6.55843223e-06,  6.07893675e-05]]), 'db3': array([[-0.16666739],
       [ 0.16666669]]), 'dA2': array([[ 0.00349919, -0.00349918,  0.00337255],
       [ 0.00981258, -0.00981257, -0.00136052],
       [ 0.00218415, -0.00218414,  0.00452832]]), 'dW2': array([[ 6.29242964e-06, -1.63789215e-05,  8.77805608e-06,
         1.49520943e-06],
       [ 1.76455213e-05,  0.00000000e+00,  2.46158295e-05,
         0.00000000e+00],
       [ 0.00000000e+00,  1.38681524e-05,  0.00000000e+00,
         3.57360303e-06]]), 'db2': array([[0.00112419],
       [0.00327086],
       [0.00078139]]), 'dA1': array([[ 4.98845036e-05,  4.43636856e-05, -4.92497087e-05],
       [-2.70353316e-05, -5.16396465e-05,  6.26225048e-05],
       [ 1.94896428e-05, -1.64414004e-05, -9.14012761e-07],
       [-7.42399256e-05, -8.91750474e-06, -1.01939507e-06]]), 'dW1': array([[ 8.52908316e-06, -1.25661913e-06,  3.63421148e-05,
       

<b>Update parameters</b>

In [43]:
def update_params(params, grads, learning_rate):
    '''
    Apply one gradient-descent step to every weight and bias.

    Args:
    params -- weights and bias ('W1', 'b1', ..., 'WL', 'bL'); the layer count
              is taken as len(params) // 2
    grads -- gradients for weights and bias ('dW1', 'db1', ...)
    learning_rate -- learning rate alpha

    Return:
    updated_parameters -- a new dictionary with the stepped values; the dict
                          passed in is not modified
    '''
    # A shallow copy is enough: entries are rebound to new arrays, never edited in place.
    parameters = params.copy()

    num_layers = len(params) // 2
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = f'{prefix}{layer}'
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]

    return parameters

In [45]:
# test update parameters

# One gradient-descent step with alpha = 0.1.
updated_parameters = update_params(parameters, grads, learning_rate=0.1)
print(f'updated parameters:\n {updated_parameters}')

updated parameters:
 {'W1': array([[ 0.0162426 , -0.00611744, -0.00528535, -0.01072885,  0.00865355],
       [-0.02301692,  0.01744689, -0.00761146,  0.00318884, -0.00249655],
       [ 0.01462075, -0.02060136, -0.00322559, -0.00384022,  0.01133749],
       [-0.01099898, -0.00172389, -0.00877905,  0.00042221,  0.00582754]]), 'b1': array([[-1.66281679e-06],
       [-3.66095275e-07],
       [-6.49654759e-07],
       [ 3.31229993e-07]]), 'W2': array([[-0.01100682,  0.01144887,  0.00901503,  0.00502479],
       [ 0.00900679, -0.00683728, -0.00123136, -0.00935769],
       [-0.00267888,  0.00530217, -0.00691661, -0.00396789]]), 'b2': array([[-1.12418516e-04],
       [-3.27086067e-04],
       [-7.81390469e-05]]), 'W3': array([[-0.00687394, -0.0084514 , -0.0067137 ],
       [-0.00014079, -0.01117245,  0.00233808]]), 'b3': array([[ 0.01666674],
       [-0.01666667]])}
