<b>Import dependencies</b>

In [174]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from dnn_utils import sigmoid, relu, sigmoid_backward, relu_backward

<b>Initialization</b>

In [48]:
# init two-layer model parameters

def init_parameters(n_x, n_h, n_y):
    '''
    build the parameter dictionary of a two-layer network

    Weights are drawn from np.random.randn and scaled by 0.01; biases are
    all zeros. The global NumPy RNG is reseeded with 1 on every call, so
    repeated calls with the same sizes return the same values.

    Args:
    n_x -- size of input layer
    n_h -- neurons of hidden layer
    n_y -- neurons of output layer

    Return:
    parameters -- a python dictionary with keys 'W1', 'b1', 'W2', 'b2'
                  of shapes (n_h, n_x), (n_h, 1), (n_y, n_h), (n_y, 1)
    '''

    np.random.seed(1)
    weight_scale = 0.01

    # entries are evaluated top to bottom, so W1 is drawn before W2
    return {
        'W1': np.random.randn(n_h, n_x) * weight_scale,
        'b1': np.zeros((n_h, 1)),
        'W2': np.random.randn(n_y, n_h) * weight_scale,
        'b2': np.zeros((n_y, 1)),
    }

In [47]:
# Smoke test for init_parameters: build a 3-2-1 network and print every
# parameter array followed by its shape.
# The result is bound to the kernel-level name `parameters`, which the
# l-layer init test cell further down rebinds.

parameters = init_parameters(3, 2, 1)
print(f'W1:\n {parameters["W1"]} and shape {parameters["W1"].shape}')
print(f'b1:\n {parameters["b1"]} and shape {parameters["b1"].shape}')
print(f'W2:\n {parameters["W2"]} and shape {parameters["W2"].shape}')
print(f'b2:\n {parameters["b2"]} and shape {parameters["b2"].shape}')

W1:
 [[ 0.01624345 -0.00611756 -0.00528172]
 [-0.01072969  0.00865408 -0.02301539]] and shape (2, 3)
b1:
 [[0.]
 [0.]] and shape (2, 1)
W2:
 [[ 0.01744812 -0.00761207]] and shape (1, 2)
b2:
 [[0.]] and shape (1, 1)


In [23]:
# init l-layers model parameters

def init_l_layer_parameters(layer_dims):
    '''
    build the parameter dictionary of a network with any number of layers

    For every consecutive pair of sizes in layer_dims a weight matrix is
    drawn from np.random.randn and scaled by 0.01, and a zero bias column
    is created. The global NumPy RNG is reseeded with 3 on every call.

    Args:
    layer_dims -- a list contains layer dimensions, input layer first

    Return:
    parameters -- a python dictionary with keys 'W1', 'b1', ..., where
                  'Wk' has shape (layer_dims[k], layer_dims[k-1]) and
                  'bk' has shape (layer_dims[k], 1)
    '''

    np.random.seed(3)
    weight_scale = 0.01

    parameters = {}
    layer = 1
    while layer < len(layer_dims):
        fan_in, fan_out = layer_dims[layer - 1], layer_dims[layer]
        parameters[f'W{layer}'] = np.random.randn(fan_out, fan_in) * weight_scale
        parameters[f'b{layer}'] = np.zeros((fan_out, 1))
        layer += 1

    return parameters

In [71]:
# Smoke test for init_l_layer_parameters: layer sizes 2 -> 5 -> 4 -> 3 -> 2 -> 1,
# i.e. five weight/bias pairs. Rebinds the kernel-level name `parameters`.

parameters = init_l_layer_parameters([2,5,4,3,2,1])
# iterating a dict yields its keys, so `i` is 'W1', 'b1', 'W2', ...
for i in parameters:
    print(f'{i} has {parameters[i]} and shape {parameters[i].shape}')

W1 has [[ 0.01788628  0.0043651 ]
 [ 0.00096497 -0.01863493]
 [-0.00277388 -0.00354759]
 [-0.00082741 -0.00627001]
 [-0.00043818 -0.00477218]] and shape (5, 2)
b1 has [[0.]
 [0.]
 [0.]
 [0.]
 [0.]] and shape (5, 1)
W2 has [[-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]
 [-0.01185047 -0.0020565   0.01486148  0.00236716 -0.01023785]
 [-0.00712993  0.00625245 -0.00160513 -0.00768836 -0.00230031]] and shape (4, 5)
b2 has [[0.]
 [0.]
 [0.]
 [0.]] and shape (4, 1)
W3 has [[ 0.00745056  0.01976111 -0.01244123 -0.00626417]
 [-0.00803766 -0.02419083 -0.00923792 -0.01023876]
 [ 0.01123978 -0.00131914 -0.01623285  0.00646675]] and shape (3, 4)
b3 has [[0.]
 [0.]
 [0.]] and shape (3, 1)
W4 has [[-0.00356271 -0.01743141 -0.0059665 ]
 [-0.00588594 -0.00873882  0.00029714]] and shape (2, 3)
b4 has [[0.]
 [0.]] and shape (2, 1)
W5 has [[-0.02248258 -0.00267762]] and shape (1, 2)
b5 has [[0.]] and shape (1, 1)


<b>Forward propagation</b>

In [25]:
# linear_forward

def linear_forward(A, W, b):
    '''
    compute the affine (pre-activation) part of one layer: Z = W . A + b

    Args:
    A -- previous layer input
    W -- current layer weights
    b -- current layer bias, added to the product with NumPy broadcasting

    Return:
    Z -- linear forward output
    linear_cache -- the tuple (A, W, b), kept for backward propagation
    '''

    pre_activation = np.dot(W, A) + b

    # the inputs are stored as-is (no copies) for the backward pass
    return pre_activation, (A, W, b)

In [49]:
# Smoke test for linear_forward with a (3, 4) input, a (1, 3) weight matrix
# and a (1, 1) zero bias.
# No seed is set in this cell, so the random values depend on the state of
# the global NumPy RNG at the moment the cell runs.

A_test = np.random.randn(3,4)
W_test = np.random.randn(1,3) * 0.01
b_test = np.zeros((1,1))
# Z_test and linear_cache stay bound in the kernel; the linear_backward
# test cell further down reads both.
Z_test, linear_cache = linear_forward(A_test, W_test, b_test)

print(f'linear forward output:\n {Z_test} and shape {Z_test.shape}')

linear forward output:
 [[-0.00875679 -0.00956643 -0.00273312  0.0153382 ]] and shape (1, 4)


In [27]:
# linear_activation_forward

def linear_activation_forward(A_prev, W, b, activation):
    '''
    perform linear forward propagation followed by a non-linear activation

    Args:
    A_prev -- previous layer input
    W -- current layer weights
    b -- current layer bias
    activation -- type of activation function, 'sigmoid' or 'relu'

    Return:
    A -- linear_activation output
    linear_activation_cache -- the tuple (linear_cache, activation_cache),
                               intermediate variables for backward propagation

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    '''

    # Reject unknown names up front. Without this check neither branch below
    # would run and the function would fail later with an UnboundLocalError
    # that says nothing about the real cause.
    if activation not in ('sigmoid', 'relu'):
        raise ValueError(
            f"unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    Z, linear_cache = linear_forward(A_prev, W, b)

    # both helpers come from dnn_utils and are unpacked as (A, activation_cache)
    if activation == 'sigmoid':
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    linear_activation_cache = (linear_cache, activation_cache)
    return A, linear_activation_cache

In [52]:
# Smoke test for linear_activation_forward: run the same inputs (A_test,
# W_test, b_test from the linear_forward test cell) through both supported
# activations and print each output with its shape.
# The two caches stay bound in the kernel; the linear_activation_backward
# test cell further down reads them.

sigmoid_output, sigmoid_linear_activation_cache = linear_activation_forward(A_test, W_test, b_test, 'sigmoid')
relu_output, relu_linear_activation_cache = linear_activation_forward(A_test, W_test, b_test, 'relu')
print(f'sigmoid output:\n {sigmoid_output} and shape {sigmoid_output.shape}')
print(f'relu output:\n {relu_output} and shape {relu_output.shape}')

sigmoid output:
 [[0.49781082 0.49760841 0.49931672 0.50383447]] and shape (1, 4)
relu output:
 [[0.        0.        0.        0.0153382]] and shape (1, 4)


In [75]:
# l-layer linear_activation_forward

def l_layer_linear_activation_forward(X, parameters):
    '''
    run the full forward pass: 'relu' on every layer except the last,
    'sigmoid' on the last one

    The number of layers is taken as half the number of entries in
    parameters (one 'W' and one 'b' entry per layer).

    Args:
    X -- original input
    parameters -- a dictionary contains parameters 'W1', 'b1', ..., 'WL', 'bL'

    Return:
    A -- output of the final (sigmoid) layer
    cache -- list with one linear_activation_cache per layer, in layer
             order, for backward propagation
    '''
    n_layers = len(parameters) // 2
    cache = []
    activations = X

    # hidden layers 1 .. n_layers-1
    for layer in range(1, n_layers):
        W = parameters['W' + str(layer)]
        b = parameters['b' + str(layer)]
        activations, layer_cache = linear_activation_forward(activations, W, b, 'relu')
        cache.append(layer_cache)

    # output layer
    W_out = parameters['W' + str(n_layers)]
    b_out = parameters['b' + str(n_layers)]
    prediction, layer_cache = linear_activation_forward(activations, W_out, b_out, 'sigmoid')
    cache.append(layer_cache)

    return prediction, cache

In [77]:
# Smoke test for l_layer_linear_activation_forward on a random (2, 3) input.
# Uses whatever `parameters` dict is currently bound in the kernel
# (presumably the five-layer one from the init_l_layer_parameters test cell,
# whose first layer takes 2 input rows -- confirm when re-running).
# `output` and `cache` stay bound for the cost and backward test cells below.

output, cache = l_layer_linear_activation_forward(np.random.randn(2,3), parameters)
print(f'the network output:\n {output} and has shape {output.shape}')

the network output:
 [[0.5 0.5 0.5]] and has shape (1, 3)


<b>Cost function</b>

In [78]:
def compute_cost(Y, Y_predict):
    '''
    compute model cost (mean binary cross-entropy over the columns of Y)

    Args:
    Y -- ground-truth labels, a 2-D array with one column per example
    Y_predict -- model output, same shape as Y, values in [0, 1]

    Return:
    cost -- model cost, squeezed to a scalar
    '''
    m = Y.shape[1]

    # A prediction of exactly 0 or 1 would give log(0) = -inf, and the
    # product 0 * -inf is NaN, so a perfect prediction would report a NaN
    # cost. Clipping keeps both logs finite; values already inside
    # [eps, 1 - eps] are left untouched. float64 is forced so that 1 - eps
    # stays distinguishable from 1.
    eps = 1e-15
    Y_safe = np.clip(np.asarray(Y_predict, dtype=np.float64), eps, 1 - eps)

    cost = -np.sum(Y * np.log(Y_safe) + (1 - Y) * np.log(1 - Y_safe)) / m
    cost = np.squeeze(cost)
    return cost

In [83]:
# Smoke test for compute_cost: score the forward-pass `output` from the
# cell above against all-zero labels of shape (1, 3).

cost = compute_cost(np.zeros((1,3)), output)
print(f'model cost: {cost}')

model cost: 0.6931471805599453


<b>Back propagation</b>

In [84]:
# linear_backward

def linear_backward(dZ, linear_cache):
    '''
    backward step through the affine part of one layer

    Args:
    dZ -- gradient for Z
    linear_cache -- the tuple (A, W, b) stored during the forward step;
                    b is unpacked but not needed here

    Return:
    linear_grads -- dictionary with
                    'dW': dZ . A^T averaged over the columns of A
                    'db': row-wise sum of dZ averaged over the columns of A,
                          kept as a column vector
                    'dA_prev': W^T . dZ
    '''

    A_prev, W, _ = linear_cache
    inv_m = 1 / A_prev.shape[1]

    return {
        'dW': inv_m * np.dot(dZ, A_prev.T),
        'db': inv_m * np.sum(dZ, axis=1, keepdims=True),
        'dA_prev': np.dot(W.T, dZ),
    }

In [88]:
# Smoke test for linear_backward: feeds 1 - Z_test as the incoming gradient
# together with the `linear_cache` produced by the linear_forward test cell,
# then prints each gradient with its shape.

linear_grads = linear_backward(1-Z_test, linear_cache)
print(f'the gradients for weights:\n {linear_grads["dW"]} and has shape {linear_grads["dW"].shape}')
print(f'the gradients for bias:\n {linear_grads["db"]} and has shape {linear_grads["db"].shape}')
print(f'the gradients for previous layer output:\n {linear_grads["dA_prev"]} and has shape {linear_grads["dA_prev"].shape}')

the gradients for weights:
 [[-0.12309023 -0.16478042 -0.11099738]] and has shape (1, 3)
the gradients for bias:
 [[1.00142954]] and has shape (1, 1)
the gradients for previous layer output:
 [[-0.01110257 -0.01111148 -0.01103627 -0.01083738]
 [ 0.01154748  0.01155675  0.01147852  0.01127166]
 [ 0.00909486  0.00910216  0.00904055  0.00887762]] and has shape (3, 4)


In [89]:
# linear_activation_backward

def linear_activation_backward(dA, linear_activation_cache, activation):
    '''
    perform backward propagation through one activation + linear layer

    Args:
    dA -- gradient for A
    linear_activation_cache -- the tuple (linear_cache, activation_cache)
                               stored during the forward step
    activation -- which activation the layer used, 'sigmoid' or 'relu'

    Return:
    linear_activation_grads -- gradients, the dictionary returned by
                               linear_backward ('dW', 'db', 'dA_prev')

    Raises:
    ValueError -- if activation is neither 'sigmoid' nor 'relu'
    '''
    (linear_cache, activation_cache) = linear_activation_cache

    # first undo the activation to get dZ (helpers come from dnn_utils) ...
    if activation == 'sigmoid':
        dZ = sigmoid_backward(dA, activation_cache)
    elif activation == 'relu':
        dZ = relu_backward(dA, activation_cache)
    else:
        # previously an unknown name fell through both branches and failed
        # with an UnboundLocalError at the return statement
        raise ValueError(
            f"unsupported activation {activation!r}; expected 'sigmoid' or 'relu'"
        )

    # ... then the linear step is the same for both activations
    return linear_backward(dZ, linear_cache)

In [93]:
# Smoke test for linear_activation_backward using the caches produced by the
# linear_activation_forward test cell.
# NOTE(review): `sigmoid_test` and `relu_test` are not assigned in any cell
# visible in this notebook, so on a fresh kernel (Restart & Run All) this
# cell presumably fails with NameError. Confirm where they were defined and
# restore that definition above this cell.

sigmoid_grads = linear_activation_backward(sigmoid_test, sigmoid_linear_activation_cache, 'sigmoid')
relu_grads = linear_activation_backward(relu_test, relu_linear_activation_cache, 'relu')
print(f'sigmoid gradients:\n {sigmoid_grads}')
print(f'relu gradients:\n {relu_grads}')

sigmoid gradients:
 {'dW': array([[-0.01477733, -0.02024595, -0.01364713]]), 'db': array([[0.12470584]]), 'dA_prev': array([[-0.00139533, -0.00136982, -0.00137973, -0.00134527],
       [ 0.00145124,  0.00142471,  0.00143502,  0.00139918],
       [ 0.00114301,  0.00112211,  0.00113023,  0.001102  ]])}
relu gradients:
 {'dW': array([[0., 0., 0.]]), 'db': array([[0.]]), 'dA_prev': array([[-0., -0., -0., -0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])}


In [125]:
# l-layer linear_activation_backward

def l_layer_linear_activation_backward(Y_predict, Y, cache):
    '''
    run the full backward pass: 'sigmoid' for the last layer, 'relu' for
    every layer before it

    Args:
    Y_predict -- forward pass output
    Y -- ground-truth label, reshaped here to the shape of Y_predict
    cache -- list of per-layer caches from the forward pass, in layer order

    Return:
    grads -- A dictionary with the gradients; for each layer k it holds
             'dWk', 'dbk' and 'dA(k-1)', inserted from the last layer
             down to the first
    '''

    n_layers = len(cache)
    grads = {}

    def _record(layer, layer_grads):
        # file one layer's gradients under that layer's index
        grads['dW' + str(layer)] = layer_grads['dW']
        grads['db' + str(layer)] = layer_grads['db']
        grads['dA' + str(layer - 1)] = layer_grads['dA_prev']

    Y = Y.reshape(Y_predict.shape)

    # derivative of the cross-entropy cost with respect to the network output
    dA_out = -(np.divide(Y, Y_predict) - np.divide(1 - Y, 1 - Y_predict))
    _record(n_layers, linear_activation_backward(dA_out, cache[n_layers - 1], 'sigmoid'))

    # walk the remaining layers from n_layers-1 down to 1; each one consumes
    # the dA stored by the layer after it
    for layer in range(n_layers - 1, 0, -1):
        upstream = grads['dA' + str(layer)]
        _record(layer, linear_activation_backward(upstream, cache[layer - 1], 'relu'))

    return grads

In [150]:
# Smoke test for l_layer_linear_activation_backward: back-propagates from the
# forward-pass `output` and `cache` (bound by the l-layer forward test cell)
# against the labels [[1, 0, 1]], then prints the whole gradient dictionary.
# `grads` stays bound for the update_params test cell below.

grads = l_layer_linear_activation_backward(output, np.array([[1, 0, 1]]), cache)
print(f'grads {grads}')

grads {'dW5': array([[0., 0.]]), 'db5': array([[-0.16666667]]), 'dA4': array([[ 0.01124129, -0.01124129,  0.01124129],
       [ 0.00133881, -0.00133881,  0.00133881]]), 'dW4': array([[0., 0., 0.],
       [0., 0., 0.]]), 'db4': array([[0.],
       [0.]]), 'dA3': array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]]), 'dW3': array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]]), 'db3': array([[0.],
       [0.],
       [0.]]), 'dA2': array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]]), 'dW2': array([[0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0.]]), 'db2': array([[0.],
       [0.],
       [0.],
       [0.]]), 'dA1': array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]]), 'dW1': array([[0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.],
       [0., 0.]]), 'db1': array([[0.],
       [0.],
       [0.],
       [

<b>Update parameters</b>

In [177]:
def update_params(parameters, grads, learning_rate):
    '''
    update weights and bias with one gradient-descent step

    Args:
    parameters -- weights and bias, keys 'W1', 'b1', ..., 'WL', 'bL'
    grads -- gradients for weights and bias, keys 'dW1', 'db1', ...
    learning_rate -- learning rate alpha

    Return:
    updated_parameters -- a new dictionary with the updated parameters;
                          the dictionary passed in is left unchanged
    '''

    L = len(parameters) // 2

    # Work on a shallow copy. Binding the same dict under a second name made
    # every assignment below overwrite the caller's parameters as well. A
    # shallow copy is enough because each update builds a new array rather
    # than modifying the old one in place.
    updated_parameters = dict(parameters)
    for i in range(1, L + 1):
        updated_parameters['W' + str(i)] = parameters['W' + str(i)] - learning_rate * grads['dW' + str(i)]
        updated_parameters['b' + str(i)] = parameters['b' + str(i)] - learning_rate * grads['db' + str(i)]

    return updated_parameters

In [180]:
# Smoke test for update_params: apply one step with learning rate 0.01 using
# the `parameters` and `grads` currently bound in the kernel, then print the
# returned dictionary.

updated_parameters = update_params(parameters, grads, 0.01)
print(f'updated parameters:\n {updated_parameters}')

updated parameters:
 {'W1': array([[ 0.01788628,  0.0043651 ],
       [ 0.00096497, -0.01863493],
       [-0.00277388, -0.00354759],
       [-0.00082741, -0.00627001],
       [-0.00043818, -0.00477218]]), 'b1': array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]]), 'W2': array([[-0.01313865,  0.00884622,  0.00881318,  0.01709573,  0.00050034],
       [-0.00404677, -0.0054536 , -0.01546477,  0.00982367, -0.01101068],
       [-0.01185047, -0.0020565 ,  0.01486148,  0.00236716, -0.01023785],
       [-0.00712993,  0.00625245, -0.00160513, -0.00768836, -0.00230031]]), 'b2': array([[0.],
       [0.],
       [0.],
       [0.]]), 'W3': array([[ 0.00745056,  0.01976111, -0.01244123, -0.00626417],
       [-0.00803766, -0.02419083, -0.00923792, -0.01023876],
       [ 0.01123978, -0.00131914, -0.01623285,  0.00646675]]), 'b3': array([[0.],
       [0.],
       [0.]]), 'W4': array([[-0.00356271, -0.01743141, -0.0059665 ],
       [-0.00588594, -0.00873882,  0.00029714]]), 'b4': array([[0.