In [1]:
import numpy as np
from my_ANN import initialize_params, forward, compute_cost, backward

### Convert all Parameters into a Vector

In [2]:
# Create the example parameter dict used throughout this section.
# The displayed result holds W1 (3x2), B1 (3x1), W2 (1x3) and B2 (1x1);
# presumably [2,3,1] lists the layer sizes -- confirm against my_ANN.initialize_params.
params = initialize_params([2,3,1])
params

{'W1': array([[-0.83518171, -0.60852509],
        [ 1.11130172, -0.50993656],
        [-0.34327987,  0.39759152]]),
 'B1': array([[0.],
        [0.],
        [0.]]),
 'W2': array([[ 0.16970138, -0.39428035, -0.88987278]]),
 'B2': array([[0.]])}

In [3]:
# Order the keys by layer number (ascending); within a layer 'W' comes before 'B'.
# The key negates the layer number and reverse=True flips the whole ordering back,
# which also puts 'W' ahead of 'B'. int(x[1:]) reads the complete layer number, so
# 'W10' sorts after 'W9' instead of tying with 'W1'.
sorted_keys = sorted(params.keys(), key=lambda x: (-int(x[1:]), x[0]), reverse=True)
sorted_keys

['W1', 'B1', 'W2', 'B2']

In [4]:
# Keep only the entries whose key does not start with 'B' (here: W1 and W2).
params_W = {key: value for key, value in params.items() if not key.startswith('B')}
params_W

{'W1': array([[-0.83518171, -0.60852509],
        [ 1.11130172, -0.50993656],
        [-0.34327987,  0.39759152]]),
 'W2': array([[ 0.16970138, -0.39428035, -0.88987278]])}

In [5]:
def params_to_vector(params):
    """Flatten a parameter dict into one 1-D vector and remember each shape.

    Keys are expected to look like '<letter><layer number>' (e.g. 'W1', 'B12').
    Entries are emitted by ascending layer number; within a layer the key with
    the alphabetically later prefix comes first, so 'W' precedes 'B':
    W1, B1, W2, B2, ...

    Args:
        params: dict mapping parameter names to NumPy arrays.

    Returns:
        (param_vector, param_shape_dict):
            param_vector     -- 1-D array with every parameter flattened and
                                concatenated in the order described above.
            param_shape_dict -- dict (same key order) mapping each name to the
                                original array shape, so the vector can be
                                unpacked again.
    """
    # int(name[1:]) parses the whole layer number; reading a single character
    # would make 'W10' tie with 'W1' and scramble networks with >= 10 layers.
    sorted_keys = sorted(params, key=lambda name: (int(name[1:]), -ord(name[0])))

    param_shape_dict = {key: params[key].shape for key in sorted_keys}
    param_vector = np.concatenate(
        [params[key].ravel() for key in sorted_keys], axis=0
    )

    return param_vector, param_shape_dict

In [6]:
# Flatten the example parameters; the shape dict is kept so the vector can be unpacked again.
param_vector, param_shape_dict = params_to_vector(params)
param_vector, param_shape_dict

(array([-0.83518171, -0.60852509,  1.11130172, -0.50993656, -0.34327987,
         0.39759152,  0.        ,  0.        ,  0.        ,  0.16970138,
        -0.39428035, -0.88987278,  0.        ]),
 {'W1': (3, 2), 'B1': (3, 1), 'W2': (1, 3), 'B2': (1, 1)})

In [7]:
def grads_to_vector(grads):
    """Flatten a gradient dict into one 1-D vector.

    Keys are expected to look like 'd<letter><layer number>' (e.g. 'dW1',
    'dB12'). Entries whose key starts with 'dA' are ignored. The remaining
    entries are emitted by ascending layer number with 'dW' before 'dB':
    dW1, dB1, dW2, dB2, ...

    Args:
        grads: dict mapping gradient names to NumPy arrays. It is not modified.

    Returns:
        1-D array with the kept gradients flattened and concatenated.
    """
    # Drop keys that start with 'dA'
    kept_keys = [key for key in grads if not key.startswith('dA')]

    # int(name[2:]) parses the whole layer number; reading a single character
    # would make 'dW10' tie with 'dW1' and scramble networks with >= 10 layers.
    sorted_keys = sorted(kept_keys, key=lambda name: (int(name[2:]), -ord(name[1])))

    return np.concatenate([grads[key].ravel() for key in sorted_keys], axis=0)

In [8]:
# Keys are inserted out of order (layer 2 first); grads_to_vector returns the values
# in dW1, dB1, dW2, dB2 order regardless.
# Shapes mirror the example parameters shown earlier:
# dW1 ~ W1 (3, 2), dB1 ~ B1 (3, 1), dW2 ~ W2 (1, 3), dB2 ~ B2 (1, 1).
grads = {
    'dW2': np.random.randn(1, 3),
    'dB2': np.random.randn(1, 1),
    'dW1': np.random.randn(3, 2),
    'dB1': np.random.randn(3, 1),
}

grads_to_vector(grads)

array([ 0.94752275,  1.59906321,  0.53015376,  1.12324794, -0.61308952,
        0.42788732,  0.05212445,  0.84834142, -0.63920017, -0.73756782,
       -1.19341761, -1.58130931, -0.046875  ])

### Inverse Function

In [9]:
# Number of elements in W1 = product of its shape dimensions (3 * 2).
np.prod(param_shape_dict['W1'])

6

In [10]:
def vector_to_dict(vector, shape_dict):
    """Unpack a flat vector back into a dict of shaped arrays.

    The vector is consumed front to back in the iteration order of
    `shape_dict`: each key receives the next `prod(shape)` elements, reshaped
    to `shape`.

    Args:
        vector: 1-D NumPy array holding all values back to back.
        shape_dict: dict mapping each name to the shape of its array. Its
            iteration order must match the order used to build `vector`.

    Returns:
        dict with the same keys and order as `shape_dict`. Each value is a
        reshaped slice of `vector` (NumPy slices are typically views, so the
        arrays may share memory with `vector`).

    Raises:
        ValueError: if `vector` does not contain exactly as many elements as
            the shapes require.
    """
    # int(...) matters for the scalar shape (): np.prod(()) is the float 1.0,
    # and a float cannot be used as a slice bound.
    sizes = {key: int(np.prod(shape)) for key, shape in shape_dict.items()}

    expected_size = sum(sizes.values())
    actual_size = np.size(vector)
    if actual_size != expected_size:
        # A longer vector used to be truncated silently, hiding mismatches.
        raise ValueError(
            f"vector has {actual_size} elements but shape_dict describes {expected_size}"
        )

    restored = {}  # not named `dict`, which would shadow the builtin
    start_idx = 0
    for key, shape in shape_dict.items():
        end_idx = start_idx + sizes[key]
        restored[key] = vector[start_idx:end_idx].reshape(shape)
        start_idx = end_idx

    return restored

In [11]:
# Round trip: unpacking the flattened vector should reproduce the original `params` dict.
vector_to_dict(param_vector, param_shape_dict)

{'W1': array([[-0.83518171, -0.60852509],
        [ 1.11130172, -0.50993656],
        [-0.34327987,  0.39759152]]),
 'B1': array([[0.],
        [0.],
        [0.]]),
 'W2': array([[ 0.16970138, -0.39428035, -0.88987278]]),
 'B2': array([[0.]])}

# Gradient Checking

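$$ \frac{\partial J}{\partial \theta} = \lim_{\varepsilon \to 0} \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2 \varepsilon} \qquad\Rightarrow\qquad gradapprox = \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2 \varepsilon} \ \text{ for a small } \varepsilon$$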

$$ difference = \frac {\| grad - gradapprox \|_2}{\| grad \|_2 + \| gradapprox \|_2 }$$

**Note**: Use `np.linalg.norm` to get the norms

In [12]:
def check_gradient(X, Y, params, grads, epsilon=1e-7, print_process=False, threshold=2e-7):
    """Compare backpropagated gradients with a centered finite-difference estimate.

    Each parameter is perturbed by +epsilon and by -epsilon in turn while all
    other parameters stay fixed. The cost is re-evaluated with `forward` and
    `compute_cost` for both perturbations, and the slope
    (J_plus - J_minus) / (2 * epsilon) is stored as the approximate gradient.
    The result is compared with `grads` through the relative difference
    ||grad - approx||_2 / (||grad||_2 + ||approx||_2).

    This costs two forward passes per parameter.

    Args:
        X: network inputs, passed unchanged to `forward`.
        Y: targets, passed unchanged to `compute_cost`.
        params: parameter dict accepted by `params_to_vector` ('W1', 'B1', ...).
        grads: gradient dict accepted by `grads_to_vector` ('dW1', 'dB1', ...).
        epsilon: size of the perturbation applied to each parameter.
        print_process: if true, print the intermediate values of every 10th
            parameter.
        threshold: largest relative difference that is still reported as a
            correct backward pass.

    Returns:
        (diff, grad_approx_vector): the relative difference and the 1-D array
        of finite-difference gradients, ordered like the flattened parameters.

    Raises:
        ValueError: if the flattened gradients and parameters differ in size.
    """
    param_vector, param_shape_dict = params_to_vector(params)  # shape (-1, )
    grad_vector = grads_to_vector(grads)  # shape (-1, )

    # Fail before the expensive loop instead of at the final subtraction.
    if grad_vector.shape != param_vector.shape:
        raise ValueError(
            f"got {grad_vector.shape[0]} gradient values for "
            f"{param_vector.shape[0]} parameters"
        )

    # In an integer vector `+= epsilon` is truncated back to the old value,
    # which would make every approximate gradient exactly zero.
    if not np.issubdtype(param_vector.dtype, np.inexact):
        param_vector = param_vector.astype(np.float64)

    n_params = param_vector.shape[0]
    J_plus = np.zeros((n_params, ))
    J_minus = np.zeros((n_params, ))
    grad_approx_vector = np.zeros((n_params, ))

    for i in range(n_params):  # change only 1 param and keep the others intact to measure its effect on the cost
        # Perturb the i-th parameter positively
        plus_param_vector = np.copy(param_vector)
        plus_param_vector[i] += epsilon

        Y_pred1, _ = forward(X, vector_to_dict(plus_param_vector, param_shape_dict))
        J_plus[i] = compute_cost(Y_pred1, Y)

        # Perturb the i-th parameter negatively
        minus_param_vector = np.copy(param_vector)
        minus_param_vector[i] -= epsilon

        Y_pred2, _ = forward(X, vector_to_dict(minus_param_vector, param_shape_dict))
        J_minus[i] = compute_cost(Y_pred2, Y)

        # Compute the approximate gradient (centered difference)
        grad_approx_vector[i] = (J_plus[i] - J_minus[i]) / (2*epsilon)

        if print_process and i % 10 == 0:
            print(f"Parameter {i}:")
            print(f"plus_param_vector: {plus_param_vector[i]}")
            print(f"J_plus: {J_plus[i]}")
            print(f"minus_param_vector: {minus_param_vector[i]}")
            print(f"J_minus: {J_minus[i]}")
            print(f"grad_approx_vector: {grad_approx_vector[i]}")
            print('-'*100)

    # Compare gradients
    numerator = np.linalg.norm(grad_vector - grad_approx_vector)
    denominator = np.linalg.norm(grad_approx_vector) + np.linalg.norm(grad_vector)
    # Both gradients are identically zero: they agree exactly, and 0/0 would be NaN.
    diff = 0.0 if denominator == 0 else numerator / denominator

    # Testing `diff <= threshold` (not `diff > threshold`) sends a NaN difference
    # to the failure branch instead of reporting success.
    if diff <= threshold:
        print ("Your backward propagation works perfectly fine! Difference = " + str(diff))
    else:
        print ("There is a mistake in the backward propagation! Difference = " + str(diff))

    return diff, grad_approx_vector

In [13]:
def test_case():
    """Build a small, reproducible 3-layer problem for the gradient check.

    The global NumPy random generator is re-seeded with 1, so every call
    returns the same values.

    Returns:
        (x, y, parameters):
            x          -- random inputs of shape (4, 3)
            y          -- labels [[1, 1, 0]] of shape (1, 3)
            parameters -- dict with W1 (5, 4), B1 (5, 1), W2 (3, 5), B2 (3, 1),
                          W3 (1, 3), B3 (1, 1), all drawn from randn
    """
    np.random.seed(1)

    x = np.random.randn(4, 3)
    y = np.array([[1, 1, 0]])

    # Unit counts per layer, input first. Draw order (W then B, layer by layer)
    # determines the generated values, so it must stay as is.
    layer_dims = [4, 5, 3, 1]
    parameters = {}
    for layer in range(1, len(layer_dims)):
        n_out, n_in = layer_dims[layer], layer_dims[layer - 1]
        parameters[f"W{layer}"] = np.random.randn(n_out, n_in)
        parameters[f"B{layer}"] = np.random.randn(n_out, 1)

    return x, y, parameters

In [14]:
# Build the reproducible 3-layer test problem (inputs, labels, parameters).
X, Y, parameters = test_case()

# forward pass: predictions plus the caches that backward() consumes
Y_pred, caches = forward(X, parameters)
# back-propagation: the analytic gradients that will be verified
gradients = backward(Y, Y_pred, caches)

# Display the gradients flattened in dW1, dB1, dW2, dB2, ... order.
grads_to_vector(gradients)

array([-0.37347779, -1.47903216,  0.17596143, -1.33685036, -0.01967514,
       -0.08573553,  0.01188465, -0.07674312,  0.03916037, -0.05539735,
        0.04872715, -0.09359393, -0.05337778, -0.21138458,  0.02514856,
       -0.19106384,  0.        ,  0.        ,  0.        ,  0.        ,
        0.63290787,  0.0372514 , -0.06401301,  0.09045575,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.91580165,  0.02451548, -0.10797954,  0.90281891,  0.        ,
        0.        ,  0.        ,  0.19763343,  0.        ,  0.        ,
        2.24404238,  0.21225753])

In [15]:
# Run the check with the default epsilon; the cell displays the returned
# (difference, approximate-gradient vector) tuple below the printed verdict.
check_gradient(X, Y, parameters, gradients)

Your backward propagation works perfectly fine! Difference = 1.1890417878730741e-07


(1.1890417878730741e-07,
 array([-0.37347774, -1.47903191,  0.17596133, -1.33685053, -0.01967497,
        -0.08573522,  0.01188466, -0.07674277,  0.03916037, -0.05539735,
         0.04872715, -0.09359393, -0.05337767, -0.21138479,  0.02514863,
        -0.19106371,  0.        ,  0.        ,  0.        ,  0.        ,
         0.63290795,  0.03725111, -0.06401301,  0.09045546,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.9158017 ,  0.02451541, -0.10797954,  0.90281879,  0.        ,
         0.        ,  0.        ,  0.19763344,  0.        ,  0.        ,
         2.24404227,  0.21225742]))

In [16]:
# Same check, printing the intermediate values of every 10th parameter;
# the return values are discarded so only the printed trace is shown.
_, _ = check_gradient(X, Y, parameters, gradients, print_process=True)

Parameter 0:
plus_param_vector: -0.3224171040135075
J_plus: 2.4078333524866014
minus_param_vector: -0.3224173040135075
J_minus: 2.4078334271821493
grad_approx_vector: -0.37347773984564014
----------------------------------------------------------------------------------------------------
Parameter 10:
plus_param_vector: 0.9015908205927955
J_plus: 2.4078333947070716
minus_param_vector: 0.9015906205927956
J_minus: 2.4078333849616422
grad_approx_vector: 0.048727146761962103
----------------------------------------------------------------------------------------------------
Parameter 20:
plus_param_vector: -0.6871726001195995
J_plus: 2.4078334531251744
minus_param_vector: -0.6871728001195994
J_minus: 2.407833326543585
grad_approx_vector: 0.6329079460520859
----------------------------------------------------------------------------------------------------
Parameter 30:
plus_param_vector: -0.7471581937508377
J_plus: 2.407833389834357
minus_param_vector: -0.7471583937508376
J_minus: 2.407833

**Notes** 
- Gradient Checking is slow! Approximating the gradient with $\frac{\partial J}{\partial \theta} \approx  \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2 \varepsilon}$ is computationally costly, because it needs two forward passes for every single parameter. For this reason, we don't run gradient checking at every iteration during training; we run it only a few times to check that the gradient is correct. 
- Gradient Checking, at least as we've presented it, doesn't work with dropout. You would usually run the gradient check algorithm without dropout to make sure your backprop is correct, then add dropout. 

Congrats! Now you can be confident that your deep learning model is working correctly! You can even use this to convince your CEO. :) 
<br>
<font color='blue'>
    
**What you should remember from this notebook**:
- Gradient checking verifies closeness between the gradients from backpropagation and the numerical approximation of the gradient (computed using forward propagation).
- Gradient checking is slow, so you don't want to run it in every iteration of training. You would usually run it only to make sure your code is correct, then turn it off and use backprop for the actual learning process. 