# Building your Deep Neural Network: Step by Step

**Notation**:
- Superscript $[l]$ denotes a quantity associated with the $l^{th}$ layer. 
    - Example: $a^{[L]}$ is the $L^{th}$ layer activation. $W^{[L]}$ and $b^{[L]}$ are the $L^{th}$ layer parameters.
- Superscript $(i)$ denotes a quantity associated with the $i^{th}$ example. 
    - Example: $x^{(i)}$ is the $i^{th}$ training example.
- Lowerscript $i$ denotes the $i^{th}$ entry of a vector.
    - Example: $a^{[l]}_i$ denotes the $i^{th}$ entry of the $l^{th}$ layer's activations).
    
**initialize_parameters_deep(layer_dims) --> L_layer_forward(X, parameters)  -->  Compute-cost  -->L_layer_backward(A_last, Y, caches) --> Update-params**

linear_forward(A_prev, W, b) -->  activation(A_prev, W, b, activation) -->  **L_layer_forward(X, parameters)**

linear_backward(E, cache) --> linear_activation_backward(dA, cache, activation) --> **L_layer_backward(A_last, Y, caches)**

```python
def L_layer_model(X, Y, layers_dims, learning_rate=.0075, num_iterations=3000, print_every=500):
    """
    Implement L layers neural network
    
    Arguments:
    X -- input data (n_x, m)
    Y -- ground truth 0 for non-cat and 1 for cat (n_y, m)
    layers_dims -- dimensions of layers (n_x, n_h, n_y)
    num_iterations -- number of iterations of optimization loops
    print_every -- number of iterations the cost will be printed
    
    Returns:
    parameters -- dictionary contains all weights and biases
    """
    np.random.seed(2)
    costs = []
    m = X.shape[1]
    
    # Initialize the parameters
    parameters = initialize_parameters_deep(layers_dims)
    
    # Loop (gradient descent)
    for i in range(num_iterations):
        # Forward propagation: [Linear -> relu] * (L-1) -> Linear -> sigmoid
        A_last, caches = L_model_forward(X, parameters)
        
        #Compute cost
        cost = compute_cost(A_last, Y)
        
        # Backward Propagation
        grads = L_model_backward(A_last, Y, caches)
        
        # Update parameters
        parameters = update_parameters(parameters, grads, learning_rate)
        
        if print_every != None and (i+1)%print_every==0:
            costs.append(cost)
            print('Cost after', i+1, 'iterations is', cost)
            
    plt.plot(costs)
    plt.xlabel('iterations(per hundreds)')
    plt.ylabel('cost')
    plt.title('Learning rate = ' + str(learning_rate))
    
    return parameters
```

# 1. Packages

In [1]:
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

import numpy as np
import matplotlib.pyplot as plt
import h5py
from testCases_v4 import *
from dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward

%matplotlib inline
plt.rcParams['figure.figsize']=(5, 4)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

# 2. Initialization for L-layer

In [2]:
def initialize_parameters_deep(layer_dims):
    """
    Arguments:
    layer_dims -- list contains the dimensions of each layer in the network
    
    Returns:
    parameters -- dictionary contains all weights and biases:
                    W_l -- shape(layer_dims[l], layer_dims[l-1])
                    b_l -- shape(layer_dims[l], 1)
    """
    np.random.seed(3)
    parameters = {}
    L = len(layer_dims)
    
    for i in range(1, L):
        parameters['W'+str(i)] = np.random.randn(layer_dims[i], layer_dims[i-1])*0.01
        parameters['b'+str(i)] = np.zeros((layer_dims[i], 1), dtype=np.float32)
        
    return parameters

In [3]:
parameters = initialize_parameters_deep([5,4,3])
print("W1 = " + str(parameters["W1"]))
print("b1 = " + str(parameters["b1"]))
print("W2 = " + str(parameters["W2"]))
print("b2 = " + str(parameters["b2"]))

W1 = [[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1 = [[0.]
 [0.]
 [0.]
 [0.]]
W2 = [[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2 = [[0.]
 [0.]
 [0.]]


# 3. Forward propagation
## 3.1 Linear Forward

In [4]:
def linear_forward(A_prev, W, b):
    """
    Implement linear part in forward propagation process
    Arguments:
    A_prev -- activations of previous layer, shape(previous layer dimension, m)
    W -- weight, shape(current layer dimension, previous layer dimension)
    b -- bias, shape(current layer dimension, 1)
    
    Returns:
    Z -- input of activation
    cache -- tuple contains A_prev, W, b
    """
    Z = np.dot(W, A_prev) + b
    cache = (A_prev, W, b)
    
    return Z, cache

In [5]:
A, W, b = linear_forward_test_case()

Z, linear_cache = linear_forward(A, W, b)
print("Z = " + str(Z))

Z = [[ 3.26295337 -1.23429987]]


## 3.2 Activation

In [6]:
def activation(A_prev, W, b, activation):
    """
    Implement the sigmoid or relu
    
    Arguments:
    A_prev -- activations of previous layer, shape(previous layer dimension, m)
    W -- weight, shape(current layer dimension, previous layer dimension)
    b -- bias, shape(current layer dimension, 1)
    activation: store as string ("sigmoid" or "relu")
    
    Returns:
    A -- activation value
    cache -- tuple contains linear_cache which is output of linear_forward and Z implemented by pre-defined function "sigmoid" and "relu"
    """
    Z, linear_cache = linear_forward(A_prev, W, b)
    if activation == 'sigmoid':
        A, Z = sigmoid(Z) # quite absurd but sigmoid is pre-defined function
    elif activation == 'relu':
        A, Z = relu(Z)
        
    cache = (linear_cache, Z)
    return A, cache

In [7]:
A_prev, W, b = linear_activation_forward_test_case()

A, linear_activation_cache = activation(A_prev, W, b, activation = "sigmoid")
print("With sigmoid: A = " + str(A))

A, linear_activation_cache = activation(A_prev, W, b, activation = "relu")
print("With ReLU: A = " + str(A))

With sigmoid: A = [[0.96890023 0.11013289]]
With ReLU: A = [[3.43896131 0.        ]]


# 3.3. L-layer forward

In [8]:
def L_layer_forward(X, parameters):
    """
    Implement forward propagation at first L-1 layers and last L layer
    
    Arguments:
    X -- input data (input feafures size, m)
    parameters -- ouput of initialize_parameters_deep
    
    Returns:
    A_last -- last activation values in deep neural networks
    caches -- list contains cache which is output of activation function
    """
    caches = []
    A = X
    L = len(parameters)//2
    
    # relu activation for first L-1 layers
    for l in range(1, L):
        A_prev = A
        A, cache = activation(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation='relu')
        caches.append(cache)
        
    # sigmoid for last layer
    A_last, cache = activation(A, parameters['W'+str(L)], parameters['b'+str(L)], activation='sigmoid')
    caches.append(cache)
    
    return A_last, caches

In [9]:
X, parameters = L_model_forward_test_case()
A_last, caches = L_layer_forward(X, parameters)
print("AL = " + str(A_last))
print("Length of caches list = " + str(len(caches)))

AL = [[0.17007265 0.2524272 ]]
Length of caches list = 2


# 4. Cost function

In [10]:
def compute_cost(A_last, Y):
    """
    Compute the negative log-likelihood cost
    Arguments:
    A_last -- the output of forward propagation function
    Y -- the ground truth
    Returns:
    Return the cost value
    """
    return -np.mean(Y*np.log(A_last) + (1-Y)*np.log(1-A_last))

In [11]:
Y, A_last = compute_cost_test_case()

print("cost = " + str(compute_cost(A_last, Y)))

cost = 0.414931599615397


# 5. Backward propagation
## 5.1. Linear backward

In [30]:
def linear_backward(E, cache):
    """
    Implement the linear part in backward propagation
    
    Arguments:
    E -- E part in layer l
    cache -- contains A_prev, W, b
    
    Returns:
    dA_prev -- weight at next layer * E at next layer, prepare for previous layer
    dW -- gradient of weight, same shape with W
    db -- gradient of bias, same shape with b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]
    
    dW = 1/m * np.dot(E, A_prev.T)
    db = 1/m * np.sum(E, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, E)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    
    return dA_prev, dW, db

In [31]:
E, linear_cache = linear_backward_test_case()

dA_prev, dW, db = linear_backward(E, linear_cache)
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

dA_prev = [[ 0.51822968 -0.19517421]
 [-0.40506361  0.15255393]
 [ 2.37496825 -0.89445391]]
dW = [[-0.10076895  1.40685096  1.64992505]]
db = [[0.50629448]]


## 5.2. Linear-Activation backward

In [32]:
def linear_activation_backward(dA, cache, activation):
    """
    Calculate E in linear_backward function
    
    Arguments:
    dA -- dA_prev in linear_backward function but when calculating dA_prev in prev layer, it becomes dA
    cache -- output in activation function
    activation -- string "sigmoid" or "relu"
    
    Returns:
    Same output with linear_backward function
    """
    linear_cache, Z = cache
    
    if activation == 'relu':
        E = relu_backward(dA, Z)
        
    elif activation == 'sigmoid':
        E = sigmoid_backward(dA, Z)
        
    dA_prev, dW, db = linear_backward(E, linear_cache)
    return dA_prev, dW, db

In [33]:
A_last, linear_activation_cache = linear_activation_backward_test_case()

dA_prev, dW, db = linear_activation_backward(A_last, linear_activation_cache, activation = "sigmoid")
print ("sigmoid:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db) + "\n")

dA_prev, dW, db = linear_activation_backward(A_last, linear_activation_cache, activation = "relu")
print ("relu:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

sigmoid:
dA_prev = [[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW = [[ 0.10266786  0.09778551 -0.01968084]]
db = [[-0.05729622]]

relu:
dA_prev = [[ 0.44090989 -0.        ]
 [ 0.37883606 -0.        ]
 [-0.2298228   0.        ]]
dW = [[ 0.44513824  0.37371418 -0.10478989]]
db = [[-0.20837892]]


## 5.3. L-layer backward

In [36]:
def L_layer_backward(A_last, Y, caches):
    """
    Implement forward propagation at first L-1 layers and last L layer
    
    Arguments:
    A_last -- last activation vector (1, m)
    Y -- ground truth (1, m)
    caches -- output of L_layer_forward
    
    Returns:
    grads -- dictinary contains dA, dW and db for all layer
    """
    grads = {}
    m = A_last.shape[1]
    L = len(caches)
    Y = Y.reshape(A_last.shape)
    
    dA_last = - (np.divide(Y, A_last) - np.divide(1-Y, 1-A_last))
    current_cache = caches[-1]
    
    grads['dA'+str(L-1)], grads['dW'+str(L)], grads['db'+str(L)] = linear_activation_backward(dA_last, current_cache, activation='sigmoid')
    
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads['dA'+str(l)], grads['dW'+str(l+1)], grads['db'+str(l+1)] = linear_activation_backward(grads['dA'+str(l+1)], current_cache, activation='relu')
        
    return grads

In [37]:
Y_assess, AL, caches = L_model_backward_test_case()
grads = L_layer_backward(AL, Y_assess, caches)
print ("dW1 = "+ str(grads["dW1"]))
print ("db1 = "+ str(grads["db1"]))
print ("dA1 = "+ str(grads["dA1"]))

dW1 = [[-inf -inf -inf -inf]
 [  0.   0.   0.   0.]
 [-inf -inf -inf -inf]]
db1 = [[inf]
 [ 0.]
 [inf]]
dA1 = [[ inf  inf]
 [-inf -inf]
 [ inf  inf]]




## 5.4. Update parameter

In [42]:
def update_parameter(parameters, grads, learning_rate):
    """
    Update parameters uisng gradient descent
    
    Arguments:
    parameters -- dictionary contains all the parameters
    grads -- output of L_layer_backward
    
    Returns:
    parameters -- dictionary contains updated parameters
    """
    L = len(parameters)//2
    for l in range(1, L+1):
        parameters['W'+str(l)] -= learning_rate*grads['dW'+str(l)]
        parameters['b'+str(l)] -= learning_rate*grads['db'+str(l)]
        
    return parameters

In [43]:
def predict(X, y, parameters):
    """
    This function is used to predict the results of a  L-layer neural network.
    
    Arguments:
    X -- data set of examples you would like to label
    parameters -- parameters of the trained model
    
    Returns:
    p -- predictions for the given dataset X
    """
    
    m = X.shape[1]
    n = len(parameters) // 2 # number of layers in the neural network
    p = np.zeros((1,m))
    
    # Forward propagation
    probas, caches = L_model_forward(X, parameters)

    
    # convert probas to 0/1 predictions
    for i in range(0, probas.shape[1]):
        if probas[0,i] > 0.5:
            p[0,i] = 1
        else:
            p[0,i] = 0
    
    #print results
    #print ("predictions: " + str(p))
    #print ("true labels: " + str(y))
    print("Accuracy: "  + str(np.sum((p == y)/m)))
        
    return p