## Gradient Descent from Scratch for L-layer neural network

In [75]:
import numpy as np


In [76]:
def initialize_deep_nn(layers):
    """
    Create the starting weights and biases of a fully connected network.

    Inputs
    layers : list with the number of units in each layer, input layer first

    Outputs
    params : dictionary with 'W_1', 'b_1', 'W_2', 'b_2', ..., 'W_L', 'b_L'
             (L = len(layers) - 1) where
             W_l -- weight matrix of the lth layer (shape: layers[l], layers[l-1])
             b_l -- bias of the lth layer (shape: layers[l], 1)
    """
    params = {}
    # Pair every layer size with the size of the layer feeding into it.
    size_pairs = zip(layers[:-1], layers[1:])
    for idx, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        suffix = str(idx)
        # Weights: zero-mean normal draws scaled down to 0.01 so the
        # network starts out with small weights.
        params['W_' + suffix] = np.random.randn(n_curr, n_prev) * 0.01
        # Biases can simply start at zero.
        params['b_' + suffix] = np.zeros((n_curr, 1))
    return params

In [77]:
def activation_output(Z, activation_type):
    """
    Apply an element-wise activation function to the pre-activations Z.

    Inputs
    Z : numpy array of pre-activation values (any shape)
    activation_type : one of 'relu', 'sigmoid' or 'tanh'

    Outputs
    A : numpy array with the same shape as Z holding the activations

    Raises
    ValueError : if activation_type is not one of the supported names
    """
    if activation_type == 'relu':
        return np.maximum(0, Z)
    if activation_type == 'sigmoid':
        return 1 / (1 + np.exp(-Z))
    if activation_type == 'tanh':
        # np.tanh stays finite for large |Z|; the explicit
        # (e^Z - e^-Z) / (e^Z + e^-Z) form overflows to inf/inf = nan.
        return np.tanh(Z)
    # Fail loudly instead of hitting an unbound local variable.
    raise ValueError(
        "Unknown activation_type %r; expected 'relu', 'sigmoid' or 'tanh'"
        % (activation_type,)
    )

In [78]:
def forward_prop_L_layers(X, params):
    """
    Run the forward pass of an L-layer network: (LINEAR -> RELU) for the
    first L-1 layers followed by LINEAR -> SIGMOID for the last layer.

    Inputs
    X : data, numpy array (shape: input size, number of examples)
    params : dictionary of weights 'W_k' and biases 'b_k', k=1..L, as
             produced by initialize_deep_nn

    Outputs
    A_L : activation output of the final (sigmoid) layer
    caches : list of L tuples; caches[k-1] = (A_{k-1}, W_k, b_k, Z_k) for
             layer k=1..L, where A_0 is X
    """
    # Every layer contributes one W and one b entry to params.
    n_layers = len(params) // 2
    caches = []
    activation = X

    # Hidden layers: linear step followed by ReLU
    # ('sigmoid' or 'tanh' could be used here instead).
    for layer in range(1, n_layers):
        suffix = str(layer)
        weights = params["W_" + suffix]
        bias = params["b_" + suffix]
        pre_activation = np.dot(weights, activation) + bias
        # Cache the layer input together with its parameters and Z.
        caches.append((activation, weights, bias, pre_activation))
        # From here on `activation` is the output of this layer.
        activation = activation_output(pre_activation, 'relu')

    # Output layer: a single sigmoid unit for binary classification.
    out_suffix = str(n_layers)
    out_weights = params["W_" + out_suffix]
    out_bias = params["b_" + out_suffix]
    out_pre_activation = np.dot(out_weights, activation) + out_bias
    caches.append((activation, out_weights, out_bias, out_pre_activation))

    return activation_output(out_pre_activation, 'sigmoid'), caches

In [79]:
def binary_logistic_cost(A_L, Y):
    """
    Compute the binary cross-entropy cost averaged over all examples.

    Inputs
    A_L : predicted probabilities that each example is of class 1
          (shape: 1, number of examples)
    Y : vector of ground truth labels of 1's or 0's
        (shape: 1, number of examples)

    Outputs
    cost : cross-entropy cost, returned through np.squeeze
    """
    m = Y.shape[1]
    # Keep probabilities strictly inside (0, 1) so that log() never sees 0,
    # which would otherwise yield -inf / nan for saturated predictions.
    eps = 1e-15
    probs = np.clip(A_L, eps, 1 - eps)
    cost = -1 / m * np.sum(Y * np.log(probs) + (1 - Y) * np.log(1 - probs))
    return np.squeeze(cost)

In [80]:

def backward_prop_L_layers(A_L, Y, caches):
    """
    Backward pass for a network whose hidden layers use ReLU, whose output
    layer is a sigmoid unit and whose cost is binary cross-entropy.

    Inputs
    A_L : output-layer probabilities that each example is of class 1
          (shape: 1, number of examples)
    Y : vector of ground truth labels of 1's or 0's
        (reshaped to the shape of A_L)
    caches : list of L tuples; caches[k-1] = (A_{k-1}, W_k, b_k, Z_k) for
             layer k=1..L, where A_0 is the input data

    Outputs:
    gradients : dictionary of gradients to be used in updates:
                gradients['dW_'+str(k)]
                gradients['db_'+str(k)]     k=1,...,L

    Raises
    ValueError : if caches is empty
    """
    L = len(caches)    # number of layers in the model
    if L == 0:
        raise ValueError("caches must contain at least one layer")

    gradients = {}     # dictionary to hold the gradients
    m = A_L.shape[1]   # number of samples
    Y = Y.reshape(A_L.shape)  # make sure Y has the same shape as A_L

    # Output layer. For a sigmoid output with cross-entropy cost,
    #   dA_L = -(Y / A_L - (1 - Y) / (1 - A_L))   and   dZ_L = dA_L * A_L * (1 - A_L),
    # which simplifies to A_L - Y. The simplified form avoids the division
    # by zero (and the resulting inf * 0 = nan) when A_L saturates at 0 or 1.
    A_prev, W_L, _, _ = caches[-1]
    dZ_L = A_L - Y
    gradients["dW_" + str(L)] = 1 / m * np.dot(dZ_L, A_prev.T)
    gradients["db_" + str(L)] = 1 / m * np.sum(dZ_L, axis=1, keepdims=True)
    # Gradient with respect to the activations of layer L-1.
    dA = np.dot(W_L.T, dZ_L)

    # Hidden layers, walked from layer L-1 down to layer 1.
    for k in range(L - 1, 0, -1):
        # A of the (k-1)th layer, W of the kth layer and Z of the kth layer.
        A_prev, W, _, Z = caches[k - 1]

        # ReLU derivative: 0 where Z < 0 and 1 elsewhere.
        relu_grad = np.where(Z < 0, 0.0, 1.0)
        dZ = dA * relu_grad

        gradients["dW_" + str(k)] = 1 / m * np.dot(dZ, A_prev.T)
        gradients["db_" + str(k)] = 1 / m * np.sum(dZ, axis=1, keepdims=True)
        # dA now refers to the (k-1)th layer instead of the kth layer.
        dA = np.dot(W.T, dZ)

    return gradients
        
        

In [81]:
def update_params(params, lr, gradients):
    """
    Take one gradient-descent step on every weight and bias.

    Inputs:
    params: dictionary holding weights and biases under params['W_'+str(k)]
            and params['b_'+str(k)], k=1,2...L; updated in place
    lr: learning rate, a positive (small) scalar
    gradients: dictionary holding the matching gradients under
               gradients['dW_'+str(k)] and gradients['db_'+str(k)]

    Outputs:
    params: the same dictionary, now holding the updated weights and biases
    """
    # Each layer owns exactly one W entry and one b entry.
    n_layers = len(params) // 2
    for layer in range(1, n_layers + 1):
        suffix = str(layer)
        for prefix in ("W_", "b_"):
            # In-place step against the gradient of this parameter.
            params[prefix + suffix] -= lr * gradients["d" + prefix + suffix]
    return params
    