In [1]:
#Load packages
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases_v4 import *
from dnn_utils_v2 import sigmoid, sigmoid_backward,relu, relu_backward

%matplotlib inline
# Notebook-wide matplotlib defaults: figure size, and images drawn without
# interpolation using a grayscale colormap.
plt.rcParams.update({
    'figure.figsize': (5.0, 4.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

%load_ext autoreload
%autoreload 2

# Seed NumPy's global random number generator so random draws are repeatable.
np.random.seed(1)

In [2]:
#2 layer neural network

#Create and initialize parameters

#Model structure is : LINEAR->RELU->LINEAR->SIGMOID
#Use random initialization for the weight matrices: np.random.randn(shape)*0.01
#Use zero initialization for the biases, np.zeros(shape)

def initialize_parameters(n_x,n_h,n_y,seed=1):
    """
    Create the parameters of a 2-layer network (LINEAR->RELU->LINEAR->SIGMOID).

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero.

    Arguments:
    n_x -- size of input layer
    n_h -- size of hidden layer
    n_y -- size of output layer
    seed -- value passed to np.random.seed() before the weights are drawn.
            Defaults to 1, so calls without this argument return the same
            values as before. Pass None to leave the global NumPy RNG state
            untouched.

    Returns:
    parameters -- python dictionary containing:
                    "w1" -- weight matrix of shape (n_h, n_x)
                    "b1" -- bias vector of shape (n_h, 1)
                    "w2" -- weight matrix of shape (n_y, n_h)
                    "b2" -- bias vector of shape (n_y, 1)

    NOTE: the weight keys are lowercase ("w1", "w2"), whereas L_model_forward
    and update_parameters in this notebook look weights up under "W<l>", so
    this dictionary cannot be handed to them as-is.
    """
    # Seeding resets the *global* NumPy RNG; skip it when the caller opts out.
    if seed is not None:
        np.random.seed(seed)

    # Small random weights break symmetry between hidden units; zero biases are fine.
    w1=np.random.randn(n_h,n_x)*0.01
    b1=np.zeros((n_h,1))
    w2=np.random.randn(n_y,n_h)*0.01
    b2=np.zeros((n_y,1))

    # Sanity-check the shapes before handing the parameters back.
    assert(w1.shape==(n_h,n_x))
    assert(b1.shape==(n_h,1))
    assert(w2.shape==(n_y,n_h))
    assert(b2.shape==(n_y,1))

    parameters={"w1":w1,
               "b1":b1,
               "w2":w2,
               "b2":b2}

    return parameters
    
    

In [3]:
# Build a 3-2-1 network and show each parameter array.
parameters = initialize_parameters(3, 2, 1)
print("w1=", parameters["w1"], sep="")
print("b1=", parameters["b1"], sep="")
print("w2=", parameters["w2"], sep="")
print("b2=", parameters["b2"], sep="")

w1=[[ 0.01624345 -0.00611756 -0.00528172]
 [-0.01072969  0.00865408 -0.02301539]]
b1=[[0.]
 [0.]]
w2=[[ 0.01744812 -0.00761207]]
b2=[[0.]]


In [4]:
##Graded function initialize_parameters_deep
#[LINEAR->RELU] L-1 times-->LINEAR-->SIGMOID

def initialize_parameters_deep(layer_dims,seed=3):
    """
    Create the parameters of an L-layer network
    ([LINEAR->RELU]*(L-1) -> LINEAR -> SIGMOID).

    Arguments:
    layer_dims -- python list containing the dimensions of each layer in the
                  network, input layer first
    seed -- value passed to np.random.seed() before the weights are drawn.
            Defaults to 3, so calls without this argument draw the same
            values as before. Pass None to leave the global NumPy RNG state
            untouched.

    Returns:
    parameters -- python dictionary containing "W1", "b1", ..., "WL", "bL":
                    Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
                    bl -- bias vector of shape (layer_dims[l], 1)

    The weight keys are uppercase ("W<l>") because that is what
    L_model_forward and update_parameters index; with lowercase keys the
    returned dictionary raised KeyError when passed to them.
    """
    # Seeding resets the *global* NumPy RNG; skip it when the caller opts out.
    if seed is not None:
        np.random.seed(seed)
    parameters={}
    L=len(layer_dims)

    # layer_dims[0] is the input size, so trainable layers are 1..L-1.
    for l in range(1,L):

        parameters["W"+str(l)]=np.random.randn(layer_dims[l],layer_dims[l-1])*0.01
        parameters["b"+str(l)]=np.zeros((layer_dims[l],1))

        assert(parameters["W"+str(l)].shape==(layer_dims[l],layer_dims[l-1]))
        assert(parameters["b"+str(l)].shape==(layer_dims[l],1))

    return parameters

parameters=initialize_parameters_deep([5,4,3])

# The printed labels are unchanged; the weights are read from the "W<l>" keys.
print("w1="+str(parameters["W1"]))
print("b1="+str(parameters["b1"]))
print("w2="+str(parameters["W2"]))
print("b2="+str(parameters["b2"]))
        
        

w1=[[ 0.01788628  0.0043651   0.00096497 -0.01863493 -0.00277388]
 [-0.00354759 -0.00082741 -0.00627001 -0.00043818 -0.00477218]
 [-0.01313865  0.00884622  0.00881318  0.01709573  0.00050034]
 [-0.00404677 -0.0054536  -0.01546477  0.00982367 -0.01101068]]
b1=[[0.]
 [0.]
 [0.]
 [0.]]
w2=[[-0.01185047 -0.0020565   0.01486148  0.00236716]
 [-0.01023785 -0.00712993  0.00625245 -0.00160513]
 [-0.00768836 -0.00230031  0.00745056  0.01976111]]
b2=[[0.]
 [0.]
 [0.]]


In [5]:
def linear_forward(A,W,b):
    """
    Compute the linear (pre-activation) part of one layer: Z = W.A + b.

    Arguments:
    A -- activations from the previous layer (or the input data),
         shape (size of previous layer, number of examples)
    W -- weight matrix, numpy array of shape
         (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of current layer, 1)

    Returns:
    Z -- the input of the activation function (pre-activation value),
         shape (size of current layer, number of examples)
    cache -- the tuple (A, W, b), kept so the backward pass can reuse it
    """
    pre_activation = W.dot(A) + b

    # One row per unit of this layer, one column per example.
    expected_shape = (W.shape[0], A.shape[1])
    assert pre_activation.shape == expected_shape

    return pre_activation, (A, W, b)

In [6]:
# Run linear_forward on the canned test inputs and show the resulting Z.
A, W, b = linear_forward_test_case()
Z, linear_cache = linear_forward(A, W, b)
print("Z=", Z, sep="")

Z=[[ 3.26295337 -1.23429987]]


In [7]:
def linear_activation_forward(A_prev,W,b,activation):
    """
    Implement forward propagation for one LINEAR->ACTIVATION layer.

    Arguments:
    A_prev -- activations from the previous layer (or input data),
              shape (size of previous layer, number of examples)
    W -- weight matrix, numpy array of shape
         (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of current layer, 1)
    activation -- the activation to use in this layer, as a text string:
                  "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function (post-activation value)
    cache -- tuple (linear_cache, activation_cache), stored for computing
             the backward pass efficiently

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Fail fast with a clear message: without this check an unknown name
    # skips both branches and A is unbound at the shape assertion below.
    if activation not in ("sigmoid","relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got %r' % (activation,))

    # The linear step is identical for both activations.
    Z,linear_cache=linear_forward(A_prev,W,b)

    if activation=="sigmoid":
        A,activation_cache=sigmoid(Z)
    else:
        A,activation_cache=relu(Z)

    assert(A.shape==(W.shape[0],A_prev.shape[1]))

    cache=(linear_cache,activation_cache)

    return A,cache
    

In [8]:
# Same canned inputs pushed through both supported activations.
A_prev, W, b = linear_activation_forward_test_case()

A, linear_activation_cache = linear_activation_forward(
    A_prev, W, b, activation="sigmoid")
print("with sigmoid:", A, sep="")

A, linear_activation_cache = linear_activation_forward(
    A_prev, W, b, activation="relu")
print("with relu:", A, sep="")

with sigmoid:[[0.96890023 0.11013289]]
with relu:[[3.43896131 0.        ]]


In [9]:
def L_model_forward(X,parameters):
    """
    Run forward propagation through the whole network:
    [LINEAR->RELU]*(L-1) -> LINEAR -> SIGMOID.

    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- post-activation value of the last layer; asserted to have shape
          (1, number of examples)
    caches -- list with one cache per layer, in layer order:
              indices 0..L-2 come from the relu layers,
              index L-1 comes from the final sigmoid layer
    """
    # Each layer contributes one W and one b entry.
    num_layers = len(parameters)//2

    caches = []
    current = X

    # Hidden layers 1..L-1 use relu.
    for layer in range(1, num_layers):
        current, layer_cache = linear_activation_forward(
            current,
            parameters["W"+str(layer)],
            parameters["b"+str(layer)],
            activation="relu")
        caches.append(layer_cache)

    # Output layer L uses sigmoid.
    AL, output_cache = linear_activation_forward(
        current,
        parameters["W"+str(num_layers)],
        parameters["b"+str(num_layers)],
        activation="sigmoid")
    caches.append(output_cache)

    assert AL.shape == (1, X.shape[1])
    return AL, caches

In [10]:
# Full forward pass on the canned test case; show the output and the number of caches.
X, parameters = L_model_forward_test_case_2hidden()
AL, caches = L_model_forward(X, parameters)
print("AL=", AL, sep="")
print("Length of caches list=", len(caches), sep="")


AL=[[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list=3


In [11]:
#------COST FUNCTION---------

#compute the cross_entropy cost J,

def compute_cost(AL,Y):
    """
    Compute the cross-entropy cost
        J = -(1/m) * sum( Y*log(AL) + (1-Y)*log(1-AL) ).

    Arguments:
    AL -- probability vector corresponding to the label predictions,
          shape (1, number of examples)
    Y -- true "label" vector (for example: 0 if non-cat, 1 if cat),
         shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost, with shape ()
    """
    m=Y.shape[1] # examples are stored one per column

    # Keep predictions strictly inside (0, 1). A saturated output of exactly
    # 0 or 1 would otherwise give log(0) = -inf and 0 * -inf = nan, turning
    # the whole cost into nan even for a perfect prediction. Values already
    # inside [eps, 1-eps] are left unchanged.
    eps=1e-15
    AL_safe=np.clip(AL,eps,1-eps)

    # Element-wise products (not np.dot): each example contributes its own term.
    log_likelihood=np.multiply(Y,np.log(AL_safe))+np.multiply(1-Y,np.log(1-AL_safe))
    cost=(-1/m)*np.sum(log_likelihood)

    cost=np.squeeze(cost) # makes sure the cost has the expected shape (e.g. turns [[17]] into 17)

    assert(cost.shape == ())

    return cost


    
    
    
    

In [12]:
# Cost on the canned test case (Y and AL are noted as shape (1, 3)).
Y, AL = compute_cost_test_case()
print("cost=", compute_cost(AL, Y), sep="")

cost=0.41493159961539694


In [88]:
#------LINEAR BACKWARD------

def linear_backward(dZ,cache):
    """
    Backward pass through the linear part of a single layer (layer l).

    Arguments:
    dZ -- gradient of the cost with respect to the linear output of the
          current layer l
    cache -- tuple (A_prev, W, b) saved by the forward pass of this layer

    Returns:
    dA_prev -- gradient of the cost with respect to the activation of the
               previous layer l-1, same shape as A_prev
    dW -- gradient of the cost with respect to W (current layer l),
          same shape as W
    db -- gradient of the cost with respect to b (current layer l),
          same shape as b
    """
    A_prev, W, b = cache

    # Gradients are averaged over the examples, which are stored one per column.
    inv_m = 1/A_prev.shape[1]

    dW = inv_m*dZ.dot(A_prev.T)

    # Summing over the example axis yields a 1-D array; reshape it to match b.
    db = (inv_m*dZ.sum(axis=1)).reshape(b.shape)

    dA_prev = W.T.dot(dZ)

    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db


    
    

In [89]:
# Backward pass through the linear step on the canned test case.
dZ, linear_cache = linear_backward_test_case()
A_prev, W, b = linear_cache
dA_prev, dW, db = linear_backward(dZ, linear_cache)

print("dA_prev=", dA_prev, sep="")
print("dW=", dW, sep="")
print("db=", db, sep="")

# b was unpacked from linear_cache above, so it is the same object as linear_cache[2].
print("length of b in linear_cache:", b.shape, "type:", type(b))

dA_prev=[[ 0.51822968 -0.19517421]
 [-0.40506361  0.15255393]
 [ 2.37496825 -0.89445391]]
dW=[[-0.10076895  1.40685096  1.64992505]]
db=[[0.50629448]]
length of b in linear_cache: (1, 1) type: <class 'numpy.ndarray'>


In [59]:
###-------LINEAR ACTIVATION BACKWARD----------###

def linear_activation_backward(dA,cache,activation):
    """
    Implement backward propagation for one LINEAR->ACTIVATION layer.

    Arguments:
    dA -- post-activation gradient for the current layer l
    cache -- tuple (linear_cache, activation_cache) stored during the
             forward pass
    activation -- the activation used in this layer, as a text string:
                  "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient of the cost with respect to the activation of the
               previous layer l-1, same shape as A_prev
    dW -- gradient of the cost with respect to W (current layer l),
          same shape as W
    db -- gradient of the cost with respect to b (current layer l),
          same shape as b

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    # Fail fast with a clear message: without this check an unknown name
    # skips both branches and the return statement hits unbound locals.
    if activation not in ("sigmoid","relu"):
        raise ValueError('activation must be "sigmoid" or "relu", got %r' % (activation,))

    linear_cache,activation_cache=cache

    # Undo the activation first (dA -> dZ) ...
    if activation=="relu":
        dZ=relu_backward(dA,activation_cache)
    else:
        dZ=sigmoid_backward(dA,activation_cache)

    # ... then the linear step, which is the same for both activations.
    dA_prev,dW,db=linear_backward(dZ,linear_cache)

    return dA_prev,dW,db
    

In [76]:
# Backward pass through one LINEAR->ACTIVATION layer, once per activation.
AL, linear_activation_cache = linear_activation_backward_test_case()

dA_prev, dW, db = linear_activation_backward(
    AL, linear_activation_cache, activation="sigmoid")

print("sigmoid")
print("dA_prev=", dA_prev, sep="")
print("dW=", dW, sep="")
print("db=", db, "\n", sep="")
# NOTE(review): dZ is not assigned anywhere in this cell. The name resolves to
# whatever global an earlier cell left behind (the linear_backward test cell
# assigns one), so the shapes printed here do not come from the calls in this cell.
print("sigmoid:", dZ.shape)

dA_prev, dW, db = linear_activation_backward(
    AL, linear_activation_cache, activation="relu")

print("relu")
print("dA_prev=", dA_prev, sep="")
print("dW=", dW, sep="")
print("db=", db, sep="")

print("shape of dZ:", dZ.shape)

sigmoid
dA_prev=[[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW=[[ 0.10266786  0.09778551 -0.01968084]]
db=[[-0.05729622]]

sigmoid: (1, 2)
relu
dA_prev=[[ 0.44090989  0.        ]
 [ 0.37883606  0.        ]
 [-0.2298228   0.        ]]
dW=[[ 0.44513824  0.37371418 -0.10478989]]
db=[[-0.20837892]]
shape of dZ: (1, 2)


In [61]:
# Scratch check of the descending order used by the backward loop: 5 down to 0.
for i in range(5, -1, -1):
    print(i)

5
4
3
2
1
0


In [96]:
#----GRADED FUNCTION: L_model_backward

def L_model_backward(AL,Y,caches):
    """
    Implement backward propagation for the
    [LINEAR->RELU]*(L-1) -> LINEAR -> SIGMOID network.

    Arguments:
    AL -- probability vector, output of the forward propagation
          (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat);
         reshaped to AL's shape before use
    caches -- list of caches containing:
              caches[l] for l = 0..L-2: cache of linear_activation_forward()
              with "relu";
              caches[L-1]: cache of linear_activation_forward() with "sigmoid"

    Returns:
    grads -- dictionary with the gradients:
             grads["dA"+str(l)] for l = 0..L-1 (gradient w.r.t. the
             activation that feeds layer l+1),
             grads["dW"+str(l)] and grads["db"+str(l)] for l = 1..L
    """
    grads={}
    L=len(caches) # number of layers
    Y=Y.reshape(AL.shape) # after this line, Y is the same shape as AL

    # Derivative of the cross-entropy cost with respect to AL; this starts
    # the backward pass.
    dAL=-(np.divide(Y,AL)-np.divide(1-Y,1-AL))

    # Layer L (SIGMOID->LINEAR): consumes dAL, produces dA(L-1), dWL, dbL.
    current_cache=caches[L-1]
    dA_prev_temp,dW_temp,db_temp=linear_activation_backward(
                                dAL,current_cache,activation="sigmoid")
    grads["dA"+str(L-1)]=dA_prev_temp
    grads["dW"+str(L)]=dW_temp
    grads["db"+str(L)]=db_temp

    # Layers L-1 .. 1 (RELU->LINEAR), walked from the output side inwards.
    # caches[l] belongs to layer l+1: it consumes dA(l+1) and produces
    # dA(l), dW(l+1), db(l+1).
    for l in reversed(range(L-1)):
        current_cache=caches[l]
        dA_prev_temp,dW_temp,db_temp=linear_activation_backward(
                                    grads["dA"+str(l+1)],current_cache,activation="relu")

        grads["dA"+str(l)]=dA_prev_temp
        grads["dW"+str(l+1)]=dW_temp
        grads["db"+str(l+1)]=db_temp

    return grads

    

In [97]:
# Full backward pass on the canned test case; print_grads displays the gradients.
AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)
print_grads(grads)

dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


In [64]:
# Display element [1] of the second entry of `caches`.
# NOTE(review): presumably this is the activation cache of layer 2, since
# linear_activation_forward builds cache=(linear_cache, activation_cache) --
# confirm against the test case that produced `caches`.
caches[1][1]

array([[ 0.64667545, -0.35627076]])

In [98]:
#--------UPDATE PARAMETERS---------#

    # Equations---
    
#  W^[l]=W^[l]-alpha*dW^[l]
#  b^[l]=b^[l]-alpha*db^[l]

# Update parameters using gradient descent on every W^[l] and b^[l], for l=1,2,3,...,L

def update_parameters(parameters,grads,learning_rate):
    """
        Update parameters using gradient descent
    
    Arguments:
    
        parameters--python dictionary containing your parameters
        grads--python dictionary containing your gradients, output of L_model_backward
        
        Returns:
        
        parameters--python dictionary conatining your updated parameters
        
        parameters["W"+str(l)]=...
        parameters["b"+str(l)]=...
        
    """
    
    L=len(parameters)//2 #number of layers in neural network
    
    #Update rule for each parameters. Use a for loop
    
    for l in range(L):
        parameters["W"+str(l+1)]=parameters["W"+str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b"+str(l+1)]=parameters["b"+str(l+1)]-learning_rate*grads["db"+str(l+1)]
    
    return parameters 

In [99]:
# One gradient-descent step (learning rate 0.1) on the canned test case.
parameters, grads = update_parameters_test_case()
parameters = update_parameters(parameters, grads, 0.1)

print("W1=", parameters["W1"], sep="")
print("b1=", parameters["b1"], sep="")
print("W2=", parameters["W2"], sep="")
print("b2=", parameters["b2"], sep="")

W1=[[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1=[[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2=[[-0.55569196  0.0354055   1.32964895]]
b2=[[-0.84610769]]
