In [1]:
def compute_cost(A,Y):
    """Return the mean binary cross-entropy between predictions and labels.

    Arguments:
    A -- predicted probabilities; entries equal to 0 or 1 make np.log return
         -inf, so the result can become inf or nan
    Y -- true labels, combined element-wise with A

    Returns:
    cost -- -(Y*log(A) + (1-Y)*log(1-A)) averaged over every entry, passed
            through np.squeeze
    """
    positive_term = Y * np.log(A)
    negative_term = (1 - Y) * np.log(1 - A)
    elementwise_loss = -(positive_term + negative_term)
    return np.squeeze(np.mean(elementwise_loss))


The standard way to avoid overfitting is called **L2 regularization**. It consists of appropriately modifying your cost function, from:
$$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \large{(}\small  y^{(i)}\log\left(a^{[L](i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right) \large{)} \tag{1}$$
To:
$$J_{regularized} = \small \underbrace{-\frac{1}{m} \sum\limits_{i = 1}^{m} \large{(}\small y^{(i)}\log\left(a^{[L](i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right) \large{)} }_\text{cross-entropy cost} + \underbrace{\frac{1}{m} \frac{\lambda}{2} \sum\limits_l\sum\limits_k\sum\limits_j W_{k,j}^{[l]2} }_\text{L2 regularization cost} \tag{2}$$

Let's modify your cost and observe the consequences.


In [5]:
def compute_cost_with_regularization(A,Y,parameters,lam):
    """Compute the cross-entropy cost plus the L2 weight penalty (formula 2).

    Arguments:
    A -- output activations of the network, passed unchanged to compute_cost
    Y -- true labels; Y.shape[1] is used as the number of examples m
    parameters -- dict holding "W1", "b1", ..., "WL", "bL"; the layer count is
                  taken as len(parameters) // 2
    lam -- L2 regularization strength (lambda)

    Returns:
    cost -- cross-entropy cost + (lam / (2 * m)) * sum of the squared entries
            of every weight matrix W1..WL
    """
    m = Y.shape[1]
    cross_entropy_cost = compute_cost(A, Y)

    # Parameters are stored as one (W, b) pair per layer.
    L = len(parameters) // 2

    # Only the weight matrices are penalized; the biases are left out.
    squared_weight_sum = sum(
        np.sum(np.square(parameters["W" + str(l)])) for l in range(1, L + 1)
    )
    L2_regularization_cost = (lam / (2 * m)) * squared_weight_sum

    return cross_entropy_cost + L2_regularization_cost

In [6]:
def linear_activation_backward(dA, cache, activation):
    """Backward pass for a single LINEAR -> ACTIVATION layer.

    Arguments:
    dA -- gradient flowing into this layer's activation output
    cache -- pair (linear_cache, activation_cache); linear_cache unpacks to
             (A_prev, W, b) and activation_cache is handed to the activation's
             backward helper
    activation -- "sigmoid" or "relu"

    Returns:
    dA_prev -- W.T . dZ
    dW -- (1/m) * dZ . A_prev.T
    db -- (1/m) * sum of dZ over axis 1 (keepdims=True)
    where dZ comes from sigmoid_backward / relu_backward and
    m = A_prev.shape[1].

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    linear_cache, activation_cache = cache

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    elif activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        # Fail with a clear message instead of an UnboundLocalError on dZ below.
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    dW = (1. / m) * np.dot(dZ, A_prev.T)
    db = (1. / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

To compute backward propagation with regularization for a 3-layer network, add the gradient of the regularization term to each $dW$:<br>
    ($\frac{d}{dW} ( \frac{1}{2}\frac{\lambda}{m}  W^2) = \frac{\lambda}{m} W$)

In [None]:
# GRADED FUNCTION: backward_propagation_with_regularization

def backward_propagation_with_regularization(X, Y, cache, lambd):
    """Backward pass of the 3-layer network with the L2 term added to every dW.

    Arguments:
    X -- input data; X.shape[1] is used as the number of examples m
    Y -- true labels, subtracted from A3 to form dZ3
    cache -- 12-tuple (Z1, A1, W1, b1, Z2, A2, W2, b2, Z3, A3, W3, b3)
    lambd -- L2 regularization strength

    Returns:
    gradients -- dict with keys "dZ3", "dW3", "db3", "dA2", "dZ2", "dW2",
                 "db2", "dA1", "dZ1", "dW1", "db1"
    """
    m = X.shape[1]
    (_, A1, W1, _, _, A2, W2, _, _, A3, W3, _) = cache

    # Per-layer weight matrix and the activations that fed that layer.
    weights = {1: W1, 2: W2, 3: W3}
    layer_inputs = {1: X, 2: A1, 3: A2}

    gradients = {}
    dZ = A3 - Y
    for layer in (3, 2, 1):
        suffix = str(layer)
        gradients["dZ" + suffix] = dZ
        # Data gradient plus the derivative of the L2 penalty, (lambd/m) * W.
        gradients["dW" + suffix] = 1./m * np.dot(dZ, layer_inputs[layer].T) + (lambd/m)*weights[layer]
        gradients["db" + suffix] = 1. / m * np.sum(dZ, axis=1, keepdims=True)

        if layer > 1:
            dA_below = np.dot(weights[layer].T, dZ)
            gradients["dA" + str(layer - 1)] = dA_below
            # Zero the gradient wherever the stored hidden activation is not positive.
            dZ = np.multiply(dA_below, np.int64(layer_inputs[layer] > 0))

    return gradients

In [8]:
def linear_activation_backward_with_regularization(dA, cache, activation,lamb):
    """Backward pass for one LINEAR -> ACTIVATION layer with an L2 term in dW.

    Arguments:
    dA -- gradient flowing into this layer's activation output
    cache -- pair (linear_cache, activation_cache); linear_cache unpacks to
             (A_prev, W, b) and activation_cache is handed to the activation's
             backward helper
    activation -- "sigmoid" or "relu"
    lamb -- L2 regularization strength (lambda)

    Returns:
    dA_prev -- W.T . dZ
    dW -- (1/m) * dZ . A_prev.T + (lamb/m) * W
    db -- (1/m) * sum of dZ over axis 1 (keepdims=True)
    where dZ comes from sigmoid_backward / relu_backward and
    m = A_prev.shape[1].

    Raises:
    ValueError -- if activation is neither "sigmoid" nor "relu"
    """
    linear_cache, activation_cache = cache

    if activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
    elif activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        # Fail with a clear message instead of an UnboundLocalError on dZ below.
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    A_prev, W, b = linear_cache
    m = A_prev.shape[1]

    # The second term is the derivative of the penalty (lamb / (2m)) * W**2.
    dW = (1. / m) * np.dot(dZ, A_prev.T) + (lamb / m) * W
    db = (1. / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

In [7]:
def L_model_backward(AL, Y, caches,regularization=True, lambd=0.0):
    """Backward pass for the whole [LINEAR->RELU]*(L-1) -> LINEAR->SIGMOID model.

    Arguments:
    AL -- output of the forward pass (predicted probabilities)
    Y -- true labels; reshaped to AL.shape before use
    caches -- list with one cache per layer; caches[l] is handed to the
              per-layer backward function for layer l+1
    regularization -- when True, each layer goes through
                      linear_activation_backward_with_regularization,
                      otherwise through linear_activation_backward
    lambd -- L2 regularization strength forwarded to the regularized layer
             function; ignored when regularization is False. With the default
             of 0.0 the (lambd/m)*W term is zero.

    Returns:
    grads -- dict with "dA<l-1>", "dW<l>", "db<l>" for l = L, ..., 1
    """
    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape)

    # Derivative of the cross-entropy cost with respect to AL.
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    def backward_step(dA, layer_cache, activation):
        # One place decides which per-layer routine runs, so both modes share
        # the same bookkeeping below.
        if regularization:
            return linear_activation_backward_with_regularization(
                dA, layer_cache, activation=activation, lamb=lambd)
        return linear_activation_backward(dA, layer_cache, activation=activation)

    # Output layer (sigmoid).
    dA_prev_temp, dW_temp, db_temp = backward_step(dAL, caches[L-1], "sigmoid")
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    # Hidden layers (relu), from layer L-1 down to layer 1.
    for l in reversed(range(L-1)):
        dA_prev_temp, dW_temp, db_temp = backward_step(grads["dA" + str(l + 1)], caches[l], "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l+1)] = dW_temp
        grads["db" + str(l+1)] = db_temp

    return grads