In [3]:
def layer_sizes(X, Y, n_h=2):
    """
    Derive the layer dimensions of the network from the data.

    Arguments:
    X -- input dataset of shape (input size, number of examples)
    Y -- labels of shape (output size, number of examples)
    n_h -- size of the hidden layer (optional, defaults to 2)

    Returns:
    n_x -- the size of the input layer (number of rows of X)
    n_h -- the size of the hidden layer
    n_y -- the size of the output layer (number of rows of Y)
    """
    n_x = X.shape[0]
    n_y = Y.shape[0]

    return n_x, n_h, n_y

def initialize_parameters(n_x, n_h, n_y):
    """
    Build the initial weights and biases of a two-layer network.

    Weights are drawn from a standard normal distribution and scaled by 0.01;
    biases start at zero. The global NumPy random generator is re-seeded with
    63 on every call, so repeated calls return identical values.

    Argument:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    params -- python dictionary containing your parameters:
                    W1 -- weight matrix of shape (n_h, n_x)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (n_y, n_h)
                    b2 -- bias vector of shape (n_y, 1)
    """
    np.random.seed(63)
    scale = 0.01

    # W1 must be drawn before W2 so the random stream is consumed in order.
    hidden_weights = np.random.randn(n_h, n_x) * scale
    output_weights = np.random.randn(n_y, n_h) * scale

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

def sigmoid(z):
    """
    Compute the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    Arguments:
    z -- a scalar or numpy array

    Returns:
    the sigmoid of z (same shape as z when z is an array)
    """
    # This is a plain module-level function, so it takes no `self`;
    # callers in this file invoke it as sigmoid(Z).
    return 1 / (1 + np.exp(-z))

def forward_prop(X, parameters):
    """
    Run one forward pass through the two-layer network.

    The hidden layer uses tanh and the output layer uses the sigmoid, i.e.
        Z1 = W1 X + b1,   A1 = tanh(Z1)
        Z2 = W2 A1 + b2,  A2 = sigmoid(Z2)

    Argument:
    X -- input data of size (n_x, m)
    parameters -- python dictionary containing your parameters (output of initialization function):
                    W1 of shape (n_h, n_x), b1 of shape (n_h, 1),
                    W2 of shape (n_y, n_h), b2 of shape (n_y, 1)

    Returns:
    A2 -- The sigmoid output of the second activation, of shape (n_y, m)
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2"
    """
    W1, b1 = parameters['W1'], parameters['b1']
    W2, b2 = parameters['W2'], parameters['b2']

    # (n_h, n_x) @ (n_x, m) -> (n_h, m); the weights are NOT transposed.
    Z1 = np.dot(W1, X) + b1
    # tanh hidden activation: back-propagation relies on g'(z) = 1 - a^2.
    A1 = np.tanh(Z1)
    # The second layer consumes the hidden activations, not the raw input.
    Z2 = np.dot(W2, A1) + b2
    # Logistic sigmoid on the output layer.
    A2 = 1 / (1 + np.exp(-Z2))

    cache = {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}
    return A2, cache

$$J = - \frac{1}{m} \sum\limits_{i = 1}^{m} \left( y^{(i)}\log\left(a^{[2] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[2] (i)}\right) \right)$$

```python
logprobs = np.multiply(np.log(A2),Y)    # first term of J only: y * log(a2)
cost = - np.sum(logprobs)               # no need to use a for loop!
```

In [7]:
def compute_cost(A2, Y, parameters):
    """
    Computes the cross-entropy cost
        J = -(1/m) * sum_i ( y_i * log(a2_i) + (1 - y_i) * log(1 - a2_i) )

    Arguments:
    A2 -- The sigmoid output of the second activation, of shape (1, number of examples)
    Y -- "true" labels vector of shape (1, number of examples)
    parameters -- python dictionary containing your parameters W1, b1, W2 and b2
                  (not used by the computation; kept for interface compatibility)

    Returns:
    cost -- cross-entropy cost given equation above, as a Python float
    """
    m = Y.shape[1]
    # Element-wise products (not matrix products): each example contributes
    # one term for the positive class and one for the negative class.
    logprobs = np.multiply(Y, np.log(A2)) + np.multiply(1 - Y, np.log(1 - A2))
    cost = -np.sum(logprobs) / m
    return float(np.squeeze(cost))

<img src="images/grad_summary.png" style="width:600px;height:300px;">

- Tips:
    - To compute dZ1 you'll need to compute $g^{[1]'}(Z^{[1]})$. Since $g^{[1]}(.)$ is the tanh activation function, if $a = g^{[1]}(z)$ then $g^{[1]'}(z) = 1-a^2$. So you can compute 
    $g^{[1]'}(Z^{[1]})$ using `(1 - np.power(A1, 2))`.

In [8]:
def back_prop(parameters, cache, X, Y):
    """
    Implement the backward propagation for the two-layer network
    (tanh hidden layer, sigmoid output, cross-entropy cost).

    Arguments:
    parameters -- python dictionary containing our parameters (only "W2" is read)
    cache -- a dictionary containing "Z1", "A1", "Z2" and "A2" (only "A1" and "A2" are read)
    X -- input data of shape (n_x, number of examples)
    Y -- "true" labels vector of shape (n_y, number of examples)

    Returns:
    grads -- python dictionary containing your gradients with respect to different parameters:
             "dW1", "db1", "dW2", "db2", each with the shape of the matching parameter
    """
    m = X.shape[1]
    A1, A2 = cache['A1'], cache['A2']
    W2 = parameters['W2']

    # Output layer: sigmoid + cross-entropy gives dZ2 = A2 - Y.
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Hidden layer: propagate the error back through W2 FIRST, then scale
    # element-wise by the tanh derivative g'(Z1) = 1 - A1^2.
    dZ1 = np.dot(W2.T, dZ2) * (1 - np.power(A1, 2))
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    grads = {'dW2': dW2, 'db2': db2, 'dW1': dW1, 'db1': db1}
    return grads

def optimize(parameters, grads, learning_rate=1.2):
    """
    Apply one gradient-descent step: theta <- theta - learning_rate * d(theta).

    The arrays stored in `parameters` are updated in place and the same
    dictionary object is returned.

    Arguments:
    parameters -- python dictionary containing your parameters ("W1", "b1", "W2", "b2")
    grads -- python dictionary containing your gradients ("dW1", "db1", "dW2", "db2")
    learning_rate -- step size of the update (defaults to 1.2)

    Returns:
    parameters -- python dictionary containing your updated parameters
    """
    update_pairs = (
        ('W1', 'dW1'),
        ('b1', 'db1'),
        ('W2', 'dW2'),
        ('b2', 'db2'),
    )
    for param_key, grad_key in update_pairs:
        step = learning_rate * grads[grad_key]
        parameters[param_key] -= step
    return parameters

def nn_model(X, Y, n_h, learning_rate, num_iterations=10000, print_cost=False):
    """
    Train the two-layer network with batch gradient descent.

    Arguments:
    X -- dataset of shape (n_x, number of examples)
    Y -- labels of shape (n_y, number of examples)
    n_h -- size of the hidden layer
    learning_rate -- step size used by the gradient descent update
    num_iterations -- Number of iterations in gradient descent loop
    print_cost -- if True, print the cost every 1000 iterations

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    """
    # Only the input/output sizes come from the data; the hidden size is the
    # caller's n_h and must not be overwritten by layer_sizes' own value.
    n_x, _, n_y = layer_sizes(X, Y)

    # Initialize the parameters randomly.
    params = initialize_parameters(n_x, n_h, n_y)

    for i in range(num_iterations):
        # Forward pass, cost, backward pass, then one gradient-descent step.
        A2, cache = forward_prop(X, params)
        cost = compute_cost(A2, Y, params)
        grads = back_prop(params, cache, X, Y)
        params = optimize(params, grads, learning_rate)

        if print_cost and i % 1000 == 0:
            print("Cost after iteration %i: %f" % (i, cost))

    return params
    

In [9]:
def predict(parameters, X):
    """
    Classify every example in X with the learned parameters.

    Runs a forward pass and thresholds the output activation at 0.5.

    Arguments:
    parameters -- python dictionary containing your parameters
    X -- input data of size (n_x, m)

    Returns
    predictions -- boolean array, True where the output activation exceeds 0.5
                   (red: 0 / blue: 1)
    """
    output_activation, _ = forward_prop(X, parameters)
    threshold = 0.5
    return output_activation > threshold
