In [2]:
import numpy as np

In [3]:
def sigmoid(Z):
    """
    Compute the sigmoid of Z

    Arguments:
    Z -- A scalar or numpy array of any size.

    Return:
    A -- sigmoid of Z
    backprop_store -- returns Z for backpropagation
    """

    A = 1 / (1+np.exp(-Z))
    backprop_store = Z

    return A, backprop_store

def leaky_relu(Z):
    """
    Compute leaky_ReLU of Z

    arguments:
    Z -- A scalar of numpy array of any size

    return:
    A -- post-activation of leaky ReLU of Z, same shape as Z
    backprop_store -- returns Z for backpropagation
    """

    A = np.maximum(0.01 * Z, Z)
    assert(A.shape == Z.shape)

    backprop_store = Z

    return A, backprop_store

def relu(Z):
    """
    Compute regular ReLU of Z

    arguments:
    Z -- A scalar of numpy array of any size

    return:
    A -- post-activation of ReLU of Z, same shape as Z
    backprop_store -- returns Z for backpropagation
    """

    A = np.maximum(0, Z)
    assert(A.shape == Z.shape)

    backprop_store = Z

    return A, backprop_store


def tanh(Z):
    """
    Compute tanh of Z

    arguments:
    Z -- A scalar of numpy array of any size

    return:
    A -- post-activation of tanh of Z, same shape as Z
    backprop_store -- returns Z for backpropagation
    """

    A = (np.exp(Z)-np.exp(-Z))  /  (np.exp(Z)+np.exp(-Z))
    assert(A.shape == Z.shape)

    backprop_store = Z

    return A, backprop_store


def softmax(Z):
    """
    Compute softmax of Z

    arguments:
    Z -- A scalar of numpy array of any size

    return:
    A -- post-activation of softmax of Z, same shape as Z
    backprop_store -- returns Z for backpropagation
    """

    e_Z = np.exp(Z - np.max(Z))
    A = e_Z / e_Z.sum()
    assert(A.shape == Z.shape)

    backprop_store = Z

    return A, backprop_store

In [4]:
def initialize_parameters(initialize, dimension_of_layers):
    """
    Arguments:
    initialization -- activation used in this layer. 
        Stored as text string: "He", "Xavier", "Yoshua" "random"
    dimensions_of_layers -- array (list) of size in each layer

    Returns:
    parameters -- dictionary containing parameters "W1", "b1", "W2", "b2",...
                W[layer] -- shape (dimension_of_layers[layer], (dimension_of_layers[layer-1])
                b[layer] -- bias vector shape (dimension_of_layers[layer], 1) 
    """
    
    # np.random.seed(1)  # Use when you need to test that the different initializations are giving different numbers
    parameters = {}
    num_layers = len(dimension_of_layers)

    for layer in range(1, num_layers):  # this will loop through first hidden layer to final output layer

        if initialize == "He":
            parameters["W" + str(layer)] = np.random.randn(dimension_of_layers[layer], 
                dimension_of_layers[layer - 1]) * np.sqrt(2. / dimension_of_layers[layer - 1])
            parameters["b" + str(layer)] = np.zeros( (dimension_of_layers[layer], 1) )

        elif initialize == "Yoshua":
            parameters["W" + str(layer)] = np.random.randn(dimension_of_layers[layer], 
                dimension_of_layers[layer - 1]) * np.sqrt(2. / (dimension_of_layers[layer - 1] + dimension_of_layers[layer]))
            parameters["b" + str(layer)] = np.zeros( (dimension_of_layers[layer], 1) )

        elif initialize == "Xavier":
            parameters["W" + str(layer)] = np.random.randn(dimension_of_layers[layer], 
                dimension_of_layers[layer - 1]) * np.sqrt(1. / (dimension_of_layers[layer - 1]))
            parameters["b" + str(layer)] = np.zeros( (dimension_of_layers[layer], 1) )

        elif initialize == "random":
            parameters["W" + str(layer)] = np.random.randn(dimension_of_layers[layer], dimension_of_layers[layer - 1]) * 0.01
            parameters["b" + str(layer)] = np.zeros( (dimension_of_layers[layer], 1) )

        else:
            print("ERROR: YOU MUST CHOOSE AN INITIALIZATION TYPE: \"He\", \"Yoshua\", \"Xavier\", or \"random\"")

            assert parameters["W" + str(layer)].shape == (dimension_of_layers[layer], dimension_of_layers[layer - 1])
            assert parameters["b" + str(layer)].shape == (dimension_of_layers[layer], 1)

    return parameters

In [5]:
def linear_forward(A_prev, W, b):
    """
    Implement the linear part of a layer's forward propagation.

    Arguments:
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)

    Returns:
    A_post -- the input of the activation function, also called pre-activation parameter 
    backprop_store -- a python dictionary containing "W", "b", and "A_prev" ; stored for computing the backward pass efficiently
    """

    Z = np.dot(W, A_prev) + b
    
    assert(Z.shape == (W.shape[0], A_prev.shape[1]))
    backprop_store = (W, b, A_prev)
    
    return Z, backprop_store

In [6]:
def linear_activation_forward(A_prev, W, b, hidden_activation):
    """
    Implement the forward propagation for the LINEAR->ACTIVATION layer

    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    hidden_activation -- the activation to be used in this layer, stored as a text string: "sigmoid", "relu",
                         "leaky relu", or "tanh"

    Returns:
    A -- the output of the activation function, also called the post-activation value 
    cache -- a python dictionary containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
             "linear_cache" and "activation_cache" are caching, storing, exactly what's being passed in it's function.
    """
    
    if hidden_activation == "sigmoid":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    
    elif hidden_activation == "relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = relu(Z)

    elif hidden_activation == "leaky relu":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = leaky_relu(Z)

    elif hidden_activation == "tanh":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = tanh(Z)

    elif hidden_activation == "softmax":
        # Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
        Z, linear_cache = linear_forward(A_prev, W, b)
        A, activation_cache = softmax(Z)
    
    else:
        print("ERROR: YOU MUST CHOOSE A HIDDEN AND OUTPUT ACTIVATION. HIDDEN TYPES: \"sigmoid\", \"relu\", \"leaky relu\", or \"tanh\". OUTPUT TYPES: \"sigmoid\" or \"softmax\"")

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

In [7]:
def L_model_forward(X, parameters, dimension_of_layers, hidden_activation, output_activation):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation
    
    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- output of initialize_parameters_deep()
    dimension_of_layers -- array (list) of size in each layer
    hidden_activation -- the activation to be used in this layer, stored as a text string: "sigmoid", "relu",
                         "leaky relu", or "tanh"
    output_activation -- the activation to be used in the output layer (L), stored as a text string:
                         "sigmoid" or "softmax"
    
    
    Returns:
    AL -- last post-activation value
    caches -- list of caches containing:
                every cache of linear_activation_forward() (there are L-1 of them, indexed from 0 to L-1)
    """

    caches = []
    A = X
    L = len(dimension_of_layers)
    
    # when using range in the loop below it will end at the final hidden loop
    # Calculating AL for the output layer also used L-1 because you did not use
    # a loop with range. So, you need to put L-1 due to Python starting index from 0.
    
    # Implement [LINEAR -> hidden_activation]*(L-1). Add "cache" to the "caches" list.
    for layer in range(1, L-1):    # This will loop through first hidden layer to last hidden layer (before output layer L)
        A_prev = A 
        A, cache = linear_activation_forward(A_prev, parameters["W" + str(layer)], parameters["b" + str(layer)], hidden_activation)
        caches.append(cache)
    
    # use the A (now the last hidden layer activation after the loop) to get activation of final the layer (AL)
    # Implement LINEAR -> outut_activation. Add "cache" to the "caches" list.
    AL, cache = linear_activation_forward(A, parameters["W" + str(L-1)], parameters["b" + str(L-1)], output_activation)
    caches.append(cache)
    
    assert(AL.shape == (1,X.shape[1]))
            
    return AL, caches

In [8]:
### skipped cost function for softmax. Add it in when you have more free time.

def compute_cost(AL, Y):
    """
    Implement the cost function, cross-entropy cost for "sigmoid" function

    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector, shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    
    m = Y.shape[1]

    # Compute loss from aL and y.
    cost = cost = (   (    np.dot(Y, np.log(AL.T))   )  +  (np.dot(  (1 - Y), np.log(1 - AL.T)  )  )   )  /  -m
    
    cost = np.squeeze(cost)      # To make sure your cost's shape is what we expect (e.g. this turns [[17]] into 17).
    assert(cost.shape == ())
    
    return cost

In [9]:
def linear_backward(dZ, cache):
    """
    Implement the linear portion of backward propagation for a single layer (layer l)

    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = np.dot(dZ, A_prev.T) / m
    db = np.sum(dZ, axis = 1, keepdims = True) / m       ## remember this one for that type of formula!
    dA_prev = np.dot(W.T, dZ)
    
    assert (dA_prev.shape == A_prev.shape)
    assert (dW.shape == W.shape)
    assert (db.shape == b.shape)
    
    return dA_prev, dW, db

In [10]:
### need to also make backward function for tanh and softmax

def relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0  # wherever Z is less than or equal to 0 then 0 
                    # will be placed in dZ for that index.This works because they are
                    # the same size
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):
    """
    Implement the backward propagation for a single SIGMOID unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def leaky_relu_backward(dA, cache):
    """
    Implement the backward propagation for a single RELU unit.

    Arguments:
    dA -- post-activation gradient, of any shape
    cache -- 'Z' where we store for computing backward propagation efficiently

    Returns:
    dZ -- Gradient of the cost with respect to Z
    """
    
    Z = cache
    dZ = np.array(dA, copy=True) # just converting dz to a correct object.
    
    # When z <= 0, you should set dz to 0 as well. 
    dZ[Z <= 0] = 0.01  # wherever Z is less than or equal to 0 then 0.01 
                    # will be placed in dZ for that index.This works because they are
                    # the same size
    
    assert (dZ.shape == Z.shape)
    
    return dZ

In [11]:
## need to make one for softmax too, tanh, and leaky relu too

def linear_activation_backward(dA, cache, activation):
    """
    Implement the backward propagation for the LINEAR->ACTIVATION layer.
    
    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
        
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache)
    
    return dA_prev, dW, db

In [12]:
##change this so that you have 2 options for the initialize backprop. one for
## sigmoid which is written and one for softmax which is dAL

def L_model_backward(AL, Y, caches, dimension_of_layers):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
    
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(dimension_of_layers) # the number of layers #you counted the input as a layer  ####### WATCH FORWARD AND BACKWARD PROP VIDEO TO KINDA UNDERSTAND
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Initializing the backpropagation
    
    ## dAL for softmax here -> #you have to write the code#
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    
    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, current_cache". Outputs: "grads["dAL-2"], grads["dWL -1"], grads["dbL-1"]
    current_cache = caches[L - 2]    # this is accounting that Python indexes from 0 and that the first cache starts at layer 2
    grads["dA" + str(L-2)], grads["dW" + str(L-1)], grads["db" + str(L-1)] = linear_activation_backward(dAL, current_cache, "sigmoid")
    
    
    ########################stopped here
    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)): # This is actually accounting Python indexes from 0 AND that range goes only to number before.
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 1)], current_cache". Outputs: "grads["dA" + str(l)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)] 
        current_cache = caches[L - 2]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp
        ### END CODE HERE ###

    return grads

In [13]:
X = np.random.rand(3, 8)
dimension_of_layers = (3, 4, 4, 1)

In [14]:
parameters = initialize_parameters("He", dimension_of_layers)

In [20]:
AL, cache = L_model_forward(X, parameters, dimension_of_layers, "relu", "sigmoid")

In [6]:
for l in reversed(range(4-2)):
    print(l)

1
0


In [105]:
m = 10
p = .5
y = [1,1,1,1,1,1,1,1,1,1]
np.log(p[range(m),y])

TypeError: 'float' object is not subscriptable

In [75]:
np.random.seed(2)
dimension_of_layers = [2, 3, 1]
X = np.random.randn(2,4)
X

array([[-0.41675785, -0.05626683, -2.1361961 ,  1.64027081],
       [-1.79343559, -0.84174737,  0.50288142, -1.24528809]])

In [76]:
parameters = initialize_parameters("He", dimension_of_layers)

In [77]:
parameters

{'W1': array([[-1.05795222, -0.90900761],
        [ 0.55145404,  2.29220801],
        [ 0.04153939, -1.11792545]]),
 'W2': array([[ 0.44013928, -0.48676236, -0.01561999]]),
 'b1': array([[ 0.],
        [ 0.],
        [ 0.]]),
 'b2': array([[ 0.]])}

In [91]:
output, cache = L_model_forward(X, parameters, dimension_of_layers, "relu", "softmax")

In [93]:
cache

[((array([[-1.05795222, -0.90900761],
          [ 0.55145404,  2.29220801],
          [ 0.04153939, -1.11792545]]), array([[ 0.],
          [ 0.],
          [ 0.]]), array([[-0.41675785, -0.05626683, -2.1361961 ,  1.64027081],
          [-1.79343559, -0.84174737,  0.50288142, -1.24528809]])),
  array([[ 2.07115649,  0.82468238,  1.80287036, -0.60335179],
         [-4.34075022, -1.96048863, -0.02530516, -1.94992536],
         [ 1.98761541,  0.93867351, -0.65092022,  1.46027509]])),
 ((array([[ 0.44013928, -0.48676236, -0.01561999]]),
   array([[ 0.]]),
   array([[ 2.07115649,  0.82468238,  1.80287036,  0.        ],
          [ 0.        ,  0.        ,  0.        ,  0.        ],
          [ 1.98761541,  0.93867351,  0.        ,  1.46027509]])),
  array([[ 0.8805508 ,  0.34831304,  0.79351406, -0.02280948]]))]

In [1]:
import numpy as np
import time

In [7]:
a = np.array([[0,0,0], [0,1,1],[1,0,0]])
y = ([[2,1,1]])
p = np.array([[.1,.9, .1], [.1, .45, .6], [.8, .45, .3]])

In [8]:
print(p)
print(a)

[[ 0.1   0.9   0.1 ]
 [ 0.1   0.45  0.6 ]
 [ 0.8   0.45  0.3 ]]
[[0 0 0]
 [0 1 1]
 [1 0 0]]


In [12]:
tic = time.time()
p[y, range(3)]
toc= time.time()
(toc - tic)*1000

0.4973411560058594

In [22]:
tic = time.time()
a*p
toc=time.time()
(toc - tic)*1000

0.4954338073730469

In [33]:
p = np.random.randn(3, 5000)
y = np.random.randint(2,5000)
a = np.random.randn(3,5000)

In [34]:
tic = time.time()
p[y, range(5000)]
toc= time.time()
(toc - tic)*1000

IndexError: index 1009 is out of bounds for axis 0 with size 3