In [385]:
import tensorflow as tf
import numpy as np
old_v = tf.logging.get_verbosity()
tf.logging.set_verbosity(tf.logging.ERROR)

In [386]:
from tensorflow.examples.tutorials.mnist import input_data
#get mnist data, with one_hot encoding
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True)


#suppress warnings
tf.logging.set_verbosity(old_v)

num_train = mnist.train.num_examples #55,000
num_validation = mnist.validation.num_examples #5000
num_test = mnist.test.num_examples #10,000



Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [387]:
def layer_sizes(X, Y):
    n_x = X.shape[0]
    n_h = 4
    n_y = Y.shape[0]
    
    return (n_x, n_h, n_y)

In [445]:
def initialize_parameters(n_x, n_h, n_y):
    np.random.seed(2)
    W1 = np.random.randn(n_h, n_x)*0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h)*0.01
    b2 = np.zeros((n_y, 1))
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    return parameters

In [446]:
def relu(Z):
    A = np.maximum(0,Z)
    assert(A.shape == Z.shape)
    
    cache = Z
    return A

In [553]:
def softmax(x):
    e_x = np.exp(x - np.max(x))
    result = e_x/e_x.sum(axis = 0)
    return result

In [552]:
def forward_propagation(X, parameters):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    Z1 = (np.dot(W1, X) + b1)
    A1 = relu(Z1)
    Z2 = (np.dot(W2,A1) + b2)
    A2 = softmax(Z2)
    assert(A2.shape == (10, X.shape[1])) #There are 10 classes.
    
    cache = {"Z1": Z1,
             "A1": A1,
             "Z2": Z2,
             "A2": A2}
    return A2, cache

In [549]:
def compute_cost(A2, Y, parameters):
    m = Y.shape[1]
    logprobs = np.multiply(np.log(A2), Y) # + np.multiply(np.log(1-A2+(1e-15)),1-Y)
    cost = -np.sum(logprobs)/m
    
    cost = np.squeeze(cost)
    assert(isinstance(cost, float))
    
    return cost

In [550]:
def backward_propagation(parameters, cache, X, Y):
    
    m = X.shape[1]
    
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    A1 = cache["A1"]
    A2 = cache["A2"]
    Z1 = cache["Z1"]
    Z2 = cache["Z2"]
    dZ2 = A2
    maxIndexEachColumn = Y.argmax(axis = 0)
    for column in range(0, m):
        dZ2[maxIndexEachColumn[column], column] = A2[maxIndexEachColumn[column], column] - 1
    
    
    dZ2 = dZ2/m
    dW2 = np.dot(dZ2, A1.T)
    db2 = np.sum(dZ2, axis = 1, keepdims = True)
    dA1 = np.dot(W2.T, dZ2)
    dZ1 = np.array(dA1, copy = True)
    dZ1[Z1 <= 0] = 0
    dW1 = np.dot(dZ1, X.T)
    db1 = np.sum(dZ1, axis = 1, keepdims = True)
    
    grads = {"dW1":dW1,
             "db1":db1,
             "dW2":dW2,
             "db2":db2}
    
    return grads

In [551]:
def update_parameters(parameters, grads, learning_rate = 0.01):
    W1 = parameters["W1"]
    b1 = parameters["b1"]
    W2 = parameters["W2"]
    b2 = parameters["b2"]
    
    dW1 = grads["dW1"]
    dW2 = grads["dW2"]
    db1 = grads["db1"]
    db2 = grads["db2"]
    
    W1 = W1 - learning_rate*dW1
    W2 = W2 - learning_rate*dW2
    b1 = b1 - learning_rate*db1
    b2 = b2 - learning_rate*db2
    
    parameters = {"W1": W1,
                  "b1": b1,
                  "W2": W2,
                  "b2": b2}
    
    return parameters

In [548]:
def nn_model(n_h,  batchSize, num_minibatches = 10, print_cost = True):
    
    X_whole, Y_whole = mnist.train.next_batch(55000)
    X_whole = X_whole.T
    Y_whole = Y_whole.T
    parameters = initialize_parameters(784, n_h, 10) #The row of X is 784 and the row of Y is 10.
    W1 = parameters["W1"]
    W2 = parameters["W2"]
    for epoch in range(0, 100):
        for i in range(0, num_minibatches):
            mini_batch_X = X_whole[:, i*batchSize: (i+1)*batchSize]
            mini_batch_Y = Y_whole[:, i*batchSize: (i+1)*batchSize]
            X = mini_batch_X
            Y = mini_batch_Y
            A2, cache = forward_propagation(X, parameters)
            cost  = compute_cost(A2, Y, parameters)
            grads = backward_propagation(parameters, cache, X, Y)
            parameters = update_parameters(parameters, grads)
        if print_cost and epoch % 1 == 0:
            print("Cost after iteration %i: %f" %(epoch, cost))
            dW1 = grads["dW1"]
    return parameters

In [546]:
nn_model(500, 16, 3437)

Cost after iteration 0: 0.111376
Cost after iteration 1: 0.060508
Cost after iteration 2: 0.044208
Cost after iteration 3: 0.034214
Cost after iteration 4: 0.027435
Cost after iteration 5: 0.022505
Cost after iteration 6: 0.018787
Cost after iteration 7: 0.015996
Cost after iteration 8: 0.013856
Cost after iteration 9: 0.012015
Cost after iteration 10: 0.010573
Cost after iteration 11: 0.009365
Cost after iteration 12: 0.008275
Cost after iteration 13: 0.007389
Cost after iteration 14: 0.006625
Cost after iteration 15: 0.006047
Cost after iteration 16: 0.005470
Cost after iteration 17: 0.004941
Cost after iteration 18: 0.004495
Cost after iteration 19: 0.004127
Cost after iteration 20: 0.003778
Cost after iteration 21: 0.003493
Cost after iteration 22: 0.003227
Cost after iteration 23: 0.002974
Cost after iteration 24: 0.002760
Cost after iteration 25: 0.002568
Cost after iteration 26: 0.002397
Cost after iteration 27: 0.002235
Cost after iteration 28: 0.002081
Cost after iteration 29:

{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[ 2.44768690e-03],
        [ 6.91956400e-02],
        [-2.03987550e-03],
        [ 3.69903690e-02],
        [ 9.83242883e-03],
        [ 6.94576553e-03],
        [ 6.82330374e-02],
        [-3.32896158e-02],
        [ 4.17441408e-02],
        [ 4.44118463e-03],
        [-2.44284378e-03],
        [ 7.11729199e-02],
        [ 2.51448848e-02],
        [ 1

In [545]:
nn_model(500, 64, 859)

Cost after iteration 0: 0.835163
Cost after iteration 1: 0.446713
Cost after iteration 2: 0.350050
Cost after iteration 3: 0.308361
Cost after iteration 4: 0.283838
Cost after iteration 5: 0.266250
Cost after iteration 6: 0.252932
Cost after iteration 7: 0.242025
Cost after iteration 8: 0.232628
Cost after iteration 9: 0.224209
Cost after iteration 10: 0.216596
Cost after iteration 11: 0.209731
Cost after iteration 12: 0.203346
Cost after iteration 13: 0.197316
Cost after iteration 14: 0.191495
Cost after iteration 15: 0.185798
Cost after iteration 16: 0.180489
Cost after iteration 17: 0.175353
Cost after iteration 18: 0.170182
Cost after iteration 19: 0.165470
Cost after iteration 20: 0.161010
Cost after iteration 21: 0.156501
Cost after iteration 22: 0.152189
Cost after iteration 23: 0.148194
Cost after iteration 24: 0.144348
Cost after iteration 25: 0.140716
Cost after iteration 26: 0.137256
Cost after iteration 27: 0.133996
Cost after iteration 28: 0.130851
Cost after iteration 29:

{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-1.23389898e-03],
        [ 2.97994827e-02],
        [-2.19887361e-02],
        [ 3.38958947e-02],
        [ 3.00079714e-03],
        [-5.87970761e-03],
        [ 2.50078038e-02],
        [-2.92672521e-03],
        [ 4.08609261e-02],
        [ 8.79472413e-04],
        [-5.67033098e-03],
        [ 6.69023661e-02],
        [ 1.08625436e-02],
        [ 1

In [544]:
nn_model(500, 256, 214) #Batch size = 256

Cost after iteration 0: 2.160954
Cost after iteration 1: 1.723757
Cost after iteration 2: 1.186979
Cost after iteration 3: 0.885664
Cost after iteration 4: 0.732093
Cost after iteration 5: 0.644446
Cost after iteration 6: 0.588384
Cost after iteration 7: 0.549517
Cost after iteration 8: 0.521020
Cost after iteration 9: 0.499253
Cost after iteration 10: 0.482123
Cost after iteration 11: 0.468302
Cost after iteration 12: 0.456870
Cost after iteration 13: 0.447215
Cost after iteration 14: 0.438927
Cost after iteration 15: 0.431693
Cost after iteration 16: 0.425324
Cost after iteration 17: 0.419633
Cost after iteration 18: 0.414497
Cost after iteration 19: 0.409771
Cost after iteration 20: 0.405343
Cost after iteration 21: 0.401213
Cost after iteration 22: 0.397352
Cost after iteration 23: 0.393691
Cost after iteration 24: 0.390239
Cost after iteration 25: 0.386935
Cost after iteration 26: 0.383795
Cost after iteration 27: 0.380760
Cost after iteration 28: 0.377810
Cost after iteration 29:

{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-2.44478952e-03],
        [ 9.86638941e-03],
        [-7.68207000e-02],
        [ 3.60204030e-02],
        [-4.19090228e-04],
        [-1.86723863e-02],
        [-2.29734200e-02],
        [ 9.79394034e-03],
        [ 2.11218576e-02],
        [ 2.68315010e-03],
        [-1.87953091e-02],
        [ 4.76911694e-02],
        [ 1.86685566e-02],
        [ 5

In [543]:
nn_model(500, 1026, 50) #Batch size = 1026

Cost after iteration 0: 2.282046
Cost after iteration 1: 2.256859
Cost after iteration 2: 2.222995
Cost after iteration 3: 2.176289
Cost after iteration 4: 2.112506
Cost after iteration 5: 2.028341
Cost after iteration 6: 1.922717
Cost after iteration 7: 1.798275
Cost after iteration 8: 1.661867
Cost after iteration 9: 1.523005
Cost after iteration 10: 1.390870
Cost after iteration 11: 1.271613
Cost after iteration 12: 1.167690
Cost after iteration 13: 1.078866
Cost after iteration 14: 1.003551
Cost after iteration 15: 0.939776
Cost after iteration 16: 0.885634
Cost after iteration 17: 0.839453
Cost after iteration 18: 0.799828
Cost after iteration 19: 0.765599
Cost after iteration 20: 0.735825
Cost after iteration 21: 0.709744
Cost after iteration 22: 0.686735
Cost after iteration 23: 0.666303
Cost after iteration 24: 0.648045
Cost after iteration 25: 0.631635
Cost after iteration 26: 0.616808
Cost after iteration 27: 0.603345
Cost after iteration 28: 0.591065
Cost after iteration 29:

{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-2.52516485e-03],
        [ 5.04598520e-03],
        [-3.92805135e-02],
        [ 2.36604592e-02],
        [-1.95771017e-03],
        [-9.86200626e-03],
        [-7.54956422e-03],
        [ 4.86885208e-03],
        [ 1.22015451e-02],
        [ 2.93215052e-03],
        [-4.17468087e-03],
        [ 3.00009641e-02],
        [ 1.59610968e-02],
        [ 4

In [444]:
def initialize_adam(parameters):
    L = len(parameters) // 2
    v = {}
    s = {}
    
    for l in range(L):
        v["dW" + str(l+1)] = np.zeros(np.shape(parameters["W" + str(l+1)]))
        v["db" + str(l+1)] = np.zeros(np.shape(parameters["b" + str(l+1)]))
        s["dW" + str(l+1)] = np.zeros(np.shape(parameters["W" + str(l+1)]))
        s["db" + str(l+1)] = np.zeros(np.shape(parameters["b" + str(l+1)]))
        
    return v,s

In [443]:
def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8):
    L = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    
    for l in range(L):
        
        v["dW" + str(l+1)] = beta1*v["dW" + str(l+1)] + (1 - beta1)*grads['dW' + str(l+1)]
        v["db" + str(l+1)] = beta1*v["db" + str(l+1)] + (1 - beta1)*grads['db' + str(l+1)]
        
        v_corrected["dW" + str(l+1)] = v["dW" + str(l+1)]/(1-beta1**t)
        v_corrected["db" + str(l+1)] = v["db" + str(l+1)]/(1-beta1**t)
        
        s["dW" + str(l+1)] = beta2*s["dW" + str(l+1)] + (1 - beta2)*(grads['dW' + str(l+1)]**2)
        s["db" + str(l+1)] = beta2*s["db" + str(l+1)] + (1 - beta2)*(grads['db' + str(l+1)]**2)
        
        s_corrected["dW" + str(l+1)] = s["dW" + str(l+1)]/(1-beta2**t)
        s_corrected["db" + str(l+1)] = s["db" + str(l+1)]/(1-beta2**t)
        
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*v_corrected["dW" + str(l+1)]/(np.sqrt(s_corrected["dW" + str(l+1)]) + epsilon)
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*v_corrected["db" + str(l+1)]/(np.sqrt(s_corrected["db" + str(l+1)]) + epsilon)
        
    return parameters, v, s    

In [569]:
def adam_nn_model(n_h,  batchSize, num_minibatches = 10, print_cost = True):
    
    X_whole, Y_whole = mnist.train.next_batch(55000)
    X_whole = X_whole.T
    Y_whole = Y_whole.T
    parameters = initialize_parameters(784, n_h, 10)
    v, s = initialize_adam(parameters) #The row of X is 784 and the row of Y is 10.
    for i in range(0, num_minibatches):
        mini_batch_X = X_whole[:, i*batchSize: (i+1)*batchSize]
        mini_batch_Y = Y_whole[:, i*batchSize: (i+1)*batchSize]
        X = mini_batch_X
        Y = mini_batch_Y
        A2, cache = forward_propagation(X, parameters)
        cost  = compute_cost(A2, Y, parameters)
        grads = backward_propagation(parameters, cache, X, Y)
        parameters, v, s = update_parameters_with_adam(parameters, grads, v, s, t = 2)
        if print_cost and i % 10 == 0:
            print("Cost after iteration %i: %f" %(i, cost))
            dW1 = grads["dW1"]
    return parameters

The cost display of Adam optimization:

Batch sizes: 34, Number of Batches: 3437

In [565]:
adam_nn_model(500, 16, 3437) 

Cost after iteration 0: 2.305353
Cost after iteration 100: 0.513697
Cost after iteration 200: 0.161393
Cost after iteration 300: 0.248376
Cost after iteration 400: 0.087492
Cost after iteration 500: 0.257224
Cost after iteration 600: 0.530261
Cost after iteration 700: 0.156354
Cost after iteration 800: 0.055896
Cost after iteration 900: 0.041761
Cost after iteration 1000: 0.445856
Cost after iteration 1100: 0.102424
Cost after iteration 1200: 0.145899
Cost after iteration 1300: 0.068414
Cost after iteration 1400: 0.436643
Cost after iteration 1500: 0.056428
Cost after iteration 1600: 0.081233
Cost after iteration 1700: 0.360556
Cost after iteration 1800: 0.033188
Cost after iteration 1900: 0.266694
Cost after iteration 2000: 0.106095
Cost after iteration 2100: 0.008217
Cost after iteration 2200: 0.222943
Cost after iteration 2300: 0.453727
Cost after iteration 2400: 0.089442
Cost after iteration 2500: 0.037455
Cost after iteration 2600: 0.225507
Cost after iteration 2700: 0.369177
Cost

{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-0.07474992],
        [-0.07474676],
        [-0.11859178],
        [-0.03380476],
        [-0.06560334],
        [ 0.04306708],
        [ 0.00622678],
        [-0.07474879],
        [-0.06692365],
        [-0.07474889],
        [-0.06840235],
        [-0.07474975],
        [-0.07474994],
        [-0.0360239 ],
        [ 0.03509904],
        [-0.07474

Batch sizes: 64, Number of Batches: 859

In [566]:
adam_nn_model(500, 64, 859)

Cost after iteration 0: 2.300113
Cost after iteration 100: 0.119583
Cost after iteration 200: 0.056425
Cost after iteration 300: 0.056754
Cost after iteration 400: 0.060034
Cost after iteration 500: 0.198405
Cost after iteration 600: 0.071020
Cost after iteration 700: 0.109947
Cost after iteration 800: 0.099839


{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-0.07474942],
        [-0.0569977 ],
        [-0.06268743],
        [ 0.03840229],
        [-0.10149761],
        [-0.05480127],
        [-0.0520285 ],
        [-0.02274504],
        [ 0.0163683 ],
        [-0.14102428],
        [-0.04654095],
        [-0.01691679],
        [-0.07474874],
        [-0.01379828],
        [-0.08663928],
        [-0.05104

Batch sizes: 256, Number of Batches: 214

In [570]:
adam_nn_model(500, 256, 214) 

Cost after iteration 0: 2.303416
Cost after iteration 10: 0.534240
Cost after iteration 20: 0.335755
Cost after iteration 30: 0.268761
Cost after iteration 40: 0.226864
Cost after iteration 50: 0.251984
Cost after iteration 60: 0.172092
Cost after iteration 70: 0.105476
Cost after iteration 80: 0.190135
Cost after iteration 90: 0.139661
Cost after iteration 100: 0.153010
Cost after iteration 110: 0.137483
Cost after iteration 120: 0.153582
Cost after iteration 130: 0.157527
Cost after iteration 140: 0.137918
Cost after iteration 150: 0.115156
Cost after iteration 160: 0.130533
Cost after iteration 170: 0.195057
Cost after iteration 180: 0.100916
Cost after iteration 190: 0.133402
Cost after iteration 200: 0.101644
Cost after iteration 210: 0.107557


{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-0.07474856],
        [-0.11455916],
        [-0.10581506],
        [ 0.0443594 ],
        [-0.10696534],
        [-0.02307876],
        [-0.10038166],
        [-0.0313697 ],
        [ 0.03573976],
        [-0.07474843],
        [-0.06882836],
        [-0.07474916],
        [-0.07474825],
        [ 0.00741855],
        [-0.04316737],
        [-0.05273

Batch sizes: 1026, Number of Batches: 50

In [571]:
adam_nn_model(500, 1026, 50)

Cost after iteration 0: 2.303014
Cost after iteration 10: 0.577320
Cost after iteration 20: 0.274242
Cost after iteration 30: 0.193254
Cost after iteration 40: 0.185832


{'W1': array([[-0.00416758, -0.00056267, -0.02136196, ..., -0.00616844,
          0.00321336, -0.00946447],
        [-0.00530139, -0.01259207,  0.01677544, ..., -0.00328425,
         -0.00562311,  0.00117914],
        [ 0.00738638, -0.01587296,  0.001532  , ..., -0.00842856,
          0.01004047,  0.00054583],
        ...,
        [ 0.01007134, -0.00342119, -0.00861136, ..., -0.00939208,
          0.01302295,  0.00838285],
        [-0.00699008,  0.01134214, -0.00199053, ..., -0.00795476,
         -0.00139976, -0.00426493],
        [-0.00250374, -0.00127447,  0.0172375 , ...,  0.00355718,
          0.00222753,  0.00266074]]), 'b1': array([[-7.43536371e-02],
        [-4.68856497e-02],
        [-3.74760517e-02],
        [ 8.61917415e-03],
        [-1.20961075e-01],
        [-5.14157179e-02],
        [-3.72393969e-02],
        [ 6.83632674e-02],
        [ 2.83639192e-02],
        [-4.24367985e-02],
        [-4.76966372e-02],
        [ 9.37153599e-02],
        [-7.43536934e-02],
        [ 7