# Full implementation of training a 2-layer Neural Network

In [2]:
import numpy as np
from numpy.random import randn

In [3]:
# Train a 2-layer network (sigmoid hidden layer, linear output) on random
# data with full-batch gradient descent, using hand-written backprop.

# --- Configuration ---------------------------------------------------------
SEED = 0               # seeds the global NumPy RNG that `randn` draws from
NUM_STEPS = 2000       # number of gradient-descent iterations
LEARNING_RATE = 1e-4   # step size applied to both weight matrices
PRINT_EVERY = 1        # print the loss every k steps; raise (e.g. to 100) to shorten the output

# N: number of samples, D_in: input width, H: hidden units, D_out: output width
N, D_in, H, D_out = 64, 1000, 100, 10

# --- Data and initial weights ----------------------------------------------
# Seed first so that Restart & Run All reproduces the same data, weights
# and therefore the same loss curve.
np.random.seed(SEED)
x, y = randn(N, D_in), randn(N, D_out)      # inputs (N, D_in), targets (N, D_out)
w1, w2 = randn(D_in, H), randn(H, D_out)    # layer weights (no bias terms)

# --- Training loop -----------------------------------------------------------
losses = []   # loss at every step, kept so later cells can plot or inspect it
for t in range(NUM_STEPS):
    # Forward pass
    h = 1 / (1 + np.exp(-x.dot(w1)))   # sigmoid activation, shape (N, H)
    y_pred = h.dot(w2)                 # linear output layer, shape (N, D_out)

    # Loss: sum of squared errors over all samples and outputs.
    # (It is a sum, not a mean; the gradient below matches the sum.)
    loss = np.square(y_pred - y).sum()
    losses.append(loss)
    if t % PRINT_EVERY == 0:
        print(t, loss)

    # Backprop (chain rule, from the loss back to each weight matrix)
    grad_y_pred = 2.0 * (y_pred - y)            # d loss / d y_pred
    grad_w2 = h.T.dot(grad_y_pred)              # d loss / d w2
    grad_h = grad_y_pred.dot(w2.T)              # d loss / d h
    grad_w1 = x.T.dot(grad_h * h * (1 - h))     # sigmoid'(z) = h * (1 - h)

    # Gradient descent update (in place; both gradients were computed from
    # the pre-update weights above)
    w1 -= LEARNING_RATE * grad_w1
    w2 -= LEARNING_RATE * grad_w2

0 40582.518729573014
1 23628.020627415026
2 17054.32736660507
3 14086.160991911533
4 12521.758915656852
5 11585.276484026448
6 10938.617070972414
7 10415.939591907936
8 9950.55871191499
9 9580.327802407888
10 9240.495302079777
11 8931.73454736704
12 8650.610477279872
13 8387.76512411967
14 8149.52221386482
15 7953.565928025245
16 7769.140685617411
17 7591.000248412252
18 7421.041912833047
19 7258.524914359693
20 7098.321734269449
21 6935.216827668453
22 6771.079079827761
23 6620.701239432324
24 6486.353933383062
25 6359.369139791648
26 6238.372640768698
27 6122.950134810051
28 6012.898472958468
29 5908.218375170874
30 5807.912826977564
31 5710.966844011865
32 5616.435480221088
33 5523.47002945088
34 5431.088783631889
35 5338.261930556615
36 5244.116707088877
37 5148.67994303062
38 5054.433850829831
39 4967.188045657903
40 4887.339532191131
41 4812.252667280097
42 4740.197197377012
43 4670.257049211284
44 4602.092914524719
45 4535.567692709827
46 4470.660256950201
47 4407.40769206082
48