In [1]:
# We can technically create neural networks using only numpy, which exposes
# an n-dimensional array object and lots of optimized functions for
# manipulating these arrays. However, numpy arrays know nothing about
# computational graphs, deep learning, or gradients. Still, here's an
# example of fitting a 2-layer network to random data, implementing the
# forward and backward passes by hand, to train a neural network to
# predict y from x.

# A neural network is tasked with learning an appropriate
# INTERNAL representation so that it can take arbitrary
# data and map it to the correct output.

In [2]:
import numpy as np

# the batch size (number of datapoints)
N = 64

# the input "feature" dimension / layer
D_in = 1000

# the hidden dimension (# of nodes in the hidden layer)
H = 100

# output dimension / layer
D_out = 10

# Create random input/output data
x = np.random.randn(N, D_in)      # 64 x 1000
y = np.random.randn(N, D_out)     # 64 x 10

# Randomly initialize weights
w1 = np.random.randn(D_in, H)     # 1000 x 100 - input weights
w2 = np.random.randn(H, D_out)    # 100 x 10 - hidden layer weights

learning_rate = 1e-6
num_passes = 500

In [3]:
# perform forward and backwards passes
for t in range(num_passes):
    # the forward pass - propagate the inputs through the network
    # layer by layer until it reaches the output layer. To propagate
    # it through the first layer, multiply the inputs by the first
    # layer weights.
    
    # dot product -- cosine similarity - 64 x 100
    h = x.dot(w1)
    # don't allow negative numbers in array
    h_relu = np.maximum(h, 0)
    # prediction = x * weight * weight2
    y_pred = h_relu.dot(w2)
    
    loss = np.square(y_pred - y).sum()
    print('loss:', t, loss)
    
    # backpropagation - computes gradients of w1 and w2 w/r/t loss
    # compute error at the output and distribute it back to hidden layer
    # backprop requires known output for each input value so that you can 
    # compute the error (supervised). You can use whatever error function
    # you like, or you can even use a weighted sum of different error
    # functions for individual training examples.
    
    # gradient descent gives us the weights minimizing the error
    
    # 2 * error or squared error?
    grad_y_pred = 2.0 * (y_pred - y)
    # error * weighted/capped X
    # this gives us new adjustment for second sets of weights
    grad_w2 = h_relu.T.dot(grad_y_pred)
    # error gradient * weights 2
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    # no negatives again
    grad_h[h < 0] = 0
    # data * error gradient * weights 2
    # this gives us new adjustment for first set of weights
    grad_w1 = x.T.dot(grad_h)
    
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

loss: 0 41778463.3908
loss: 1 40572998.7816
loss: 2 38626629.3182
loss: 3 30137349.5522
loss: 4 18679640.5011
loss: 5 9576483.20226
loss: 6 4799231.20306
loss: 7 2671127.17678
loss: 8 1738119.51262
loss: 9 1276556.33142
loss: 10 1007632.17666
loss: 11 826406.804325
loss: 12 692319.845938
loss: 13 587738.778106
loss: 14 503601.869539
loss: 15 434655.826133
loss: 16 377497.622608
loss: 17 329639.652659
loss: 18 289289.501504
loss: 19 255028.537896
loss: 20 225766.350413
loss: 21 200623.09997
loss: 22 178933.258419
loss: 23 160128.024002
loss: 24 143739.349369
loss: 25 129381.822232
loss: 26 116759.035368
loss: 27 105623.534134
loss: 28 95765.140964
loss: 29 87006.2688523
loss: 30 79204.0645084
loss: 31 72233.1821844
loss: 32 65990.4926418
loss: 33 60384.4886444
loss: 34 55346.170914
loss: 35 50800.3177728
loss: 36 46690.3759852
loss: 37 42967.392981
loss: 38 39586.7819258
loss: 39 36511.2411272
loss: 40 33710.1194788
loss: 41 31156.6210181
loss: 42 28822.5415483
loss: 43 26688.7897666
lo

In [4]:
# loss converges on 0

In [5]:
# This gets super complicated when we have more than one hidden layer.
# Torch provides the implementation for forward/backward passes using
# automatic differentiation -- basically the generalization of backprop.
# torch.autograd does this. It defines a computational graph where
# nodes in the graph are tensors and edges are functions that produce
# output tensors from input tensors. This greatly facilitates backprop --
# allows you to easily compute gradients. 

# Variable represents a node in a computational graph.
# x = Variable()
# x.data // a tensor
# x.grad // holds the gradient of x

# Variables have same API as Tensors -- nearly any operation on a variable
# is one that can be done on a tensor; the difference is that using a
# variable defines a computational graph.

# Using torch to implement a 2-layer network:

In [6]:
import torch
from torch.autograd import Variable

In [12]:
dtype = torch.FloatTensor

# create the random input/output data -- these won't require gradients.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# we want to compute gradients w/r/t these during backprop
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

In [13]:
for t in range(500):
    # Forward pass: basically the same as numpy operations, but 
    # don't need to remember intermediate variables.
    # mm is matrix multiplication.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    
    # Compute loss -- loss.shape == (1,) (a Variable)
    # loss.data.shape == (1,) (a Tensor)
    # loss.data[0] is a scalar value holding the loss
    loss = (y_pred - y).pow(2).sum()
    
    print(t, loss.data[0])
    
    # Backward pass can be computed from any Variable throughout
    # the network associated with that Variable. Computes loss
    # gradient w/r/t all Variables with requires_grad=True. So
    # w1.grad and w2.grad will be Variables holding the gradient
    # of the loss w/r/t w1 and w2. It knows to do this since
    # loss is the output of y_pred and y, and y_pred is the product
    # of x, w1, w2. 
    loss.backward()
    
    # Update weights with gradient descent. w1.data and w2.data are
    # Tensors, w1.grad and w2.grad are Variables and w1.grad.data
    # and w2.grad.data are tensors.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data
    
    # Zero the gradients
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 29792916.0
1 28371110.0
2 32703822.0
3 37737352.0
4 37553976.0
5 29106612.0
6 17101122.0
7 8252016.0
8 3861294.25
9 2052118.375
10 1312504.0
11 970250.875
12 778663.5
13 650675.3125
14 554636.3125
15 477834.3125
16 414490.34375
17 361360.90625
18 316375.6875
19 277990.4375
20 245141.25
21 216942.765625
22 192529.8125
23 171336.984375
24 152865.1875
25 136702.96875
26 122503.7890625
27 110017.7265625
28 98992.0859375
29 89240.1640625
30 80598.0859375
31 72910.625
32 66063.9453125
33 59950.3046875
34 54486.09765625
35 49594.7265625
36 45216.48828125
37 41278.44140625
38 37730.6875
39 34531.2109375
40 31640.08203125
41 29023.005859375
42 26652.130859375
43 24498.955078125
44 22543.193359375
45 20763.0625
46 19141.328125
47 17662.58984375
48 16313.154296875
49 15079.87109375
50 13951.4541015625
51 12917.8193359375
52 11970.1591796875
53 11100.6982421875
54 10301.1669921875
55 9566.5146484375
56 8890.6904296875
57 8269.1064453125
58 7696.71435546875
59 7168.49560546875
60 6680.97314453125

In [14]:
# again, converges towards 0