In [37]:
# Code in file tensor/two_layer_net_numpy.py
import numpy as np

# Problem dimensions: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Synthetic inputs and regression targets, drawn from a standard normal.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Weight matrices of the two linear layers, also standard-normal initialised.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: linear -> ReLU -> linear.
  h = x @ w1
  h_relu = np.maximum(h, 0)
  y_pred = h_relu @ w2

  # Sum of squared errors over every element of the prediction; the residual
  # is kept because the backward pass needs it as well.
  residual = y_pred - y
  loss = np.square(residual).sum()
  print(t, loss)

  # Backward pass: apply the chain rule by hand, from the loss back to w1.
  grad_y_pred = 2.0 * residual
  grad_w2 = h_relu.T @ grad_y_pred
  grad_h_relu = grad_y_pred @ w2.T
  # ReLU lets the gradient through only where its input was not negative.
  grad_h = np.where(h < 0, 0.0, grad_h_relu)
  grad_w1 = x.T @ grad_h

  # Plain gradient-descent step, applied in place.
  w2 -= learning_rate * grad_w2
  w1 -= learning_rate * grad_w1

0 28941888.2903891
1 26592034.862033214
2 28193212.179164015
3 29333673.666476343
4 26771884.854327887
5 20218341.013178155
6 12661029.948719759
7 6974211.9949134495
8 3736679.5554723255
9 2125513.8786284714
10 1347448.0555756784
11 953085.8284064902
12 732404.5462561375
13 593560.9911913229
14 496430.0927308339
15 423047.2893706238
16 364879.95916152187
17 317319.43318261096
18 277789.00783888844
19 244492.8157174144
20 216210.52569681683
21 192030.26455266506
22 171198.78280751157
23 153175.80301703984
24 137508.7771000133
25 123825.0424656576
26 111825.93277440229
27 101256.51023143646
28 91937.23581462004
29 83679.18713478258
30 76334.11144038086
31 69775.86823203487
32 63909.931415881576
33 58644.98123269838
34 53910.03494298072
35 49643.213981815374
36 45786.172724889635
37 42294.1755689638
38 39124.749065921
39 36248.08794274322
40 33627.403665113685
41 31234.13069371683
42 29044.27632025287
43 27036.994875234323
44 25195.40736793319
45 23502.890828284428
46 21944.772138599248
4

In [39]:
# Code in file tensor/two_layer_net_tensor.py
import torch

# Use the GPU when CUDA is available and fall back to the CPU otherwise, so
# the cell also runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data directly on the selected device
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialize weights on the same device as the data
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y (linear -> ReLU via clamp -> linear)
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
  # of shape (); we can get its value as a Python number with loss.item().
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  # Clone before masking so grad_h_relu itself is left untouched; the gradient
  # is zeroed wherever the ReLU input was negative.
  grad_h = grad_h_relu.clone()
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

  # Update weights using gradient descent (in place)
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 35918020.0
1 42317172.0
2 58148144.0
3 66652096.0
4 49997160.0
5 20707976.0
6 5838780.5
7 2200260.25
8 1392803.875
9 1091804.125
10 903945.5
11 761458.375
12 647321.875
13 554303.5625
14 477621.5625
15 414041.375
16 360946.65625
17 316188.25
18 278288.1875
19 245974.75
20 218220.4375
21 194207.109375
22 173385.125
23 155236.40625
24 139355.75
25 125409.75
26 113109.34375
27 102255.234375
28 92632.75
29 84094.171875
30 76486.8984375
31 69684.7109375
32 63585.70703125
33 58112.0546875
34 53187.24609375
35 48744.375
36 44733.03515625
37 41103.0703125
38 37813.296875
39 34827.75
40 32115.84765625
41 29647.66015625
42 27398.396484375
43 25344.009765625
44 23466.466796875
45 21749.203125
46 20175.01953125
47 18731.64453125
48 17405.259765625
49 16186.4150390625
50 15064.0986328125
51 14030.23046875
52 13077.7734375
53 12198.7275390625
54 11386.640625
55 10635.787109375
56 9941.052734375
57 9297.3759765625
58 8701.1689453125
59 8147.72509765625
60 7634.07666015625
61 7157.26416015625
62 671

In [44]:
# # Code in file autograd/tf_two_layer_net.py
# import tensorflow as tf
# import numpy as np
# 
# # First we set up the computational graph:
# 
# # N is batch size; D_in is input dimension;
# # H is hidden dimension; D_out is output dimension.
# N, D_in, H, D_out = 64, 1000, 100, 10
# 
# # Create placeholders for the input and target data; these will be filled
# # with real data when we execute the graph.
# x = tf.keras.Input(dtype = tf.float32, shape=(None, D_in))
# y = tf.keras.Input(dtype = tf.float32, shape=(None, D_out))
# 
# # Create Variables for the weights and initialize them with random data.
# # A TensorFlow Variable persists its value across executions of the graph.
# w1 = tf.Variable(tf.random.normal((D_in, H)))
# w2 = tf.Variable(tf.random.normal((H, D_out)))
# 
# # Forward pass: Compute the predicted y using operations on TensorFlow Tensors.
# # Note that this code does not actually perform any numeric operations; it
# # merely sets up the computational graph that we will later execute.
# h = tf.matmul(x, w1)
# h_relu = tf.maximum(h, tf.zeros(1))
# y_pred = tf.matmul(h_relu, w2)
# 
# # Compute loss using operations on TensorFlow Tensors
# loss = tf.reduce_sum((y - y_pred) ** 2.0)
# 
# # Compute gradient of the loss with respect to w1 and w2.
# grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])
# 
# # Update the weights using gradient descent. To actually update the weights
# # we need to evaluate new_w1 and new_w2 when executing the graph. Note that
# # in TensorFlow the act of updating the value of the weights is part of
# # the computational graph; in PyTorch this happens outside the computational
# # graph.
# learning_rate = 1e-6
# new_w1 = w1.assign(w1 - learning_rate * grad_w1)
# new_w2 = w2.assign(w2 - learning_rate * grad_w2)
# 
# # Now we have built our computational graph, so we enter a TensorFlow session to
# # actually execute the graph.
# # Run the graph once to initialize the Variables w1 and w2.
# optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
# 
# x_value = np.random.randn(N, D_in)
# y_value = np.random.randn(N, D_out)
# 
# # Create numpy arrays holding the actual data for the inputs x and targets y
# for _ in range(500):
#     with tf.GradientTape() as tape:
#       h= tf.matmul(x, w1)
#       h_relu = tf.maximum(h, tf.zeros(1))
#       y_pred = tf.matmul(h_relu, w2)
#       loss = tf.reduce_sum((y - y_pred) ** 2.0)
#     
#     gradients = tape.gradient(loss, [w1, w2])
#     optimizer.apply_gradients(zip(gradients, [w1, w2]))
#     print(loss.numpy())



RuntimeError: tf.gradients is not supported when eager execution is enabled. Use tf.GradientTape instead.

In [46]:
# Code in file nn/two_layer_net_nn.py
import torch

# Use the GPU when CUDA is available and fall back to the CPU otherwise, so
# the cell also runs on machines without a GPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# Here the stack is three Linear layers with a ReLU between each pair.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        ).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function. Setting
# reduction='sum' means that we are computing the *sum* of squared errors rather
# than the mean; this is for consistency with the examples above where we
# manually compute the loss, but in practice it is more common to use mean
# squared error as a loss by setting reduction='mean'.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Tensor of input data to the Module and it produces
  # a Tensor of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Tensors containing the predicted and true
  # values of y, and the loss function returns a Tensor containing the loss.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Tensors with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. The in-place update is done
  # directly on each parameter inside torch.no_grad(), which keeps the update
  # itself out of the autograd graph; going through .data is not needed.
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad

0 693.6939086914062
1 683.6755981445312
2 674.2957763671875
3 665.5421142578125
4 657.3455200195312
5 649.4089965820312
6 641.779541015625
7 634.5535888671875
8 627.5890502929688
9 620.905517578125
10 614.453369140625
11 608.1475219726562
12 601.9097290039062
13 595.731689453125
14 589.6025390625
15 583.4493408203125
16 577.2836303710938
17 571.1387939453125
18 565.0403442382812
19 558.940185546875
20 552.7813720703125
21 546.600341796875
22 540.3707275390625
23 534.0579223632812
24 527.7005615234375
25 521.2633056640625
26 514.8046264648438
27 508.23272705078125
28 501.5549011230469
29 494.79278564453125
30 487.9367980957031
31 480.9544982910156
32 473.8597412109375
33 466.6923522949219
34 459.3495178222656
35 451.9526672363281
36 444.42620849609375
37 436.7837829589844
38 429.0760498046875
39 421.26031494140625
40 413.396484375
41 405.40777587890625
42 397.36578369140625
43 389.2261962890625
44 381.0356140136719
45 372.7848205566406
46 364.4912414550781
47 356.1532287597656
48 347.81

In [47]:
# Code in file nn/two_layer_net_optim.py
import torch

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Model: three Linear layers with a ReLU between each pair, collected in a
# list and wrapped in nn.Sequential. Loss: summed squared error.
layers = [
  torch.nn.Linear(D_in, H),
  torch.nn.ReLU(),
  torch.nn.Linear(H, H),
  torch.nn.ReLU(),
  torch.nn.Linear(H, D_out),
]
model = torch.nn.Sequential(*layers)
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optimizer performs the weight updates for us. Adam is used here; it is
# handed the model's parameters, which are the Tensors it is allowed to change.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
  # Clear the gradients held by the optimizer's parameters so that the
  # backward pass below does not accumulate onto the previous iteration's.
  optimizer.zero_grad()

  # Forward pass, then report the loss for this iteration.
  y_pred = model(x)
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Backward pass fills in .grad for every model parameter ...
  loss.backward()

  # ... and step() applies one optimizer update using those gradients.
  optimizer.step()

0 612.4791259765625
1 605.2576293945312
2 598.3970947265625
3 591.7305297851562
4 585.2027587890625
5 578.8516845703125
6 572.7098999023438
7 566.7997436523438
8 561.0980224609375
9 555.5546875
10 550.2174682617188
11 544.9686889648438
12 539.8546142578125
13 534.8262939453125
14 529.9065551757812
15 525.0681762695312
16 520.29443359375
17 515.600830078125
18 510.95013427734375
19 506.3228454589844
20 501.7575988769531
21 497.225830078125
22 492.7273864746094
23 488.2712097167969
24 483.81451416015625
25 479.34039306640625
26 474.8628234863281
27 470.3786315917969
28 465.8890380859375
29 461.4114685058594
30 456.94708251953125
31 452.45941162109375
32 447.9401550292969
33 443.4000244140625
34 438.8351135253906
35 434.2449035644531
36 429.669189453125
37 425.08294677734375
38 420.47161865234375
39 415.84014892578125
40 411.2151184082031
41 406.58160400390625
42 401.9275207519531
43 397.2799072265625
44 392.625
45 387.95062255859375
46 383.26025390625
47 378.567626953125
48 373.864562988

In [51]:
# Code in file nn/two_layer_net_module.py
import torch

class TwoLayerNet(torch.nn.Module):
  """Two nn.Linear layers with a ReLU (implemented as clamp(min=0)) between them."""

  def __init__(self, D_in, H, D_out):
    """
    Create the two linear layers and register them as submodules.

    D_in is the input width, H the hidden width and D_out the output width.
    """
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Map a Tensor of inputs to a Tensor of predictions.

    The input goes through linear1, negative activations are clamped to zero,
    and the result goes through linear2.
    """
    hidden = self.linear1(x)
    activated = hidden.clamp(min=0)
    return self.linear2(activated)

# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the custom Module defined above.
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

# Summed squared error as the loss, and plain SGD over the model's parameters
# (the weights and biases of its nn.Linear submodules).
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
  # Start each iteration from clean gradients.
  optimizer.zero_grad()

  # Forward pass and loss report.
  y_pred = model(x)
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Backward pass, then one SGD update.
  loss.backward()
  optimizer.step()


0 640.0499877929688
1 592.2871704101562
2 551.427734375
3 515.4647827148438
4 483.7063903808594
5 455.10589599609375
6 429.28814697265625
7 405.7327575683594
8 384.0366516113281
9 363.7674255371094
10 344.8120422363281
11 327.03045654296875
12 310.298583984375
13 294.44781494140625
14 279.47308349609375
15 265.2679443359375
16 251.7413330078125
17 238.86087036132812
18 226.6383819580078
19 214.96165466308594
20 203.7803955078125
21 193.09146118164062
22 182.90599060058594
23 173.15521240234375
24 163.8271942138672
25 154.90084838867188
26 146.3857879638672
27 138.24411010742188
28 130.5223846435547
29 123.19560241699219
30 116.22514343261719
31 109.62008666992188
32 103.36155700683594
33 97.43389892578125
34 91.8178939819336
35 86.49102020263672
36 81.4552001953125
37 76.69800567626953
38 72.2008056640625
39 67.95809173583984
40 63.94975280761719
41 60.173179626464844
42 56.61274719238281
43 53.26205825805664
44 50.10822677612305
45 47.13569641113281
46 44.34131622314453
47 41.71448516

In [50]:
# Code in file nn/dynamic_net.py
import random
import torch

class DynamicNet(torch.nn.Module):
  """Network whose depth is re-drawn at random on every forward pass."""

  def __init__(self, D_in, H, D_out):
    """
    Build the three nn.Linear layers used by forward: an input layer, a middle
    layer that may be applied repeatedly, and an output layer.
    """
    super().__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """
    Compute predictions with a randomly chosen number of hidden layers.

    A count of 0, 1, 2 or 3 is drawn with random.randint, and middle_linear
    (followed by a clamp at zero) is applied that many times between the input
    and output layers. Every application shares the same middle_linear weights.

    Ordinary Python control flow can be used here because the computation graph
    is rebuilt on each forward pass, and a Module may be reused several times
    within one graph.
    """
    hidden = self.input_linear(x).clamp(min=0)
    repeats = random.randint(0, 3)
    for _ in range(repeats):
      hidden = self.middle_linear(hidden).clamp(min=0)
    return self.output_linear(hidden)


# Problem sizes: N = batch size, D_in = input width,
# H = hidden width, D_out = output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Instantiate the randomly-deep network defined above.
model = DynamicNet(D_in=D_in, H=H, D_out=D_out)

# Summed squared error as the loss. Plain SGD struggles with this model, so
# the optimizer is SGD with momentum.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
  # Start each iteration from clean gradients.
  optimizer.zero_grad()

  # Forward pass (the depth is re-drawn on every call) and loss report.
  y_pred = model(x)
  loss = criterion(y_pred, y)
  print(t, loss.item())

  # Backward pass, then one momentum-SGD update.
  loss.backward()
  optimizer.step()