In [1]:
# -*- coding: utf-8 -*-
import numpy as np

# Problem sizes:
#   N     - number of samples in the batch
#   D_in  - input feature dimension
#   H     - hidden layer width
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random starting weights for the two linear layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, output layer first.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 28005808.0754
1 22735762.9281
2 21880038.2703
3 21678092.9915
4 20212200.4664
5 16791178.4226
6 12315290.3447
7 8093030.73499
8 4991204.46118
9 3024958.2231
10 1884531.12623
11 1239860.81047
12 871808.536675
13 652303.600875
14 513372.231856
15 419201.32143
16 351190.963305
17 299333.688592
18 258159.203938
19 224504.986603
20 196435.433513
21 172689.40061
22 152466.013298
23 135056.119784
24 120002.930928
25 106898.540835
26 95449.8657839
27 85428.3309136
28 76627.6850596
29 68870.3966939
30 62013.6308258
31 55944.4238017
32 50554.1917213
33 45754.7942467
34 41472.0542557
35 37643.9106418
36 34216.2215478
37 31145.207345
38 28384.5867266
39 25900.1089451
40 23659.4191349
41 21636.5237248
42 19809.959943
43 18157.6868685
44 16659.3556481
45 15300.1900001
46 14066.3003051
47 12944.5142455
48 11922.8483751
49 10990.471241
50 10139.3892712
51 9361.5743681
52 8649.8789773
53 7998.33316039
54 7401.11584265
55 6853.00360065
56 6349.84937656
57 5887.49834718
58 5462.33985686
59 5071.1902140

433 1.69185176124e-05
434 1.61191996975e-05
435 1.53576923806e-05
436 1.46322028848e-05
437 1.394102508e-05
438 1.32826514295e-05
439 1.26552858564e-05
440 1.20577149099e-05
441 1.14882801934e-05
442 1.09457929409e-05
443 1.04289557096e-05
444 9.93655685339e-06
445 9.46749350889e-06
446 9.02057831798e-06
447 8.59477532454e-06
448 8.18906960162e-06
449 7.80257472629e-06
450 7.43427855432e-06
451 7.08350904531e-06
452 6.7492724563e-06
453 6.43079225549e-06
454 6.12736042343e-06
455 5.83826809866e-06
456 5.5628175741e-06
457 5.30041293743e-06
458 5.05042393921e-06
459 4.81218344829e-06
460 4.58521549727e-06
461 4.36894274475e-06
462 4.16291088801e-06
463 3.96661107963e-06
464 3.77959247085e-06
465 3.60137242893e-06
466 3.4315742882e-06
467 3.26977892849e-06
468 3.11562243959e-06
469 2.96877639787e-06
470 2.82883667869e-06
471 2.69550395549e-06
472 2.5684575915e-06
473 2.44740145342e-06
474 2.33206203725e-06
475 2.22216462122e-06
476 2.11745353996e-06
477 2.01770420513e-06
478 1.9226325933

In [2]:
# -*- coding: utf-8 -*-

import torch


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# Problem sizes:
#   N     - number of samples in the batch
#   D_in  - input feature dimension
#   H     - hidden layer width
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets, cast to the selected type.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Random starting weights for the two linear layers.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squares loss; the residual is reused by the backward pass.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, output layer first.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = torch.mm(x.t(), grad_h)

    # Plain gradient-descent step, applied in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33769147.302892685
1 27739827.97490984
2 22558099.447547153
3 16837037.398299932
4 11430001.33270958
5 7285172.190358217
6 4584081.005585887
7 2976298.8683479624
8 2042941.5248041302
9 1488414.081519029
10 1142329.4906526422
11 911817.1576672359
12 748532.8440033775
13 626911.899895114
14 532483.4056296572
15 457077.2756461989
16 395571.07430605445
17 344573.26045513205
18 301799.40557022626
19 265631.5811271028
20 234782.68368045066
21 208391.7060166318
22 185627.78689661788
23 165885.18195641052
24 148680.30014484195
25 133637.60436871194
26 120439.65138073533
27 108806.61295147211
28 98539.95524092065
29 89458.07246631663
30 81386.33380595912
31 74189.74687517327
32 67753.00331592222
33 61975.053927162575
34 56782.88347156579
35 52105.25394450477
36 47883.839342042396
37 44064.90606657462
38 40603.17601072097
39 37461.055882091096
40 34601.52898921934
41 31996.904053064878
42 29619.84281560858
43 27448.48497411218
44 25463.813929824177
45 23642.679121307854
46 21970.892394544586
4

In [7]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Variables during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Variables during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using the same operations as the
    # hand-written Tensor version. Intermediate values are not kept because
    # the backward pass is not implemented by hand here.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print the loss. The sum reduces to a single value and
    # loss.item() extracts it as a Python number. The older loss.data[0]
    # spelling raises IndexError on PyTorch releases where this result is a
    # 0-dim tensor; .item() requires PyTorch >= 0.4.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call computes the
    # gradient of loss with respect to everything created with
    # requires_grad=True; afterwards w1.grad and w2.grad hold the gradients
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. The update goes through .data so
    # that the step itself is not recorded by autograd.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights; backward()
    # accumulates into .grad rather than overwriting it.
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 30365948.0
1 23163580.0
2 20700956.0
3 19493002.0
4 17845288.0
5 15161301.0
6 11762465.0
7 8395171.0
8 5663332.5
9 3727158.0
10 2472422.0
11 1689735.0
12 1206268.125
13 901904.5
14 703550.0625
15 568115.9375
16 471137.5625
17 398398.53125
18 341713.0625
19 296157.6875
20 258642.8125
21 227234.09375
22 200602.3125
23 177828.140625
24 158205.640625
25 141209.515625
26 126411.7421875
27 113464.9375
28 102100.421875
29 92088.9375
30 83247.1484375
31 75413.578125
32 68449.7109375
33 62241.40234375
34 56693.21484375
35 51729.1953125
36 47273.890625
37 43267.9921875
38 39660.9375
39 36407.98046875
40 33467.234375
41 30803.7421875
42 28386.96875
43 26190.197265625
44 24190.220703125
45 22367.814453125
46 20705.740234375
47 19185.8515625
48 17794.55859375
49 16519.359375
50 15347.26953125
51 14270.4853515625
52 13279.73828125
53 12367.4287109375
54 11526.09375
55 10750.1630859375
56 10033.060546875
57 9370.12109375
58 8756.455078125
59 8187.56689453125
60 7660.291015625
61 7171.20654296875
62

425 0.00027269613929092884
426 0.00026651573716662824
427 0.00026025576516985893
428 0.00025412547984160483
429 0.0002484182477928698
430 0.00024283496895805
431 0.00023673080431763083
432 0.0002308328403159976
433 0.0002253867860417813
434 0.0002193998225266114
435 0.00021414284128695726
436 0.00020838674390688539
437 0.00020438447245396674
438 0.00019965390674769878
439 0.00019554294703993946
440 0.0001912963343784213
441 0.0001872500724857673
442 0.00018313471809960902
443 0.00017960750847123563
444 0.00017581881547812372
445 0.0001720029831631109
446 0.00016798442811705172
447 0.00016440921172033995
448 0.00016092057921923697
449 0.00015749655722174793
450 0.00015387889288831502
451 0.00015144891222007573
452 0.00014786123938392848
453 0.00014517686213366687
454 0.00014215509872883558
455 0.00013933549053035676
456 0.00013689063780475408
457 0.00013395243149716407
458 0.00013080306234769523
459 0.00012860730930697173
460 0.00012598122702911496
461 0.00012342692934907973
462 0.00012

In [8]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """
    Hand-written ReLU with an explicit gradient.

    Custom autograd Functions are created by subclassing
    torch.autograd.Function and providing static forward and backward
    methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return the input with every negative entry replaced by zero.

        ctx is the context object shared with backward; the input is stashed
        on it with ctx.save_for_backward so the backward pass can tell which
        entries were negative.
        """
        rectified = input.clamp(min=0)
        ctx.save_for_backward(input)
        return rectified

    @staticmethod
    def backward(ctx, grad_output):
        """
        Map the gradient of the loss with respect to the output onto the
        gradient of the loss with respect to the input: the incoming gradient
        is copied and then zeroed wherever the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input.masked_fill_(saved_input < 0, 0)
        return grad_input


dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu'. The alias does not change between iterations, so it is bound once
# here instead of on every pass through the loop.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y; the ReLU is computed by our custom
    # autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss. loss.item() extracts the value as a Python
    # number; the older loss.data[0] spelling raises IndexError on PyTorch
    # releases where the summed loss is a 0-dim tensor (.item() requires
    # PyTorch >= 0.4).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; going through .data keeps the
    # update out of the autograd graph.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()

0 27270500.0
1 27102458.0
2 33369380.0
3 41256488.0
4 43192844.0
5 33342540.0
6 18190846.0
7 7620986.5
8 3166070.5
9 1627746.0
10 1077557.625
11 831038.0625
12 685853.0
13 582374.4375
14 501070.125
15 434737.1875
16 379520.375
17 333006.5625
18 293535.84375
19 259815.46875
20 230841.140625
21 205801.59375
22 184062.96875
23 165107.359375
24 148548.40625
25 134001.390625
26 121177.75
27 109834.953125
28 99769.5234375
29 90804.3359375
30 82802.515625
31 75635.7734375
32 69206.25
33 63425.375
34 58208.21875
35 53499.2734375
36 49243.6171875
37 45385.578125
38 41879.40625
39 38690.28125
40 35783.73046875
41 33131.54296875
42 30709.345703125
43 28493.576171875
44 26463.640625
45 24601.51171875
46 22889.796875
47 21315.51953125
48 19866.533203125
49 18530.693359375
50 17298.51953125
51 16158.0419921875
52 15106.525390625
53 14133.5947265625
54 13232.583984375
55 12397.3134765625
56 11622.4619140625
57 10902.67578125
58 10233.9404296875
59 9612.80078125
60 9036.66015625
61 8499.6103515625
62 

464 0.0009991417173296213
465 0.0009695154149085283
466 0.000943863473366946
467 0.0009172391728498042
468 0.0008941550040617585
469 0.0008706487133167684
470 0.0008466849685646594
471 0.0008254779386334121
472 0.0008026270079426467
473 0.0007828273810446262
474 0.0007618936942890286
475 0.000743031152524054
476 0.0007237913669086993
477 0.0007043782388791442
478 0.0006860602297820151
479 0.000668743101414293
480 0.0006529595702886581
481 0.000636149721685797
482 0.0006215082248672843
483 0.0006048373761586845
484 0.0005910980398766696
485 0.0005770422867499292
486 0.0005625992198474705
487 0.0005496607045643032
488 0.0005365461693145335
489 0.0005237384466454387
490 0.0005105189047753811
491 0.0004978534416295588
492 0.0004866464005317539
493 0.0004764967306982726
494 0.00046551739796996117
495 0.00045521953143179417
496 0.00044536733184941113
497 0.00043525840737856925
498 0.00042617417057044804
499 0.0004164920828770846


In [10]:
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np

# Stage 1: describe the computation as a graph. Nothing numeric runs yet.

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the input and target data; real values are
# supplied when the graph is executed.
x = tf.placeholder(tf.float32, shape=(None, D_in))
y = tf.placeholder(tf.float32, shape=(None, D_out))

# Weights are Variables initialized from random normal data. A TensorFlow
# Variable persists its value across executions of the graph.
w1 = tf.Variable(tf.random_normal((D_in, H)))
w2 = tf.Variable(tf.random_normal((H, D_out)))

# Forward pass expressed with TensorFlow Tensor operations. These lines only
# add nodes to the graph; they do not compute anything by themselves.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss, also as a graph node.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Gradient of the loss with respect to w1 and w2.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent update. The weights only change when new_w1 and new_w2 are
# evaluated while executing the graph. Note that in TensorFlow the act of
# updating the weights is part of the computational graph; in PyTorch this
# happens outside the computational graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Stage 2: open a TensorFlow session and actually execute the graph.
with tf.Session() as sess:
    # Run the graph once to initialize the Variables w1 and w2.
    sess.run(tf.global_variables_initializer())

    # Numpy arrays holding the actual data for the inputs x and targets y.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)

    # The same tensors are fetched and the same data is fed on every step, so
    # both are assembled once up front. feed_dict binds x_value to x and
    # y_value to y.
    fetches = [loss, new_w1, new_w2]
    feed = {x: x_value, y: y_value}
    for _ in range(500):
        # Each execution evaluates loss, new_w1 and new_w2 and returns their
        # values as numpy arrays; only the loss (first entry) is reported.
        loss_value = sess.run(fetches, feed_dict=feed)[0]
        print(loss_value)

3.36271e+07
2.88202e+07
2.83534e+07
2.72048e+07
2.31166e+07
1.6599e+07
1.03103e+07
5.87176e+06
3.34932e+06
2.0355e+06
1.35711e+06
987091.0
766194.0
620800.0
516671.0
437391.0
374521.0
323173.0
280519.0
244702.0
214378.0
188584.0
166470.0
147368.0
130805.0
116394.0
103804.0
92771.8
83078.7
74542.0
67000.3
60320.4
54402.7
49137.7
44442.3
40248.8
36497.2
33136.4
30119.5
27409.8
24969.4
22767.2
20777.4
18977.8
17349.0
15872.5
14532.4
13315.5
12209.3
11202.7
10285.7
9449.96
8688.42
7993.37
7358.31
6777.64
6246.25
5759.74
5314.41
4906.47
4532.15
4188.43
3872.76
3582.53
3315.54
3069.81
2843.47
2634.94
2442.67
2265.38
2101.72
1950.69
1811.23
1682.28
1563.08
1452.82
1350.82
1256.42
1169.02
1088.05
1013.0
943.438
878.924
819.104
763.574
712.021
664.132
619.656
578.323
539.886
504.128
470.866
439.919
411.102
384.277
359.277
335.982
314.27
294.029
275.159
257.552
241.127
225.894
211.761
198.558
186.217
174.688
163.907
153.822
144.39
135.561
127.301
119.563
112.321
105.533
99.1757
93.2181
87.6348
8

In [11]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal parameters for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we use Mean Squared Error (MSE) as our loss function.
# reduction='sum' sums the squared errors instead of averaging them; it is the
# replacement for the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions.
    y_pred = model(x)

    # Compute and print loss. loss.item() extracts the value as a Python
    # number; the older loss.data[0] spelling raises IndexError on PyTorch
    # releases where the loss is a 0-dim tensor (.item() requires
    # PyTorch >= 0.4).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent, going through .data so the
    # update itself is not recorded by autograd.
    for param in model.parameters():
        param.data -= learning_rate * param.grad.data

0 604.5413818359375
1 561.2833862304688
2 523.7066650390625
3 490.4233093261719
4 460.53338623046875
5 433.3995666503906
6 408.4915771484375
7 385.5908203125
8 364.306640625
9 344.47552490234375
10 325.89404296875
11 308.5115966796875
12 292.16644287109375
13 276.7727355957031
14 262.2524108886719
15 248.52316284179688
16 235.56240844726562
17 223.18609619140625
18 211.3828125
19 200.12657165527344
20 189.37452697753906
21 179.06813049316406
22 169.193115234375
23 159.78128051757812
24 150.8151397705078
25 142.26405334472656
26 134.1456756591797
27 126.45195770263672
28 119.1596450805664
29 112.24336242675781
30 105.68647003173828
31 99.4938735961914
32 93.62181091308594
33 88.0911636352539
34 82.86775207519531
35 77.9398422241211
36 73.30531311035156
37 68.94452667236328
38 64.84710693359375
39 60.99868392944336
40 57.385169982910156
41 53.9847412109375
42 50.79265594482422
43 47.797271728515625
44 44.97994613647461
45 42.334999084472656
46 39.852867126464844
47 37.52044677734375
48 3

356 0.00019142887322232127
357 0.00018520290905144066
358 0.00017918043886311352
359 0.00017336201563011855
360 0.00016774061077740043
361 0.00016229719039984047
362 0.000157029673573561
363 0.00015192960563581437
364 0.00014699780149385333
365 0.00014223474136088043
366 0.00013763285824097693
367 0.00013316738477442414
368 0.00012885734031442553
369 0.00012467356282286346
370 0.00012063661415595561
371 0.00011672863183775917
372 0.00011294829891994596
373 0.0001092949096346274
374 0.00010575865599093959
375 0.00010233304055873305
376 9.902478632284328e-05
377 9.582214988768101e-05
378 9.272134047932923e-05
379 8.972568321041763e-05
380 8.682540646987036e-05
381 8.402482490055263e-05
382 8.131121285259724e-05
383 7.868631655583158e-05
384 7.614360220031813e-05
385 7.368634396698326e-05
386 7.13061963324435e-05
387 6.900783773744479e-05
388 6.678055797237903e-05
389 6.462722376454622e-05
390 6.254300387809053e-05
391 6.0525482695084065e-05
392 5.857379801454954e-05
393 5.668720041285269

In [12]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction='sum' sums the squared errors instead of averaging them; it is the
# replacement for the deprecated size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which parameters it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss. loss.item() extracts the value as a Python
    # number; the older loss.data[0] spelling raises IndexError on PyTorch
    # releases where the loss is a 0-dim tensor (.item() requires
    # PyTorch >= 0.4).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the parameters it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 701.5176391601562
1 683.686767578125
2 666.367431640625
3 649.4774780273438
4 633.1680908203125
5 617.3645629882812
6 602.1714477539062
7 587.4563598632812
8 573.0671997070312
9 559.0860595703125
10 545.4544067382812
11 532.2288208007812
12 519.4381713867188
13 507.0624084472656
14 495.1191101074219
15 483.52239990234375
16 472.25860595703125
17 461.33990478515625
18 450.69482421875
19 440.3429260253906
20 430.3147277832031
21 420.54010009765625
22 411.0066223144531
23 401.7523193359375
24 392.7303466796875
25 383.99005126953125
26 375.47381591796875
27 367.13311767578125
28 359.0373840332031
29 351.11932373046875
30 343.3945007324219
31 335.8393859863281
32 328.4432067871094
33 321.2178039550781
34 314.1488952636719
35 307.2657165527344
36 300.5420837402344
37 293.9593505859375
38 287.5122985839844
39 281.19732666015625
40 275.0151062011719
41 268.96832275390625
42 263.03765869140625
43 257.224609375
44 251.52333068847656
45 245.91842651367188
46 240.42181396484375
47 235.0494537353

373 9.701394446892664e-05
374 9.182669600704685e-05
375 8.692045230418444e-05
376 8.223496843129396e-05
377 7.781106978654861e-05
378 7.361898315139115e-05
379 6.96443603374064e-05
380 6.58768112771213e-05
381 6.230540020624176e-05
382 5.892527406103909e-05
383 5.571862493525259e-05
384 5.268226595944725e-05
385 4.980642916052602e-05
386 4.708217966253869e-05
387 4.4501128286356106e-05
388 4.2058771214215085e-05
389 3.974361970904283e-05
390 3.7554851587628946e-05
391 3.548131280695088e-05
392 3.3518121199449524e-05
393 3.1660594686400145e-05
394 2.990268694702536e-05
395 2.823919385264162e-05
396 2.666457658051513e-05
397 2.5175331757054664e-05
398 2.3766950107528828e-05
399 2.243388917122502e-05
400 2.1175221263547428e-05
401 1.9982106095994823e-05
402 1.8856546375900507e-05
403 1.779151352820918e-05
404 1.678293847362511e-05
405 1.5832109056646004e-05
406 1.4932480553397909e-05
407 1.4082490451983176e-05
408 1.3279674931254704e-05
409 1.2520165000751149e-05
410 1.1805179383372888e-0

In [13]:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable


class TwoLayerNet(torch.nn.Module):
    """Two nn.Linear layers with a clamp-at-zero (ReLU) in between."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear modules and store them as attributes:
        linear1 maps D_in -> H and linear2 maps H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Take the input data and return the output data. The Modules created in
        the constructor are combined here with an ordinary tensor operation
        (clamping the hidden activations at zero).
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction='sum' sums the squared errors instead of averaging them; it is the
# replacement for the deprecated size_average=False argument.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. loss.item() extracts the value as a Python
    # number; the older loss.data[0] spelling raises IndexError on PyTorch
    # releases where the loss is a 0-dim tensor (.item() requires
    # PyTorch >= 0.4).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 653.1563720703125
1 607.1088256835938
2 567.3455810546875
3 532.3733520507812
4 500.9659729003906
5 472.1373291015625
6 445.85565185546875
7 421.60260009765625
8 399.0929260253906
9 378.32208251953125
10 358.8457336425781
11 340.457763671875
12 323.2088928222656
13 306.8302307128906
14 291.2291564941406
15 276.3922119140625
16 262.2750549316406
17 248.8591766357422
18 235.9830780029297
19 223.6206817626953
20 211.81011962890625
21 200.54074096679688
22 189.79006958007812
23 179.48980712890625
24 169.6558837890625
25 160.2659149169922
26 151.32931518554688
27 142.80374145507812
28 134.6969757080078
29 126.98110961914062
30 119.664794921875
31 112.73690795898438
32 106.17303466796875
33 99.94434356689453
34 94.03459930419922
35 88.43726348876953
36 83.13970184326172
37 78.13540649414062
38 73.407958984375
39 68.94552612304688
40 64.73893737792969
41 60.77462387084961
42 57.039371490478516
43 53.52426528930664
44 50.220489501953125
45 47.11456298828125
46 44.20143508911133
47 41.4637298

402 5.548599438043311e-05
403 5.3948282584315166e-05
404 5.245090869721025e-05
405 5.10012214363087e-05
406 4.9588619731366634e-05
407 4.821844777325168e-05
408 4.688254193752073e-05
409 4.559004446491599e-05
410 4.432866262504831e-05
411 4.310575241106562e-05
412 4.19142706959974e-05
413 4.0757695387583226e-05
414 3.963395647588186e-05
415 3.854039096040651e-05
416 3.7480160244740546e-05
417 3.644719618023373e-05
418 3.544273567968048e-05
419 3.4469379897927865e-05
420 3.351965642650612e-05
421 3.2599244150333107e-05
422 3.170393392792903e-05
423 3.083159026573412e-05
424 2.9986560548422858e-05
425 2.9162445571273565e-05
426 2.836126986949239e-05
427 2.7583908376982436e-05
428 2.6828238333109766e-05
429 2.6091631298186257e-05
430 2.537755608500447e-05
431 2.468212551320903e-05
432 2.400707126071211e-05
433 2.334915188839659e-05
434 2.27098171308171e-05
435 2.2089076082920656e-05
436 2.1485178876901045e-05
437 2.089836743834894e-05
438 2.032862357737031e-05
439 1.9773213352891617e-05
4

In [14]:
# -*- coding: utf-8 -*-
import random
import torch
from torch.autograd import Variable


class DynamicNet(torch.nn.Module):
    """Network whose depth is re-drawn at random on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear instances used by forward:
        input_linear (D_in -> H), middle_linear (H -> H) and
        output_linear (H -> D_out).
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then apply middle_linear 0, 1, 2 or 3 times (the
        count is drawn with random.randint on each call), then run the output
        layer. Every hidden representation is clamped at zero.

        Because the computation graph is built anew on each forward pass,
        ordinary Python control flow such as this loop can be used to define
        the model.

        The same middle_linear Module is applied repeatedly within one pass,
        so its weights are shared across those applications. The original
        tutorial notes this as an improvement over Lua Torch, where each
        Module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        repeats = random.randint(0, 3)
        for _ in range(repeats):
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' sums the squared errors instead of averaging them; it is the
# replacement for the deprecated size_average=False argument.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss. loss.item() extracts the value as a Python
    # number; the older loss.data[0] spelling raises IndexError on PyTorch
    # releases where the loss is a 0-dim tensor (.item() requires
    # PyTorch >= 0.4).
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 630.86767578125
1 614.9512329101562
2 611.9270629882812
3 665.2872314453125
4 609.6676635742188
5 607.7323608398438
6 580.9677734375
7 603.9326782226562
8 592.02099609375
9 554.498046875
10 543.1137084960938
11 598.46337890625
12 582.7281494140625
13 579.7453002929688
14 498.099609375
15 484.9487609863281
16 425.95062255859375
17 591.22998046875
18 436.9331970214844
19 417.30450439453125
20 583.7235107421875
21 535.933349609375
22 523.7632446289062
23 329.9150085449219
24 489.6147155761719
25 543.329345703125
26 525.450439453125
27 245.59120178222656
28 477.41571044921875
29 356.23345947265625
30 322.47283935546875
31 289.1954040527344
32 260.161376953125
33 236.4407196044922
34 220.3872528076172
35 224.02645874023438
36 204.3826904296875
37 278.6517333984375
38 171.3938446044922
39 171.10311889648438
40 126.42301177978516
41 121.25909423828125
42 118.66324615478516
43 253.61463928222656
44 165.04312133789062
45 216.13174438476562
46 206.43040466308594
47 193.8282012939453
48 157.108

416 0.4893881380558014
417 2.986396312713623
418 1.0999207496643066
419 1.5423343181610107
420 2.08798885345459
421 1.3571475744247437
422 1.533272385597229
423 1.342713475227356
424 0.5374673008918762
425 5.246868133544922
426 0.6563256978988647
427 0.7880129218101501
428 0.7686330080032349
429 1.8898688554763794
430 1.4874905347824097
431 0.24756966531276703
432 0.8844889402389526
433 1.5502405166625977
434 1.658102035522461
435 0.8434939384460449
436 0.5940487384796143
437 0.7261455655097961
438 0.6902121901512146
439 0.3482138514518738
440 1.4098479747772217
441 3.853153944015503
442 0.3057435154914856
443 1.274241328239441
444 0.7506057024002075
445 0.5837569236755371
446 0.3217307925224304
447 5.976569175720215
448 0.5610032677650452
449 1.4987308979034424
450 2.8660330772399902
451 1.744945764541626
452 0.9455666542053223
453 1.7949047088623047
454 0.9603798389434814
455 4.269947528839111
456 1.4269940853118896
457 1.1003493070602417
458 2.006842851638794
459 0.7445907592773438
