In [3]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Seed PyTorch's global random number generator so that a fresh
# "Restart Kernel -> Run All" produces the same data (and, for cells run
# afterwards in order, the same initial weights and loss curve).
SEED = 0
torch.manual_seed(SEED)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold the inputs (x) and the targets (y).
# requires_grad is not passed, so it keeps its default of False: autograd
# will not compute gradients with respect to these Tensors during the
# backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

In [10]:
# Weight matrices of the two-layer network, drawn from a standard normal.
# requires_grad=True tells autograd to record every operation involving
# them so that their gradients can be computed in the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
print(f"{learning_rate:.9f}")
for t in range(500):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer.
    # Intermediate activations are not stored in variables; autograd keeps
    # whatever it needs for the backward pass.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors over the whole batch. The result is a
    # zero-dimensional (scalar) Tensor; loss.item() extracts the plain
    # Python number it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Backward pass: autograd fills w1.grad and w2.grad with the gradient
    # of the loss with respect to each weight Tensor (every Tensor created
    # with requires_grad=True that took part in the computation).
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad()
    # because the weights have requires_grad=True and the update itself
    # must not be tracked by autograd. (torch.optim.SGD would do the same.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        # Gradients accumulate across backward() calls, so reset each one
        # once its weight has been updated.
        w1.grad.zero_()

        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

0.000001000
0 39382924.0
1 43533800.0
2 51523032.0
3 51033776.0
4 36084504.0
5 17462944.0
6 6731039.0
7 2855647.0
8 1618502.0
9 1152643.625
10 912170.6875
11 752192.0
12 631880.0
13 536298.1875
14 458647.25
15 394783.09375
16 341728.5625
17 297242.1875
18 259703.140625
19 227853.109375
20 200693.109375
21 177406.734375
22 157328.71875
23 139927.890625
24 124850.71875
25 111698.203125
26 100184.125
27 90070.7265625
28 81159.078125
29 73323.5
30 66379.8125
31 60207.76953125
32 54713.2421875
33 49805.8203125
34 45414.9765625
35 41478.51953125
36 37942.1484375
37 34758.74609375
38 31888.27734375
39 29294.234375
40 26947.3671875
41 24822.13671875
42 22894.0
43 21141.09375
44 19543.470703125
45 18091.255859375
46 16767.427734375
47 15557.03125
48 14449.826171875
49 13435.2177734375
50 12504.0361328125
51 11648.857421875
52 10862.2841796875
53 10138.0654296875
54 9470.6357421875
55 8854.994140625
56 8286.1943359375
57 7760.34375
58 7273.51513671875
59 6822.4736328125
60 6404.22314453125
61 60