# Warm-up: numpy

Numpy provides an n-dimensional array object, and many functions for manipulating these arrays. Numpy is a generic framework for scientific computing; it does not know anything about computation graphs, or deep learning, or gradients. 

However, we can easily use numpy to fit a two-layer network to random data by manually implementing the forward and backward passes through the network using numpy operations.

In [2]:
import numpy as np

In [4]:
import numpy as np
 
# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and random regression targets.
x = np.random.randn(N, D_in)   # (64, 1000)
y = np.random.randn(N, D_out)  # (64, 10)

# Random starting weights for the two linear layers.
w1 = np.random.randn(D_in, H)   # (1000, 100)
w2 = np.random.randn(H, D_out)  # (100, 10)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss is the sum of squared errors over the whole batch
    # (np.square squares element-wise).
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step: weight = weight - learning_rate * gradient.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 30843788.326063037
1 31367920.87947837
2 37112458.91486728
3 40937289.89182015
4 37000963.22359131
5 24601194.01213205
6 12543936.3760214
7 5523027.280896346
8 2603657.0933170132
9 1479395.8348649913
10 1016274.6360568616
11 785638.9687367098
12 644664.5610233278
13 544081.5808872362
14 465935.76650399633
15 402405.34819371195
16 349634.6338480711
17 305330.95787273406
18 267803.61002931907
19 235779.2000407132
20 208314.8369285774
21 184655.3508685233
22 164180.93104217795
23 146412.0971848225
24 130904.95614304283
25 117309.29841021544
26 105368.72106844778
27 94837.89033675237
28 85525.42783423269
29 77284.38747236613
30 69960.36394018034
31 63435.3821962119
32 57609.75762228678
33 52402.35258879785
34 47737.48446252477
35 43545.75189863593
36 39777.55824547561
37 36384.23375032701
38 33322.2448424537
39 30553.293870483383
40 28046.38296757559
41 25774.376827851673
42 23711.991566820972
43 21836.488710879414
44 20128.495775945623
45 18571.408549077743
46 17151.600130524137
47 1585

472 2.2574733505976947e-05
473 2.1587034511978645e-05
474 2.0642176756253517e-05
475 1.973881090582035e-05
476 1.8875216301778085e-05
477 1.804935421840091e-05
478 1.725942374955705e-05
479 1.6504116276878788e-05
480 1.5782285563870096e-05
481 1.509169855721371e-05
482 1.4431312842069247e-05
483 1.379999843661025e-05
484 1.3196098967821643e-05
485 1.2618677934609658e-05
486 1.2066487923157453e-05
487 1.1538549673007259e-05
488 1.1033729280631992e-05
489 1.0550863969186702e-05
490 1.0089435170870392e-05
491 9.647913842514718e-06
492 9.225681228810328e-06
493 8.822031017588523e-06
494 8.436258839542517e-06
495 8.067152647892582e-06
496 7.714198874929851e-06
497 7.376786135389032e-06
498 7.053983590887775e-06
499 6.74533613167014e-06


# PyTorch: Tensors

In [5]:
import torch

In [7]:
dtype = torch.float
# Use the first CUDA device when one is usable; otherwise fall back to the CPU
# so the example still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
# (64, 1000)
x = torch.randn(N, D_in, device=device, dtype=dtype)
# (64, 10)
y = torch.randn(N, D_out, device=device, dtype=dtype)


# Randomly initialize weights
# (1000, 100)
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
# (100, 10)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: compute predicted y.
    # a.mm(b) is the matrix product of two 2-D tensors.
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss; .item() converts the scalar tensor to a
    # Python float.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU passes no gradient where its input was negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31636164.0
1 28779406.0
2 28897474.0
3 27303502.0
4 22266774.0
5 15179169.0
6 9013240.0
7 4993071.0
8 2828140.0
9 1733639.75
10 1175570.25
11 871045.0
12 687792.4375
13 565566.6875
14 476696.15625
15 408046.21875
16 352871.15625
17 307390.5625
18 269304.8125
19 237073.640625
20 209532.78125
21 185863.25
22 165415.734375
23 147677.0
24 132253.609375
25 118755.4609375
26 106896.9375
27 96439.8046875
28 87188.109375
29 78988.7890625
30 71700.546875
31 65205.95703125
32 59408.08203125
33 54215.171875
34 49554.74609375
35 45366.80078125
36 41594.3046875
37 38189.078125
38 35109.6171875
39 32319.138671875
40 29784.919921875
41 27482.095703125
42 25382.447265625
43 23467.970703125
44 21724.703125
45 20131.94140625
46 18674.068359375
47 17339.205078125
48 16113.6259765625
49 14988.5205078125
50 13953.6630859375
51 13000.736328125
52 12123.044921875
53 11313.904296875
54 10566.7177734375
55 9875.927734375
56 9236.4326171875
57 8644.5546875
58 8095.4287109375
59 7586.24755859375
60 7113.500488

415 0.003771651303395629
416 0.0036531141959130764
417 0.003539714962244034
418 0.0034281781408935785
419 0.0033197924494743347
420 0.0032152021303772926
421 0.0031132884323596954
422 0.0030155826825648546
423 0.002923648338764906
424 0.0028326131869107485
425 0.0027473752852529287
426 0.0026620656717568636
427 0.0025786503683775663
428 0.002502396469935775
429 0.002426410559564829
430 0.002356891753152013
431 0.0022841240279376507
432 0.002215296495705843
433 0.0021500824950635433
434 0.0020871213637292385
435 0.0020246049389243126
436 0.0019658864475786686
437 0.0019097039476037025
438 0.0018544626655057073
439 0.001800754340365529
440 0.0017470299499109387
441 0.0016983204986900091
442 0.0016489099944010377
443 0.0016009514220058918
444 0.0015566663350909948
445 0.001514179864898324
446 0.0014714718563482165
447 0.001431246055290103
448 0.0013906842796131968
449 0.0013515772297978401
450 0.001315126079134643
451 0.0012793916976079345
452 0.0012460271827876568
453 0.00121127651073038

# Autograd
PyTorch: Tensors and autograd


- `clamp` restricts every element of the input to the range [min, max] and returns the resulting tensor:

$$
y_i = \begin{cases}
    \text{min} & \text{if } x_i < \text{min} \\
    x_i & \text{if } \text{min} \leq x_i \leq \text{max} \\
    \text{max} & \text{if } x_i > \text{max}
\end{cases}
$$

- Parameters:	
    - input (Tensor) – the input tensor
    - min (Number) – lower-bound of the range to be clamped to
    - max (Number) – upper-bound of the range to be clamped to
    - out (Tensor, optional) – the output tensor

In [8]:
# -*- coding: utf-8 -*-
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Keyword arguments shared by every tensor created below.
tensor_opts = dict(device=device, dtype=dtype)

# Inputs and targets. requires_grad is left at its default (False) because
# no gradients with respect to the data are needed in the backward pass.
x = torch.randn(N, D_in, **tensor_opts)
y = torch.randn(N, D_out, **tensor_opts)

# Weights. requires_grad=True asks autograd to compute gradients with respect
# to these tensors during the backward pass.
w1 = torch.randn(D_in, H, requires_grad=True, **tensor_opts)
w2 = torch.randn(H, D_out, requires_grad=True, **tensor_opts)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: matrix multiply -> ReLU (clamp at zero) -> matrix multiply.
    # Autograd records these operations, so no backward pass is written by
    # hand.
    hidden = torch.clamp(torch.mm(x, w1), min=0)
    y_pred = torch.mm(hidden, w2)

    # Sum-of-squares loss. The result is a 0-dimensional (scalar) Tensor;
    # loss.item() extracts the Python number it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Backward pass via autograd: afterwards w1.grad and w2.grad hold the
    # gradient of the loss with respect to w1 and w2.
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad() so
    # that it is not itself tracked by autograd. torch.optim.SGD could be
    # used to perform the same update.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so clear them
            # once the weight has been updated.
            weight.grad.zero_()

0 32382200.0
1 30822780.0
2 33837016.0
3 35054276.0
4 30196914.0
5 20184718.0
6 10850753.0
7 5233577.5
8 2654608.25
9 1556172.125
10 1065534.5
11 813005.1875
12 659150.0
13 551606.5
14 469617.65625
15 403944.3125
16 349988.96875
17 304965.1875
18 267079.8125
19 234913.15625
20 207502.71875
21 183921.90625
22 163552.140625
23 145892.140625
24 130504.7421875
25 117024.1953125
26 105236.7578125
27 94876.84375
28 85723.28125
29 77622.5625
30 70417.34375
31 63994.92578125
32 58247.22265625
33 53090.5390625
34 48461.71875
35 44293.71484375
36 40531.98828125
37 37127.6796875
38 34045.08203125
39 31254.587890625
40 28718.6640625
41 26411.138671875
42 24308.76171875
43 22391.455078125
44 20639.294921875
45 19039.37890625
46 17576.91796875
47 16235.6494140625
48 15005.921875
49 13878.4501953125
50 12842.8271484375
51 11890.5537109375
52 11014.6865234375
53 10208.2607421875
54 9465.5556640625
55 8780.708984375
56 8149.150390625
57 7566.888671875
58 7028.6533203125
59 6531.30859375
60 6071.2744140

# PyTorch: Defining new autograd functions

In [9]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    A hand-written ReLU for autograd.

    Custom autograd Functions are created by subclassing
    torch.autograd.Function and supplying static forward and backward
    methods that operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the output Tensor from the input Tensor.

        ctx is a context object used to pass information to the backward
        pass; the input is stored on it with ctx.save_for_backward.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output into the
        gradient of the loss with respect to the input.
        """
        (saved_input,) = ctx.saved_tensors
        # The gradient is blocked wherever the forward input was negative;
        # masked_fill returns a new tensor and leaves grad_output untouched.
        return grad_output.masked_fill(saved_input < 0, 0)


dtype = torch.float
# Run on the first CUDA device when one is usable; otherwise fall back to the
# CPU so the example still runs on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu' once, outside the loop, because it does not change between iterations.
relu = MyReLU.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent; no_grad keeps the update itself
    # out of the autograd graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 27370144.0
1 23759168.0
2 25925586.0
3 30570600.0
4 33538258.0
5 30886738.0
6 22311488.0
7 12761852.0
8 6303907.0
9 3076793.5
10 1664823.25
11 1052315.875
12 761225.25
13 600589.9375
14 496716.25
15 421010.84375
16 361745.90625
17 313585.0625
18 273569.53125
19 239791.59375
20 211099.96875
21 186533.171875
22 165375.65625
23 147064.671875
24 131135.109375
25 117270.0859375
26 105162.3984375
27 94521.3359375
28 85136.90625
29 76836.1953125
30 69476.1171875
31 62930.203125
32 57094.015625
33 51876.49609375
34 47202.15625
35 43007.2421875
36 39233.1015625
37 35834.5859375
38 32767.83203125
39 29995.158203125
40 27488.9296875
41 25217.974609375
42 23155.142578125
43 21280.333984375
44 19579.01953125
45 18028.740234375
46 16614.23828125
47 15322.1572265625
48 14141.0107421875
49 13064.0966796875
50 12077.9296875
51 11173.9404296875
52 10344.2763671875
53 9582.4638671875
54 8881.671875
55 8236.416015625
56 7642.302734375
57 7094.87060546875
58 6590.03466796875
59 6123.865234375
60 5693.593

440 7.469355477951467e-05
441 7.327352068386972e-05
442 7.219709368655458e-05
443 7.083801756380126e-05
444 6.957535515539348e-05
445 6.83872276567854e-05
446 6.71305024297908e-05
447 6.593885336769745e-05
448 6.484379264293239e-05
449 6.385097367456183e-05
450 6.2962768424768e-05
451 6.188119732541963e-05
452 6.119927274994552e-05
453 6.0046400903956965e-05
454 5.9003235946875066e-05
455 5.8104313211515546e-05
456 5.716852319892496e-05
457 5.6133543694159016e-05
458 5.536873504752293e-05
459 5.451947799883783e-05
460 5.380613583838567e-05
461 5.3069248679094017e-05
462 5.2077797590754926e-05
463 5.116965257911943e-05
464 5.0517464842414483e-05
465 4.983931648894213e-05
466 4.915778845315799e-05
467 4.860413173446432e-05
468 4.763131801155396e-05
469 4.6999215555842966e-05
470 4.651309427572414e-05
471 4.599816384143196e-05
472 4.523892857832834e-05
473 4.4558870285982266e-05
474 4.4071795855415985e-05
475 4.3405627366155386e-05
476 4.308465213398449e-05
477 4.227769386488944e-05
478 4

# PyTorch: nn

In [10]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# The nn package also contains definitions of popular loss functions; in this
# case we use the squared-error loss MSELoss. reduction="sum" adds up the
# squared errors instead of averaging them; it replaces the deprecated
# size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the
    # loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 700.0230102539062
1 644.6672973632812
2 597.2393798828125
3 555.8089599609375
4 519.509521484375
5 487.24346923828125
6 458.12054443359375
7 431.83154296875
8 407.6788635253906
9 385.3291931152344
10 364.44696044921875
11 344.91534423828125
12 326.39013671875
13 308.93316650390625
14 292.362060546875
15 276.673583984375
16 261.7615051269531
17 247.56031799316406
18 234.02984619140625
19 221.09890747070312
20 208.8020782470703
21 197.09738159179688
22 185.9798126220703
23 175.42332458496094
24 165.428955078125
25 155.94140625
26 146.95932006835938
27 138.45230102539062
28 130.42001342773438
29 122.81071472167969
30 115.61302185058594
31 108.83698272705078
32 102.44207763671875
33 96.39069366455078
34 90.69402313232422
35 85.33393859863281
36 80.28433990478516
37 75.53512573242188
38 71.06889343261719
39 66.87731170654297
40 62.94740295410156
41 59.24843978881836
42 55.778076171875
43 52.52730178833008
44 49.48255920410156
45 46.626155853271484
46 43.948734283447266
47 41.4348297119140

356 0.000182370247785002
357 0.00017671055684331805
358 0.0001712355442577973
359 0.00016592962492723018
360 0.00016079838678706437
361 0.00015583493222948164
362 0.00015102577162906528
363 0.00014636508421972394
364 0.0001418617321178317
365 0.00013749564823228866
366 0.0001332778192590922
367 0.00012918200809508562
368 0.00012522164615802467
369 0.0001213845462189056
370 0.00011766600073315203
371 0.00011406896373955533
372 0.0001105849732994102
373 0.00010721650323830545
374 0.00010394875425845385
375 0.00010078070044983178
376 9.771650366019458e-05
377 9.474561375100166e-05
378 9.186893294099718e-05
379 8.908784366212785e-05
380 8.639002771815285e-05
381 8.376673213206232e-05
382 8.123639418045059e-05
383 7.878281030571088e-05
384 7.641138654435053e-05
385 7.410701073240489e-05
386 7.187483424786478e-05
387 6.971380207687616e-05
388 6.761861004633829e-05
389 6.558898166986182e-05
390 6.362036219798028e-05
391 6.171457062009722e-05
392 5.9868445532629266e-05
393 5.8075442211702466e-

# PyTorch: optim

In [11]:
# -*- coding: utf-8 -*-
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
# reduction="sum" adds up the squared errors; it replaces the deprecated
# size_average=False argument.
loss_fn = torch.nn.MSELoss(reduction="sum")

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers (i.e., not overwritten) whenever .backward()
    # is called. Check out the docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

0 650.2991943359375
1 633.2490234375
2 616.6775512695312
3 600.5155029296875
4 584.8032836914062
5 569.5104370117188
6 554.69140625
7 540.2872924804688
8 526.3739013671875
9 512.8858642578125
10 499.84246826171875
11 487.2327575683594
12 474.9822692871094
13 463.087158203125
14 451.446533203125
15 440.1286315917969
16 429.1712341308594
17 418.4983825683594
18 408.09259033203125
19 397.94903564453125
20 388.0597839355469
21 378.3928527832031
22 369.00274658203125
23 359.9116516113281
24 351.070068359375
25 342.44256591796875
26 334.028564453125
27 325.8651123046875
28 317.9142761230469
29 310.1639404296875
30 302.5705261230469
31 295.14202880859375
32 287.889404296875
33 280.7884216308594
34 273.8546142578125
35 267.0989685058594
36 260.48760986328125
37 253.9996337890625
38 247.64114379882812
39 241.409423828125
40 235.31539916992188
41 229.343017578125
42 223.48545837402344
43 217.75750732421875
44 212.1390380859375
45 206.6477508544922
46 201.2703857421875
47 195.98973083496094
48 19

432 6.831443783994473e-07
433 6.342585265883827e-07
434 5.888968530598504e-07
435 5.470372457239137e-07
436 5.076014417682018e-07
437 4.712018153441022e-07
438 4.3741249555750983e-07
439 4.059243394749501e-07
440 3.7660760199287324e-07
441 3.4914071989078366e-07
442 3.239641159780149e-07
443 3.00453962154279e-07
444 2.787018047456513e-07
445 2.58413535902946e-07
446 2.3950093464009115e-07
447 2.219731811692327e-07
448 2.058663568504926e-07
449 1.9066969514369703e-07
450 1.765938009157253e-07
451 1.6355963339265145e-07
452 1.5154444099607645e-07
453 1.4012360338711005e-07
454 1.2989089270831755e-07
455 1.2012760919333232e-07
456 1.1127121268827977e-07
457 1.0302248654170398e-07
458 9.530155864467815e-08
459 8.820508412554773e-08
460 8.159679509844864e-08
461 7.55298970034346e-08
462 6.980856426253013e-08
463 6.461932855472696e-08
464 5.965843996591502e-08
465 5.524741197859839e-08
466 5.107070322196705e-08
467 4.7240011014082484e-08
468 4.373415762870536e-08
469 4.04042914681213e-08
470

# Pytorch: Custom nn Modules

In [15]:
import torch
class TwoLayerNet(torch.nn.Module):
    """
    A two-layer fully connected network: Linear -> ReLU -> Linear.
    """

    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.

        Args:
            D_in: number of input features of the first linear layer.
            H: number of hidden units (output of the first layer, input of
                the second).
            D_out: number of output features of the second linear layer.
        """
        super().__init__()
        self.fc1 = torch.nn.Linear(D_in, H)
        self.fc2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        # clamp(min=0) is the ReLU non-linearity between the two layers.
        h_relu = self.fc1(x).clamp(min=0)
        y_pred = self.fc2(h_relu)
        return y_pred
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction="sum" adds up the squared errors; it replaces the deprecated
# size_average=False argument.
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
            

<generator object Module.parameters at 0x00000218B578AF10>
0 623.8992919921875
1 579.819091796875
2 541.8037109375
3 508.5085144042969
4 478.7288818359375
5 451.9593811035156
6 427.5831298828125
7 405.4085998535156
8 384.78753662109375
9 365.4620666503906
10 347.2221374511719
11 330.0967102050781
12 313.9134521484375
13 298.6020202636719
14 284.0759582519531
15 270.2710266113281
16 257.1483154296875
17 244.65478515625
18 232.73422241210938
19 221.3401641845703
20 210.4174041748047
21 199.99124145507812
22 190.03912353515625
23 180.56663513183594
24 171.5507354736328
25 162.94505310058594
26 154.7418975830078
27 146.92640686035156
28 139.46742248535156
29 132.36264038085938
30 125.6011962890625
31 119.17971801757812
32 113.01514434814453
33 107.15006256103516
34 101.59026336669922
35 96.32025909423828
36 91.31040954589844
37 86.57125854492188
38 82.08958435058594
39 77.83900451660156
40 73.81904602050781
41 70.01580047607422
42 66.42098236083984
43 63.02141571044922
44 59.80782318115234

419 5.289832188282162e-05
420 5.138650885783136e-05
421 4.991948662791401e-05
422 4.8490666813449934e-05
423 4.71081402793061e-05
424 4.5763201342197135e-05
425 4.44597280875314e-05
426 4.3191674194531515e-05
427 4.196065856376663e-05
428 4.076468394487165e-05
429 3.9605227357242256e-05
430 3.847489642794244e-05
431 3.738033046829514e-05
432 3.6317196645541117e-05
433 3.528265006025322e-05
434 3.427901174291037e-05
435 3.3304651879007e-05
436 3.2359846954932436e-05
437 3.1440726161235943e-05
438 3.054811168112792e-05
439 2.967884938698262e-05
440 2.8837785066571087e-05
441 2.801814116537571e-05
442 2.7223486540606245e-05
443 2.645378299348522e-05
444 2.5703677238197997e-05
445 2.4975915948743932e-05
446 2.4267519620480016e-05
447 2.3580978449899703e-05
448 2.2914193323231302e-05
449 2.2265534425969236e-05
450 2.1636371457134373e-05
451 2.1023281078669243e-05
452 2.0428664356586523e-05
453 1.9850422177114524e-05
454 1.9290386262582615e-05
455 1.874658300948795e-05
456 1.8215461750514805

# PyTorch: Control Flow + Weight Sharing

In [16]:
import random
import torch


In [18]:
class DynamicNet(torch.nn.Module):
    """
    A fully connected ReLU network whose depth is chosen at random on every
    forward pass, built from three nn.Linear instances created in the
    constructor.
    """

    def __init__(self, D_in, H, D_out):
        """
        Create the input, middle and output nn.Linear layers used by forward.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then the shared middle layer 0, 1, 2 or 3 times
        (picked at random per call), then the output layer.

        Each forward pass builds a dynamic computation graph, so ordinary
        Python control flow such as loops and conditionals can be used to
        define the model.

        The same middle_linear Module is reused for every repetition, which
        shows that a Module can safely be applied several times when
        defining a computational graph. This is a big improvement from Lua
        Torch, where each Module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Draw the number of extra hidden layers once per call.
        extra_layers = random.randint(0, 3)
        applied = 0
        while applied < extra_layers:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            applied += 1
        return self.output_linear(hidden)
    
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction="sum" adds up the squared errors; it replaces the deprecated
# size_average=False argument.
criterion = torch.nn.MSELoss(reduction="sum")
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 616.6395263671875
1 615.9946899414062
2 641.11328125
3 613.8275146484375
4 610.8023071289062
5 606.744873046875
6 609.77099609375
7 597.8768310546875
8 607.4462280273438
9 589.3106079101562
10 584.6884765625
11 404.0342102050781
12 371.9383239746094
13 570.8013916015625
14 565.2476196289062
15 601.0283813476562
16 607.0654907226562
17 594.79052734375
18 194.79782104492188
19 588.8576049804688
20 532.8668823242188
21 514.0911254882812
22 478.593017578125
23 117.3344955444336
24 416.2593078613281
25 96.50996398925781
26 446.7548522949219
27 323.3803405761719
28 290.9853820800781
29 484.8349304199219
30 355.11175537109375
31 104.41952514648438
32 301.109619140625
33 275.18463134765625
34 246.2923126220703
35 306.9319763183594
36 267.05718994140625
37 235.40501403808594
38 182.9322052001953
39 186.76559448242188
40 139.4553985595703
41 165.74160766601562
42 132.5477752685547
43 116.35525512695312
44 99.00541687011719
45 59.331031799316406
46 94.55175018310547
47 65.03093719482422
48 61.5

456 4.908196449279785
457 1.4906601905822754
458 1.311403512954712
459 2.1242644786834717
460 1.217566728591919
461 1.624269962310791
462 1.436042308807373
463 1.0156564712524414
464 0.9467114210128784
465 2.5147135257720947
466 0.8216225504875183
467 1.1364656686782837
468 1.2277086973190308
469 2.902594804763794
470 0.7013190388679504
471 1.2326560020446777
472 2.7427163124084473
473 0.655655026435852
474 1.7070181369781494
475 0.39236828684806824
476 0.41141271591186523
477 1.5739212036132812
478 1.409509539604187
479 0.4126828610897064
480 2.2845230102539062
481 0.7368876338005066
482 1.4088308811187744
483 1.4416956901550293
484 0.6089435815811157
485 0.9408570528030396
486 1.2987959384918213
487 1.0605088472366333
488 0.8761258721351624
489 0.9739263653755188
490 0.9410241842269897
491 1.599303126335144
492 0.559524655342102
493 0.9151325225830078
494 0.8296440243721008
495 0.740220308303833
496 0.9369993209838867
497 1.746253490447998
498 1.0262380838394165
499 1.074673533439636