In [1]:
# Learning PyTorch through Examples

# Pytorch: 
# two main features= an n-dimensional tensor (can run on GPUs)
#   and automatic differentiation for building and training
#   neural networks

# Warmup: NumPy
# implement the network

import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
# PyTorch: tensors
# use of GPUs can speed up the process up to 50x
# Tensor = essentially a numpy array, PyTorch provides many functions for
# operating on these tensors (keep track of graph and gradients)

import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2


0 26689305.415564276
1 21505257.375559144
2 20475303.9137953
3 20611674.381554533
4 20015444.508883726
5 17688228.835222144
6 13859162.890665963
7 9728552.517325714
8 6269642.183039892
9 3885149.457423486
10 2410785.7418537424
11 1549604.9189531624
12 1052113.6074549656
13 759126.8863035925
14 578989.8267146327
15 461757.6183209013
16 380651.5744898326
17 321143.0223025076
18 275292.6385879539
19 238607.88334637193
20 208464.73882146506
21 183239.27890568873
22 161869.89251439332
23 143558.11660331316
24 127744.84427052153
25 114015.3259740274
26 102029.81152519037
27 91525.02455512252
28 82288.2153708314
29 74132.55263255473
30 66915.08364343697
31 60520.06433805706
32 54837.98165161307
33 49773.06639625528
34 45244.21373082555
35 41189.69111558855
36 37554.043031140216
37 34285.268944256095
38 31343.487415592605
39 28690.10891423225
40 26291.70847496131
41 24121.400588458113
42 22154.698752944165
43 20370.850894263574
44 18750.089123278045
45 17275.57883507708
46 15931.680583875052
4

429 3.327402648548193e-05
430 3.176957243658834e-05
431 3.0334234966159974e-05
432 2.896321144037369e-05
433 2.7654519409885385e-05
434 2.6405161494537014e-05
435 2.5212926537229456e-05
436 2.407564482142342e-05
437 2.29888598640503e-05
438 2.195142837483082e-05
439 2.096096649795485e-05
440 2.001581714204791e-05
441 1.911318547944304e-05
442 1.825139534350054e-05
443 1.7429298641784237e-05
444 1.664420020500977e-05
445 1.589449240841403e-05
446 1.5178582537247464e-05
447 1.4495054921333228e-05
448 1.3842520606359479e-05
449 1.3219887049013327e-05
450 1.2625346155519674e-05
451 1.2057566937344784e-05
452 1.1515298569348153e-05
453 1.0997621620130277e-05
454 1.0503322952413409e-05
455 1.0031330940304286e-05
456 9.580597335547092e-06
457 9.150391710558614e-06
458 8.739795784826353e-06
459 8.347378571991263e-06
460 7.972722714461746e-06
461 7.614941747751849e-06
462 7.2733914970343765e-06
463 6.947196214495868e-06
464 6.63579332154465e-06
465 6.338467632557248e-06
466 6.05437338043999e-06

352 0.0018191777635365725
353 0.0017470482271164656
354 0.001679157023318112
355 0.0016150805167853832
356 0.0015522653702646494
357 0.0014953200006857514
358 0.0014389611314982176
359 0.0013859985629096627
360 0.0013357863062992692
361 0.0012825272278860211
362 0.0012376985978335142
363 0.0011923977872356772
364 0.0011490541510283947
365 0.00110720400698483
366 0.001069112098775804
367 0.001029981067404151
368 0.0009943068725988269
369 0.0009600748890079558
370 0.0009261902305297554
371 0.0008935726364143193
372 0.0008619152940809727
373 0.0008334921440109611
374 0.0008049439638853073
375 0.0007773207034915686
376 0.0007526952540501952
377 0.000727939244825393
378 0.0007039372576400638
379 0.0006817281246185303
380 0.0006590712582692504
381 0.0006374296499416232
382 0.000617366167716682
383 0.0005982927395962179
384 0.0005780654610134661
385 0.0005614937981590629
386 0.0005426715943031013
387 0.0005268770619295537
388 0.0005098336259834468
389 0.000494772510137409
390 0.00048140386934

In [2]:
# PyTorch: Tensors and Autograd
# in above ex, had to manually implement the forward and backward pass
# automatic diff = automate the computation of the backward pass
# automatically define a computational grah: nodes are tensors, edges
# are functions tat produce output tensors from input tensors

import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 37265244.0
1 34502688.0
2 35796400.0
3 34283840.0
4 27199880.0
5 17261894.0
6 9288065.0
7 4770355.0
8 2645245.5
9 1677238.375
10 1202908.5
11 937085.0
12 765156.75
13 641112.375
14 545380.5
15 468527.5
16 405268.5625
17 352491.59375
18 308062.1875
19 270406.9375
20 238267.625
21 210690.171875
22 186972.40625
23 166466.046875
24 148648.3125
25 133124.9375
26 119545.671875
27 107614.796875
28 97127.1953125
29 87843.84375
30 79619.8671875
31 72314.234375
32 65806.6875
33 59994.09765625
34 54791.44921875
35 50121.6015625
36 45923.4609375
37 42142.06640625
38 38730.33203125
39 35645.3359375
40 32849.640625
41 30312.73046875
42 28006.1953125
43 25905.76953125
44 23989.888671875
45 22240.65234375
46 20640.8515625
47 19174.80859375
48 17829.98046875
49 16595.458984375
50 15460.4189453125
51 14414.384765625
52 13450.939453125
53 12561.611328125
54 11740.171875
55 10980.5751953125
56 10277.478515625
57 9626.2509765625
58 9022.2353515625
59 8461.5458984375
60 7940.78271484375
61 7456.5341796875

434 0.001957942033186555
435 0.001896501868031919
436 0.0018400789704173803
437 0.0017853936878964305
438 0.001731891417875886
439 0.0016779698198661208
440 0.0016315983375534415
441 0.0015813193749636412
442 0.0015340623212978244
443 0.0014889955054968596
444 0.0014469461748376489
445 0.001403721864335239
446 0.0013651455519720912
447 0.0013236576924100518
448 0.001286976970732212
449 0.00124944350682199
450 0.0012153678108006716
451 0.0011803642846643925
452 0.0011488470481708646
453 0.00111755495890975
454 0.001085907919332385
455 0.0010541987139731646
456 0.001026396406814456
457 0.0009986513759940863
458 0.0009724927949719131
459 0.0009477019775658846
460 0.0009215681930072606
461 0.0008976069511845708
462 0.0008731776615604758
463 0.0008507546735927463
464 0.00082781893434003
465 0.0008074587676674128
466 0.0007868737448006868
467 0.0007658313261345029
468 0.0007464657537639141
469 0.0007289694622159004
470 0.000711038417648524
471 0.0006931278039701283
472 0.0006759920506738126
