In [15]:
import torch
import tensorflow as tf
import numpy as np
%matplotlib inline

## 1. Tensors and autograd

- In the above examples, we had to manually implement both the forward and backward passes of our neural network. 
- Manually implementing the backward pass is not a big deal for a small two-layer network, but can quickly get very hairy for large complex networks.
- We can use [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) to automate the computation of backward passes in neural networks.
- The ``autograd`` package in PyTorch provides exactly this functionality. When using autograd, the forward pass of your network will define a ``computational graph``; nodes in the graph will be Tensors, and edges will be functions that produce output Tensors from input Tensors. 
- Backpropagating through this graph then allows you to easily compute gradients.
- Each Tensor represents a node in a computational graph. If ``x`` is a Tensor that has ``x.requires_grad=True`` then ``x.grad`` is another Tensor holding the gradient of some scalar value with respect to ``x``.

In [2]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data. requires_grad is left at its default (False),
# so autograd does not compute gradients with respect to the data.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights. requires_grad=True asks autograd to accumulate
# d(loss)/d(weight) into w1.grad / w2.grad on every backward() call.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at 0) -> linear. Autograd records
    # the graph, so no intermediate results have to be kept by hand.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squares loss. The result is a 0-dim Tensor; loss.item() extracts
    # the Python scalar it holds.
    loss = torch.sum(torch.pow(y_pred - y, 2))
    print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad with the gradient of the loss
    # with respect to each weight Tensor.
    loss.backward()

    # Plain gradient-descent step. The update runs under torch.no_grad() so
    # that the in-place modification of the weights is not itself tracked by
    # autograd. (torch.optim.SGD performs the same update.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        # Gradients accumulate across backward() calls, so reset them once
        # they have been consumed.
        w1.grad.zero_()

        w2.sub_(learning_rate * w2.grad)
        w2.grad.zero_()

0 33016882.0
1 28244012.0
2 26021324.0
3 22972844.0
4 18206440.0
5 12823937.0
6 8235750.5
7 5075709.5
8 3156311.25
9 2055365.625
10 1424860.25
11 1050842.25
12 816193.6875
13 658883.0625
14 546431.8125
15 461623.53125
16 394924.5
17 341099.75
18 296758.1875
19 259686.125
20 228379.21875
21 201686.890625
22 178797.296875
23 159050.890625
24 141920.15625
25 126995.6953125
26 113897.390625
27 102414.1015625
28 92319.5859375
29 83416.8984375
30 75542.171875
31 68560.7734375
32 62351.4453125
33 56823.9296875
34 51889.76953125
35 47469.15234375
36 43498.45703125
37 39921.6796875
38 36693.50390625
39 33776.64453125
40 31137.333984375
41 28744.044921875
42 26568.146484375
43 24585.978515625
44 22778.412109375
45 21125.57421875
46 19613.091796875
47 18227.0390625
48 16955.6015625
49 15786.9638671875
50 14711.498046875
51 13720.3486328125
52 12806.6669921875
53 11962.546875
54 11182.0830078125
55 10461.283203125
56 9793.609375
57 9174.5078125
58 8599.662109375
59 8065.6953125
60 7569.2734375
61 

444 0.0006219332572072744
445 0.0006043183384463191
446 0.0005899432580918074
447 0.0005747248651459813
448 0.0005593239911831915
449 0.0005458110244944692
450 0.0005309059051796794
451 0.000517076812684536
452 0.0005032930639572442
453 0.0004898622864857316
454 0.00047815105062909424
455 0.00046553235733881593
456 0.00045333156595006585
457 0.00044187105959281325
458 0.00043106861994601786
459 0.00042093993397429585
460 0.0004116682684980333
461 0.000401783938286826
462 0.0003928134683519602
463 0.00038355126162059605
464 0.00037498067831620574
465 0.0003666254342533648
466 0.00035774457501247525
467 0.0003488239599391818
468 0.0003410040808375925
469 0.00033281443757005036
470 0.00032502118847332895
471 0.0003175628080498427
472 0.0003107347874902189
473 0.0003032810927834362
474 0.000296848826110363
475 0.0002904311113525182
476 0.00028415065025910735
477 0.00027800502721220255
478 0.00027128131478093565
479 0.00026688817888498306
480 0.0002605086483526975
481 0.00025459466269239783

## 2.  Defining new autograd functions

- Under the hood, each primitive autograd operator is really two functions that operate on Tensors. The forward function computes output Tensors from input Tensors. The backward function receives the gradient of some scalar value with respect to the output Tensors, and computes the gradient of that same scalar value with respect to the input Tensors.

- In PyTorch we can easily define our own autograd operator by defining a subclass of torch.autograd.Function and implementing the forward and backward functions as static methods. We can then use our new autograd operator by calling its ``apply`` method like a function, passing Tensors containing input data.

- In this example we define our own custom autograd function for performing the ReLU nonlinearity, and use it to implement our two-layer network:

In [13]:
class MyReLU1(torch.autograd.Function):
    """
    Custom autograd Function implementing the ReLU nonlinearity.

    Subclasses of torch.autograd.Function provide a static ``forward`` and a
    static ``backward``; the operator is invoked through ``MyReLU1.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute max(input, 0) element-wise.

        ctx is the context object shared with backward; the input Tensor is
        stashed on it via ctx.save_for_backward so that backward can tell
        which elements were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given grad_output, the gradient of the loss with respect to the
        output of forward, return the gradient of the loss with respect to
        the input: grad_output with every position where the input was
        strictly negative set to zero.
        """
        (saved_input,) = ctx.saved_tensors
        # Out-of-place fill: grad_output itself is never modified.
        return grad_output.masked_fill(saved_input < 0, 0)

In [14]:
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights; requires_grad=True so that autograd
# populates w1.grad and w2.grad during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

# To apply our Function, we use the Function.apply method. We alias this as
# 'relu'. The custom Function defined in this notebook is named MyReLU1;
# referring to it as MyReLU raises NameError on a fresh kernel. The alias does
# not change between iterations, so it is created once, outside the loop.
relu = MyReLU1.apply

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss (sum of squared errors; loss is a 0-dim Tensor).
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass; this calls MyReLU1.backward
    # for the ReLU node of the graph.
    loss.backward()

    # Update weights using gradient descent. Wrapped in torch.no_grad() so the
    # in-place updates are not recorded by autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights, because
        # backward() accumulates into .grad.
        w1.grad.zero_()
        w2.grad.zero_()

0 35078848.0
1 33475930.0
2 32745788.0
3 28216562.0
4 19994878.0
5 11816578.0
6 6383971.0
7 3542082.0
8 2187618.75
9 1523508.625
10 1160604.5
11 933829.9375
12 774863.5
13 654107.375
14 558172.3125
15 479918.8125
16 415001.15625
17 360588.25
18 314639.4375
19 275581.125
20 242258.703125
21 213680.109375
22 189024.09375
23 167652.171875
24 149067.171875
25 132857.59375
26 118685.0625
27 106240.609375
28 95281.34375
29 85616.5703125
30 77079.6484375
31 69500.2734375
32 62764.08984375
33 56758.30078125
34 51391.0234375
35 46595.0390625
36 42296.890625
37 38442.734375
38 34978.3984375
39 31856.94921875
40 29042.162109375
41 26501.189453125
42 24203.388671875
43 22124.9296875
44 20242.310546875
45 18535.19140625
46 16985.013671875
47 15576.4677734375
48 14293.6103515625
49 13128.0556640625
50 12066.3046875
51 11097.3466796875
52 10212.6806640625
53 9405.4404296875
54 8666.2978515625
55 7989.140625
56 7368.8701171875
57 6799.8984375
58 6278.3193359375
59 5799.7841796875
60 5360.16845703125
6

434 0.00011205147893633693
435 0.00010974762699333951
436 0.00010784299956867471
437 0.00010595571075100452
438 0.00010425727668916807
439 0.00010225272126263008
440 0.00010055547318188474
441 9.861459693638608e-05
442 9.708567085908726e-05
443 9.56562944338657e-05
444 9.426892938790843e-05
445 9.25842541619204e-05
446 9.10312301130034e-05
447 8.940070983953774e-05
448 8.77333150128834e-05
449 8.611906378064305e-05
450 8.468777377856895e-05
451 8.338928455486894e-05
452 8.19775159470737e-05
453 8.072082709986717e-05
454 7.946923142299056e-05
455 7.825759530533105e-05
456 7.717753760516644e-05
457 7.6169082603883e-05
458 7.465239468729123e-05
459 7.345811172854155e-05
460 7.217844540718943e-05
461 7.124476542230695e-05
462 7.01950048096478e-05
463 6.911564560141414e-05
464 6.791950727347285e-05
465 6.690211739623919e-05
466 6.586840027011931e-05
467 6.515279528684914e-05
468 6.402654253179207e-05
469 6.307702278718352e-05
470 6.221963849384338e-05
471 6.139762990642339e-05
472 6.0607599

## 3. TensorFlow: Static Graphs

- PyTorch autograd looks a lot like TensorFlow: in both frameworks we define a computational graph, and use automatic differentiation to compute gradients. The biggest difference between the two is that TensorFlow’s computational graphs are static and PyTorch uses dynamic computational graphs.

- In TensorFlow, we define the computational graph once and then execute the same graph over and over again, possibly feeding different input data to the graph. In PyTorch, each forward pass defines a new computational graph.

- Static graphs are nice because you can optimize the graph up front; for example a framework might decide to fuse some graph operations for efficiency, or to come up with a strategy for distributing the graph across many GPUs or many machines. If you are reusing the same graph over and over, then this potentially costly up-front optimization can be amortized as the same graph is rerun over and over.

- One aspect where static and dynamic graphs differ is control flow. For some models we may wish to perform different computation for each data point; for example a recurrent network might be unrolled for different numbers of time steps for each data point; this unrolling can be implemented as a loop. With a static graph the loop construct needs to be a part of the graph; for this reason TensorFlow provides operators such as tf.scan for embedding loops into the graph. With dynamic graphs the situation is simpler: since we build graphs on-the-fly for each example, we can use normal imperative flow control to perform computation that differs for each input.

- To contrast with the PyTorch autograd example above, here we use TensorFlow to fit a simple two-layer net:

In [3]:
# Stage 1: describe the computation as a static graph. Nothing numeric runs
# until the graph is executed inside a session further down.
# NOTE: tf.placeholder / tf.random_normal / tf.Session are TensorFlow 1.x
# graph-mode APIs.

# Batch size, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Placeholders stand in for the input and target batches; concrete arrays are
# supplied through feed_dict each time the graph is run.
x = tf.placeholder(dtype=tf.float32, shape=(None, D_in))
y = tf.placeholder(dtype=tf.float32, shape=(None, D_out))

# Weights live in Variables, which keep their value from one graph execution
# to the next. They start out as random normal draws.
w1 = tf.Variable(tf.random_normal(shape=(D_in, H)))
w2 = tf.Variable(tf.random_normal(shape=(H, D_out)))

# Forward pass (symbolic): linear -> element-wise max with 0 -> linear.
h = tf.matmul(x, w1)
h_relu = tf.maximum(h, tf.zeros(1))
y_pred = tf.matmul(h_relu, w2)

# Sum-of-squares loss, also symbolic.
loss = tf.reduce_sum((y - y_pred) ** 2.0)

# Symbolic gradients of the loss with respect to both weight Variables.
grad_w1, grad_w2 = tf.gradients(loss, [w1, w2])

# Gradient-descent step expressed as graph operations: the assign ops only
# take effect when new_w1 / new_w2 are fetched in sess.run. Unlike the PyTorch
# version, the weight update is itself part of the computational graph.
learning_rate = 1e-6
new_w1 = w1.assign(w1 - learning_rate * grad_w1)
new_w2 = w2.assign(w2 - learning_rate * grad_w2)

# Stage 2: execute the graph inside a session.
with tf.Session() as sess:
    # One-off run that gives w1 and w2 their initial values.
    sess.run(tf.global_variables_initializer())

    # Concrete numpy data bound to the placeholders on every step.
    x_value = np.random.randn(N, D_in)
    y_value = np.random.randn(N, D_out)
    for _ in range(500):
        # Each run evaluates the loss and triggers both assign ops; the
        # fetched values come back as numpy arrays. Only the loss is used,
        # the two updated-weight arrays are discarded.
        loss_value, _, _ = sess.run(
            fetches=[loss, new_w1, new_w2],
            feed_dict={x: x_value, y: y_value},
        )
        print(loss_value)

32715924.0
29125668.0
30016506.0
30220736.0
26611302.0
19329942.0
11741231.0
6365991.5
3421416.0
1974623.5
1274740.8
913962.56
707143.9
573955.5
479082.75
406612.9
348929.4
301845.75
262654.3
229682.14
201698.44
177816.55
157317.34
139629.98
124302.86
110975.46
99340.984
89138.14
80161.84
72234.56
65218.33
58997.785
53464.88
48536.617
44132.414
40188.793
36651.227
33469.586
30604.592
28018.541
25679.996
23563.53
21643.879
19900.082
18314.252
16870.396
15554.379
14354.58
13258.964
12256.441
11338.295
10496.338
9723.522
9013.996
8361.281
7760.55
7207.157
6697.362
6227.3496
5793.7725
5392.7993
5021.9785
4679.126
4361.77
4067.631
3794.938
3542.0195
3307.3396
3089.4895
2887.0605
2698.8655
2523.8298
2361.016
2209.421
2068.2983
1936.8138
1814.398
1700.2499
1593.7512
1494.376
1401.6581
1315.0405
1234.1036
1158.4437
1087.7001
1021.5371
959.66986
901.72626
847.4718
796.6662
749.0558
704.4446
662.65674
623.4586
586.69196
552.1978
519.8264
489.4749
460.99512
434.24365
409.12396
385.5163
363.3295
3