# Learning PyTorch with Examples

Notes on [Learning PyTorch with Examples](https://pytorch.org/tutorials/beginner/pytorch_with_examples.html), MLW, 2018-08-24

## numpy

Feed-forward neural network with RELU activation, training by backprop, implemented in numpy

In [1]:
import numpy as np

In [2]:
%%time

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: compute predicted y
    h = x.dot(w1)
    h_relu = np.maximum(h, 0)
    y_pred = h_relu.dot(w2)

    # Compute and print loss
    loss = np.square(y_pred - y).sum()
    if t % 100 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)     # d(loss)/d(ypred)
    grad_w2 = h_relu.T.dot(grad_y_pred)  # d(loss)/d(w2)
    grad_h_relu = grad_y_pred.dot(w2.T)  # preliminary calc to get d(loss)/dh
    grad_h = grad_h_relu.copy()          # (...not sure if this copy is necessary...)
    grad_h[h < 0] = 0                    # d(loss)/dh (handles floor correctly)
    grad_w1 = x.T.dot(grad_h)            # d(loss)/dw1

    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 35732805.56735114
100 629.571156889594
200 3.473492796774245
300 0.030967769234765373
400 0.00037982706822323333
CPU times: user 1.33 s, sys: 111 ms, total: 1.45 s
Wall time: 761 ms


## pytorch without autograd

Same network but with pytorch tensors

In [3]:
import torch

In [4]:
%%time

dtype = torch.float
device = torch.device("gpu") if torch.cuda.is_available() else torch.device("cpu")

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data (on specific device, with specific dtype)
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)              # matrix multiply, i.e. dot product
    h_relu = h.clamp(min=0)   # equivalent of max(x, 0), i.e. ReLu
    y_pred = h_relu.mm(w2)    # matrix multiply

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum().item()  # get scalar value from Tensor of shape (1,)
    if t % 100 == 0:
        print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)   # .t() method equivalent to .T attribute in numpy
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()           # .clone() method equivalent to x.copy() in numpy
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 40108388.0
100 904.4852294921875
200 8.899497985839844
300 0.13888390362262726
400 0.00272913696244359
CPU times: user 738 ms, sys: 47.2 ms, total: 785 ms
Wall time: 627 ms


## pytorch with autograd

In [5]:
%%time

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Setting requires_grad=False (the default) indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass. We don't need those gradients because 
# the input/output is fixed, so there's no use in knowning the gradients.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: same operations as by hand implentation but we don't need intermediate values
    # since we are not implementing the backward pass by hand. They're kept in buffers.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Calling loss.item(), i.e. detatching from graph, to print here, but keeping loss on the
    # graph for backprop
    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        print(t, loss.item())

    # Backward pass: i.e. compute the gradient of loss with respect to all Tensors with
    # requires_grad=True. After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()
        
    # Other options for update: 
    # - operate on weight.data and weight.grad.data. 
    # This works because the data attribute gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # - use torch.optim.SGD

0 32613180.0
100 350.14276123046875
200 1.2929446697235107
300 0.007486655376851559
400 0.00017354465671814978
CPU times: user 640 ms, sys: 56.1 ms, total: 696 ms
Wall time: 540 ms


## Defining functions that support autograd

Subclass `torch.autograd.Function` and implementing the forward and backward staticmethods

These methods receive two parameters. The first is context that can be used to stash information, most usuually to make a note of stuff that's going to be needed on the backward pass.

For the `forward` method, the second parameter is the input to the function. It computes the output.

For the `backward` method, the second parameter is the gradient of a scalar value (typically the loss) wrt to the output. It computes the gradient of the loss wrt the input.

In [6]:
class MyReLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input):
        ctx.save_for_backward(input)  # cache arbitrary object for use in the backward path
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        input, = ctx.saved_tensors        # recall tuple of cached objects
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0         # this is where we use the cached input of the most recent forward pass
        return grad_input

Replacing clamp method in previous implemetation with our own `MyReLU` class

In [7]:
N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

# note we don't instantiate the class, and we need to call it's apply method to actually apply it
relu = MyReLU.apply  

for t in range(500):
    y_pred = relu(x.mm(w1)).mm(w2)

    loss = (y_pred - y).pow(2).sum()
    if t % 100 == 0:
        print(t, loss.item())

    loss.backward()

    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()

0 27504324.0
100 453.4012145996094
200 3.307530403137207
300 0.04264489561319351
400 0.0009226317051798105


## Using the higher level `nn` module

In [8]:
%%time

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# This creates a module which is made up of a pipeline of other modules.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

# This defines a loss function (an object with __call__)
# In this case the sum of the MSE of the training batch (by default it computes the mean)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Not clear to me why this network learns more slowly with the same learning rate
learning_rate = 1e-4

for t in range(500):
    # Module objects override the __call__ operator so you can call them like functions.
    # Why don't autograd functions do this too?!
    y_pred = model(x)

    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())

    # Zero the gradients before running the backward pass (or after the update step)
    model.zero_grad()
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its gradients like we did before.
    with torch.no_grad():
        # This iterates over all the trainable parameters of the model
        for param in model.parameters():
            param -= learning_rate * param.grad

0 724.2374267578125
100 2.6408348083496094
200 0.05410875752568245
300 0.0028587800916284323
400 0.00023422302911058068
CPU times: user 772 ms, sys: 63.1 ms, total: 835 ms
Wall time: 706 ms


In [9]:
list(model.parameters())[0].shape  # weights of first linear layer

torch.Size([100, 1000])

In [10]:
list(model.parameters())[1].shape  # biases of first linear layer

torch.Size([100])

In [11]:
list(model.parameters())[2].shape  # weights of second linear layer

torch.Size([10, 100])

In [12]:
list(model.parameters())[3].shape  # biases of second linear layer

torch.Size([10])

## Using torch.optim to handle updates

In [13]:
%%time

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)

loss_fn = torch.nn.MSELoss(reduction='sum')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())
    model.zero_grad()
    loss.backward()
    optimizer.step()  # this replaces the for loop and mutation

0 652.7678833007812
100 54.322296142578125
200 0.5582529902458191
300 0.0030172590631991625
400 6.950896204216406e-05
CPU times: user 1.11 s, sys: 68.6 ms, total: 1.18 s
Wall time: 1.05 s


## Custom modules

Subclass `nn.Module` and define a `forward` method which receives input Tensors and produces output Tensors using other modules or other autograd operations on Tensors. The `backward` method is then inferred.

In [14]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        # this can be super().__init__() in python3 I think
        super(TwoLayerNet, self).__init__() 
        # trainable (sub)modules are typically instance variables
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        # Forward method accepts a Tensor of input, returns a Tensor of output.
        # Can use instance attribute modules and arbitrary operators on Tensors.
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred

In [15]:
%%time

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 725.6034545898438
100 2.743405818939209
200 0.047122273594141006
300 0.0015621890779584646
400 6.537259469041601e-05
CPU times: user 835 ms, sys: 52 ms, total: 887 ms
Wall time: 787 ms


## Weird network to demonstrate flexible control flow

In [16]:
import random


class DynamicNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        For the forward pass of the model, we randomly choose either 0, 1, 2, or 3
        and reuse the middle_linear Module that many times to compute hidden layer
        representations.
        
        Note the number of trainable parameters doesn't change. The weights are shared
        by the 0, 1, 2 or 3 hidden layers.
        """
        h_relu = self.input_linear(x).clamp(min=0)
        for _ in range(random.randint(0, 3)):
            h_relu = self.middle_linear(h_relu).clamp(min=0)
        y_pred = self.output_linear(h_relu)
        return y_pred

In [17]:
%%time

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = TwoLayerNet(D_in, H, D_out)

criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if t % 100 == 0:
        print(t, loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 794.5697631835938
100 1.9950809478759766
200 0.02538307011127472
300 0.0009435893152840436
400 5.847218199050985e-05
CPU times: user 688 ms, sys: 49.4 ms, total: 738 ms
Wall time: 586 ms
