In [1]:
import torch
import numpy as np

# Introduction

https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html

# What is PyTorch?

Construct an uninitialized matrix; it contains whatever values were already present in the allocated memory:

In [2]:
torch.empty(5, 3)

tensor([[2.0362e-19, 7.5555e+31, 5.0778e+31],
        [1.2708e+31, 2.0362e-19, 4.9569e+33],
        [2.0193e-19, 4.2951e+24, 1.7866e+25],
        [6.6635e-33, 4.3612e+27, 8.7518e-04],
        [1.1692e-19, 1.5637e-01, 1.3000e+34]])

Construct randomly initialized matrix:

In [3]:
torch.rand(5, 4)

tensor([[0.3520, 0.9774, 0.3662, 0.7200],
        [0.9304, 0.6910, 0.8304, 0.4230],
        [0.8009, 0.0699, 0.1161, 0.0706],
        [0.4658, 0.3401, 0.7360, 0.5461],
        [0.1339, 0.6584, 0.5822, 0.2751]])

One can specify the data type:

In [4]:
torch.empty(1).dtype, torch.empty(1, dtype=torch.double).dtype, torch.empty(1, dtype=torch.long).dtype

(torch.float32, torch.float64, torch.int64)

Construct tensor from data:

In [5]:
torch.tensor([5.5, -1])

tensor([ 5.5000, -1.0000])

Get the size of a tensor; the returned `torch.Size` is actually a tuple:

In [6]:
x = torch.randn(10, 2)
x.size()

torch.Size([10, 2])

## Operations

### In place operations

In [7]:
y = torch.zeros(3, 2)
x = torch.ones_like(y)
y.add_(x)
print(y)

tensor([[1., 1.],
        [1., 1.],
        [1., 1.]])


All operations that mutate tensors in place are post-fixed with an underscore `_`.

### Indexing

One can use NumPy-like indexing:

In [8]:
y[:, 0]

tensor([1., 1., 1.])

### Resizing



In [9]:
# torch.arange builds the integer sequence 0..9 directly (int64 by default),
# without first materialising a Python range and converting it element by element.
x = torch.arange(10)
# view reinterprets the same 10 elements as a 2x5 matrix.
x.view(2, 5)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])

If a tensor has exactly one element, use `.item()` to get its value as a Python number:

In [10]:
x = torch.randn(1)
print(x)
print(x.item())

tensor([-0.9036])
-0.9035592675209045


## NumPy Bridge

The **Torch tensor and NumPy array will share their underlying memory location, if the Torch tensor is on CPU**. Changing one will change the other:

### Torch Tensor -> NumPy array

In [11]:
torch_array = torch.ones(2, 3)
torch_array

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [12]:
numpy_array = torch_array.numpy()
numpy_array

array([[1., 1., 1.],
       [1., 1., 1.]], dtype=float32)

In [13]:
torch_array[0, 0] = -1
numpy_array

array([[-1.,  1.,  1.],
       [ 1.,  1.,  1.]], dtype=float32)

In [14]:
numpy_array[-1, -1] = 8.0
torch_array

tensor([[-1.,  1.,  1.],
        [ 1.,  1.,  8.]])

### NumPy array -> Torch tensor

In [15]:
# Start from a NumPy vector of five ones and wrap it as a tensor.
numpy_array = np.ones((5,))
torch_array = torch.from_numpy(numpy_array)
# Increment the NumPy array in place; the tensor displayed below reflects
# the change because both objects share the same memory.
numpy_array += 1
torch_array

tensor([2., 2., 2., 2., 2.], dtype=torch.float64)

As a side note, we can see how the default data type differs depending on whether we initially build the array with NumPy or with Torch: Torch uses 32-bit floats by default, while NumPy uses 64-bit floats.

## CUDA Tensors

One can move tensors around from one device to another:

In [16]:
torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [17]:
x = torch.ones(2, 3)
print('x is on', x.device)

# Move the tensor to the GPU only when one is available, so the cell also
# runs on CPU-only machines.
if torch.cuda.is_available():
    device = torch.device('cuda')
    # Use the device object created above instead of repeating the string.
    x = x.to(device)
    print('x is on', x.device)

# The repr of a tensor also reports its device when it is not on the CPU
x

x is on cpu


tensor([[1., 1., 1.],
        [1., 1., 1.]])

# AutoGrad: Automatic Differentiation

> The `autograd` package provides automatic differentiation for all operations on Tensors. It is a define-by-run framework, which means that your backprop is defined by how your code is run, and that every single iteration can be different.

## Tensor

* One can set a tensor's `.requires_grad` attribute to `True` to **track all operations on the tensor**
* Once computations are done, call `.backward()` to compute gradients automatically
* Gradient for this tensor will be accumulated into `.grad` attribute
* To stop a tensor from tracking history, call `.detach()`, or wrap code block in `with torch.no_grad()`

The `Function` class is interconnected with `Tensor`. Each tensor has a `.grad_fn` attribute that references the `Function` that created it. Tensors created directly by the user have `grad_fn` set to `None`.

In [18]:
x = torch.ones(2, 3, requires_grad=True)
print('x.grad_fn =', x.grad_fn)
x

x.grad_fn = None


tensor([[1., 1., 1.],
        [1., 1., 1.]], requires_grad=True)

In [19]:
y = x + 2
y

tensor([[3., 3., 3.],
        [3., 3., 3.]], grad_fn=<AddBackward0>)

In [20]:
z = y * y * 3
z

tensor([[27., 27., 27.],
        [27., 27., 27.]], grad_fn=<MulBackward0>)

In [21]:
z.mean()

tensor(27., grad_fn=<MeanBackward0>)

One can modify the `requires_grad` of a Tensor in place:

In [22]:
# A tensor built without requires_grad does not track gradients, and neither
# does a tensor computed from it (first print).
a = torch.randn(2, 4)
a = (a * 3) / (a - 1)
print(a.requires_grad)
# requires_grad_ (note the trailing underscore) flips the flag in place
# (second print).
a.requires_grad_(True)
print(a.requires_grad)
# From here on operations are recorded, so the result carries a grad_fn.
b = torch.sum(a * a)
b

False
True


tensor(49.9867, grad_fn=<SumBackward0>)

## Gradients


We first show an example with a scalar output. A tensor holding a single value (such as the mean `z` below) does not need any argument to `.backward()`:

In [23]:
w = torch.ones(4, requires_grad=True)
x = w + 2
y = x * x - 1
z = y.mean()
z

tensor(8., grad_fn=<MeanBackward0>)

In [24]:
z.backward()

In [25]:
w.grad

tensor([1.5000, 1.5000, 1.5000, 1.5000])

Let's write what we just did:

$$\begin{align}
z &= \frac {1}{N} \sum _{i=1}^{N} y_i\\
&= \frac {1}{N} \sum _{i=1}^{N} \left[ \left( w_i + 2 \right)^2 - 1  \right]
\end{align}$$

with $N$ the number of elements. Calling `z.backward()` computes the gradient $\frac{d z}{dw}$ and stores it in `w.grad`. With $N = 4$ and $w_i = 1$:

$$\frac{d z}{dw_i} = \frac {2}{N} \left( w_i + 2 \right) = \frac 3 2$$

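Let's see what happens when accessing `.grad` on an intermediate (non-leaf) node. Autograd only keeps gradients for leaf tensors, so the attribute is `None` and PyTorch emits a warning: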

In [26]:
y.grad

  y.grad


Or calling `.backward` two times:

In [27]:
# z.backward() has already been run once above, so this second call is
# expected to fail; catch the error and show its message instead of
# aborting the notebook.
try:
    z.backward()
except Exception as err:
    print(err)

Trying to backward through the graph a second time, but the saved intermediate results have already been freed. Specify retain_graph=True when calling backward the first time.


Here's another example:

In [28]:
x = torch.randn(3, requires_grad=True)

y = 2 * x
# Keep doubling y until its norm reaches 1000. The norm is only used for the
# loop condition, so it is computed on a detached view: detach() is the
# supported replacement for the legacy .data attribute and, unlike .data,
# keeps autograd's in-place correctness checks active.
while y.detach().norm() < 1000:
    y *= 2
y

tensor([-869.7633,  133.1075, -560.7034], grad_fn=<MulBackward0>)

In [29]:
v = torch.tensor([0.1, 1.0, .0001], dtype=torch.float)
y.backward(v)
x.grad

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])

# Neural networks

https://pytorch.org/tutorials/beginner/blitz/neural_networks_tutorial.html#sphx-glr-beginner-blitz-neural-networks-tutorial-py

We'll use the `torch.nn` package, which depends on `autograd`. An `nn.Module` contains layers and a method `forward(input)` that returns the output. The typical NN training procedure is:

1. Define NN 
1. Iterate over input dataset
1. Process input through network (forward propagation)
1. Compute loss
1. Backward propagation: propagate gradients back into network's params
1. Update weights

## Define network




In [30]:
import torch.nn as nn
import torch.nn.functional as F

In [32]:
class Net(nn.Module):
    """Small convolutional network: two conv/pool stages and three linear layers.

    With the 3x3 kernels and 2x2 max-pooling used here, a 1x32x32 input reaches
    the first linear layer as 16 feature maps of size 6x6, which is the input
    width fc1 is built for.
    """

    def __init__(self):
        super().__init__()
        # Convolutional part: 1 input channel -> 6 -> 16 channels, 3x3 kernels.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)
        # Fully connected part (y = Wx + b): 16*6*6 -> 120 -> 84 -> 10.
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the 10 raw outputs per sample."""
        # Stage 1: convolution, ReLU, then 2x2 max pooling.
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))
        # Stage 2: same pattern; a single number means a square pooling window.
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)
        # Collapse everything except the batch dimension before the linear layers.
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of values per sample, i.e. the product of all
        dimensions of x except the first (batch) one."""
        count = 1
        for extent in x.shape[1:]:
            count = count * extent
        return count

In [33]:
net = Net()
net

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

In [38]:
p = list(net.parameters())
print(len(p))
# Weights of conv1
p[0].size()

10


torch.Size([6, 1, 3, 3])

We try a random 32x32 input:

In [43]:
# nsamples x nchannels x height x width
inp = torch.randn(1, 1, 32, 32)
out = net(inp)
out

tensor([[-0.0061, -0.0051,  0.0457, -0.0585,  0.1030, -0.0492,  0.0259,  0.0036,
          0.0830,  0.0232]], grad_fn=<AddmmBackward>)

We set gradient buffers to zero and backpropagate with random gradients:

In [44]:
net.zero_grad()
out.backward(torch.randn(1, 10))

## Loss function

In [83]:
# Forward pass on the same input as before.
out = net(inp)
# Draw 10 random target values and reshape them to (1, 10) so they line up
# with the network output.
target = torch.randn(10).view(1, -1)
criterion = nn.MSELoss()

# Mean squared error between prediction and target.
loss = criterion(out, target)
loss

tensor(0.7985, grad_fn=<MseLossBackward>)

If we follow `loss` in the backward direction using its `.grad_fn` attribute, we see a graph of computations that looks like this:

    input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
          -> view -> linear -> relu -> linear -> relu -> linear
          -> MSELoss
          -> loss

## Backprop

We call `loss.backward()`, but **should clear the existing gradients first**; otherwise the new gradients are accumulated into the existing ones.


In [84]:
# Clear the gradients left over from the earlier backward pass so that the
# two prints below show a clean before/after comparison.
# NOTE(review): the recorded output shows zeros here; newer PyTorch releases
# may reset gradients to None instead (see zero_grad's set_to_none option) —
# confirm against the installed version.
net.zero_grad()
print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate the loss; this fills in the parameters' .grad attributes,
# including the conv1 bias gradient printed next.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0013,  0.0091, -0.0020,  0.0023, -0.0072,  0.0136])


## Update weights

In [85]:
learning_rate = .01
# Plain gradient-descent step: weight <- weight - learning_rate * gradient.
# The in-place update runs under torch.no_grad() so autograd does not record
# it; this replaces direct manipulation of the legacy .data attribute and
# matches the manual updates used later in this notebook.
with torch.no_grad():
    for f in net.parameters():
        f.sub_(f.grad * learning_rate)

The rule above is plain stochastic gradient descent (SGD). We might want to use other update rules (e.g. SGD with momentum, Adam, RMSProp). The `torch.optim` package implements all these methods:

In [86]:
import torch.optim as optim

In [87]:
# SGD optimizer over all of the network's parameters, with learning rate 0.01.
optimizer = optim.SGD(net.parameters(), lr=.01)

# One iteration of the training loop:
optimizer.zero_grad()  # clear old gradients so they are not accumulated into this step
output = net(inp)
loss = criterion(output, target)
loss.backward()
optimizer.step() # does the actual update

# Learning PyTorch with examples

https://pytorch.org/tutorials/beginner/pytorch_with_examples.html



## Autograd

We implement a two-layer net:

In [98]:
# Tensor settings shared by every allocation below.
dtype = torch.float
device = torch.device('cpu')
kwargs = dict(dtype=dtype, device=device)

# Problem sizes: N samples per batch, D_in input features,
# H hidden units, D_out output features.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets. requires_grad is left at its default (False):
# no gradients are needed with respect to the data.
x = torch.randn(N, D_in, **kwargs)
y = torch.randn(N, D_out, **kwargs)

# Random initial weights; these are the tensors we differentiate against.
w1 = torch.randn(D_in, H, requires_grad=True, **kwargs)
w2 = torch.randn(H, D_out, requires_grad=True, **kwargs)


lr = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU (clamp at zero) -> linear.
    hidden = torch.mm(x, w1).clamp(min=0)
    yhat = torch.mm(hidden, w2)

    # Sum of squared errors; a scalar tensor, read out with .item().
    loss = ((yhat - y) ** 2).sum()
    if not t % 100:
        print(t, loss.item())

    # Backward pass: autograd fills w1.grad and w2.grad.
    loss.backward()

    # Gradient-descent step. It runs under no_grad because the update itself
    # must not be recorded in the graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(lr * weight.grad)
            # Gradients accumulate across backward() calls, so reset them
            # by hand before the next iteration.
            weight.grad.zero_()

0 40586116.0
100 666.6690063476562
200 4.381230354309082
300 0.039915841072797775
400 0.0006489593652077019


## Defining new autograd functions

Each primitive autograd operator is really two functions that operate on tensors:

* forward function
* backward function

We define our own relu activation function to implement a two-layer net:

In [101]:
class MyReLU(torch.autograd.Function):
    """Hand-written ReLU with an explicit forward and backward pass."""

    @staticmethod
    def forward(ctx, inp):
        """Return inp with every negative entry replaced by zero.

        ctx is the context object shared with backward; the input is stored on
        it through save_for_backward so the backward pass can tell which
        entries were negative.
        """
        ctx.save_for_backward(inp)
        return torch.clamp(inp, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Turn the gradient of the loss w.r.t. the output into the gradient
        w.r.t. the input: it passes through unchanged except where the input
        was negative, where it becomes zero."""
        (saved_inp,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output itself is not modified.
        return grad_output.masked_fill(saved_inp < 0, 0)

    
# Same two-layer regression setup as in the Autograd section, but using the
# hand-written MyReLU in place of clamp(min=0).
dtype = torch.float
device = torch.device('cpu')
kwargs = {'dtype': dtype, 'device': device}

# N = batch size, D_in = input dimension
# H = hidden dimension, D_out = output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Implicitly set requires_grad=False, we don't need to compute gradients
# with respect to input/output
x = torch.randn(N, D_in, **kwargs)
y = torch.randn(N, D_out, **kwargs)

# Weights are the tensors we differentiate against.
w1 = torch.randn(D_in, H, requires_grad=True, **kwargs)
w2 = torch.randn(H, D_out, requires_grad=True, **kwargs)

lr = 1e-6
# A custom autograd Function is invoked through its .apply attribute. The
# alias does not change between iterations, so it is bound once here rather
# than on every pass through the loop.
relu = MyReLU.apply
for t in range(500):
    # Forward prop: linear -> custom ReLU -> linear
    yhat = relu(x.mm(w1)).mm(w2)

    # Sum of squared errors (a scalar tensor)
    loss = (yhat - y).pow(2).sum()
    if t % 100 == 0:
        print(t, loss.item())

    # Backprop; the ReLU step goes through MyReLU.backward
    loss.backward()

    # Manual gradient-descent update, wrapped in no_grad so it is not tracked
    with torch.no_grad():
        w1 -= lr * w1.grad
        w2 -= lr * w2.grad

        # Manually zero the gradients
        w1.grad.zero_()
        w2.grad.zero_()

0 32518864.0
100 458.3034362792969
200 1.2079037427902222
300 0.005408933851867914
400 0.00013244572619441897
