# A gentle introduction to autograd

In [1]:
import torch
from torchvision.models import resnet18, ResNet18_Weights


In [2]:
# Build a ResNet-18 initialised with torchvision's default pretrained weights;
# the output below shows the checkpoint being fetched into the torch hub cache.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
# Bare last expression: display the module tree of the network.
model

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /home/jacky/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:02<00:00, 16.8MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Create a random data point and a random label to stand in for real training data.

In [3]:
# One random input sample: batch of 1, 3 channels, 64x64 spatial size
# (3 matches the input channels of the model's first convolution).
data = torch.rand((1, 3, 64, 64))
data.size()

torch.Size([1, 3, 64, 64])

In [4]:
# Random stand-in target: a single row of 1000 values
# (presumably one per output class of the model -- used as the target in the loss cell).
labels = torch.rand((1, 1000))
# Number of entries in the single label row.
labels.shape[1]

1000

In [5]:
# Forward pass: run the input through the model to obtain its output.
prediction = model(data)
# Display the output tensor (the saved output below is truncated).
prediction

tensor([[-0.5471, -0.3601, -0.5766, -1.6097, -0.8004, -0.0734, -0.4240,  0.5162,
          0.5866, -0.7817, -0.7790, -0.5281, -0.1267, -0.5577, -0.6884, -0.4300,
         -0.7605, -0.1028, -0.3426, -0.0780, -1.2749, -0.8207, -1.3612,  0.2971,
         -1.0207, -1.2224, -0.8409, -1.0722, -0.9225, -0.3185, -0.7497, -0.4956,
         -0.2488, -0.4675, -0.1717, -0.3460,  0.8394, -0.4641, -0.0452,  0.0781,
         -0.3977, -0.6807, -1.0175, -0.3272, -0.5962, -0.3508, -0.5133, -0.3403,
         -0.9671, -0.8803, -0.5656,  0.5173, -0.2384, -0.5200, -0.2775, -0.9980,
         -0.3741, -1.3372, -0.5660, -0.2946,  0.7124,  0.3102,  0.0271,  0.1963,
         -0.9435, -0.3425, -0.4200, -0.5194, -0.8051, -1.0086, -1.5574,  0.2412,
         -1.3263, -0.2419, -1.1495, -1.1833,  0.0692, -0.4113,  0.3540, -0.1846,
         -0.7113, -1.5173, -0.0337, -0.8982, -0.3972,  0.0255, -0.0175,  0.3435,
         -0.1315, -0.5954, -1.2194, -0.7814, -1.9044, -0.4405,  0.5860, -1.9948,
         -0.6230, -0.2824, -

In [6]:
# Toy "loss": the signed element-wise error, reduced to a single scalar so that
# backward() can later be called on it without an explicit gradient argument.
loss = torch.sum(prediction - labels)
loss

tensor(-509.2353, grad_fn=<SumBackward0>)

In [7]:
# The loss is an ordinary torch.Tensor (the previous output shows it carries a grad_fn).
type(loss)

torch.Tensor

In [8]:
# Backward pass: autograd back-propagates from the scalar loss and stores the
# resulting gradients in the `.grad` attribute of the tensors it was computed from.
loss.backward()

In [9]:
# Stochastic gradient descent over every model parameter,
# with learning rate 0.01 and momentum 0.95.
optim = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.95)
optim

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0.95
    nesterov: False
    weight_decay: 0
)

In [10]:
# Apply one SGD update to the parameters registered with the optimizer,
# using the gradients currently held in their `.grad` attributes.
optim.step()

# Differentiation in autograd

In [42]:
import torch

# Two tensors that autograd should track: requires_grad=True asks it to record
# every operation performed on them.
a = torch.tensor([3.0, 4.0], requires_grad=True)
b = torch.tensor([5.0, 7.0], requires_grad=True)
(a, b)

(tensor([3., 4.], requires_grad=True), tensor([5., 7.], requires_grad=True))

In [43]:
# Q = 3a^3 - b^2, evaluated element-wise, so Q is a vector with one entry per
# (a_i, b_i) pair and remembers how it was computed (grad_fn in the output).
Q = 3*a**3 - b**2
Q

tensor([ 56., 143.], grad_fn=<SubBackward0>)

If we let `a` and `b` be the parameters of a neural network and `Q` be the error, then during training we want the gradients of the error with respect to the parameters, i.e. the partial derivatives $\frac{\partial Q}{\partial a} = 9a^2$ and $\frac{\partial Q}{\partial b} = -2b$. When we call `.backward()` on `Q`, autograd automatically calculates these gradients and stores them in the respective tensors' `.grad` attribute. An explicit `gradient` argument is needed in `Q.backward()` because `Q` is a vector rather than a scalar; it must have the same shape as `Q`. Alternatively, we can first aggregate `Q` into a scalar with `.sum()` and then call `.backward()` on the result without any argument, e.g. `Q.sum().backward()`.

In [44]:
# Q is a vector, so backward() needs an upstream gradient of the same shape.
# All ones weights every component of Q equally.
external_grad = torch.ones_like(Q)
Q.backward(gradient=external_grad)

In [45]:
# The values of a, b and Q are unchanged by backward(); compare with the earlier outputs.
a, b, Q

(tensor([3., 4.], requires_grad=True),
 tensor([5., 7.], requires_grad=True),
 tensor([ 56., 143.], grad_fn=<SubBackward0>))

In [46]:
# Gradients filled in by Q.backward(): dQ/da and dQ/db evaluated at the current a and b.
a.grad, b.grad

(tensor([ 81., 144.]), tensor([-10., -14.]))

In [47]:
# Analytic derivative of Q = 3a^3 - b^2 with respect to a: dQ/da = 9a^2.
9*a**2

tensor([ 81., 144.], grad_fn=<MulBackward0>)

In [49]:
# Analytic derivative of Q = 3a^3 - b^2 with respect to b: dQ/db = -2b.
-2*b

tensor([-10., -14.], grad_fn=<MulBackward0>)

In [48]:
# Element-wise check that autograd's gradients equal the analytic derivatives
# dQ/da = 9a^2 and dQ/db = -2b.
print(torch.eq(a.grad, 9 * a**2))
print(torch.eq(b.grad, -2 * b))


tensor([True, True])
tensor([True, True])
