# A gentle introduction to autograd

In [1]:
import torch
from torchvision.models import resnet18, ResNet18_Weights


In [17]:
# Build a ResNet-18 using torchvision's default weights entry for this architecture.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
model  # bare trailing expression: the notebook shows the layer-by-layer module repr

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

Create a random data point (one 3-channel, 64×64 input) and a random label vector (1000 values, one per model output).

In [18]:
# Seed the global RNG before the first random draw so that `data` (and the
# random tensors created in later cells) are identical on every
# Restart Kernel -> Run All.
torch.manual_seed(0)

# A single fake input: batch of 1, 3 channels, 64x64 spatial size.
data = torch.rand(1, 3, 64, 64)
data.shape

torch.Size([1, 3, 64, 64])

In [19]:
# Random target vector: one row holding 1000 values.
labels = torch.rand((1, 1000))
labels.size(1)  # number of entries in the single label row

1000

In [20]:
# Forward pass: feed the random input through the model to obtain its output tensor.
prediction = model(data)
prediction  # displayed so the raw output values can be inspected

tensor([[-6.3137e-01, -5.1080e-01, -1.9999e-01, -1.4795e+00, -4.7816e-01,
         -3.1859e-03, -5.1857e-01,  4.0462e-01,  2.7463e-01, -9.4749e-01,
         -1.1470e+00, -9.0036e-01, -5.1541e-01, -8.9454e-01, -1.4960e+00,
         -6.8003e-01, -6.6436e-01, -3.3788e-01, -5.0312e-01, -4.7172e-01,
         -1.4053e+00, -6.8696e-01, -1.5180e+00,  9.7142e-02, -9.0908e-01,
         -1.3029e+00, -1.0694e+00, -1.3421e+00, -1.0289e+00, -6.3022e-01,
         -1.2259e+00, -9.3632e-01, -7.3545e-01, -4.2059e-01, -3.0475e-01,
         -4.0363e-01,  6.5356e-01, -6.9702e-01, -3.6015e-01, -1.6268e-01,
         -9.7075e-01, -1.0906e+00, -1.3824e+00, -5.9527e-01, -7.7175e-01,
         -2.5186e-01, -9.2992e-01, -7.5942e-01, -1.2696e+00, -9.1445e-01,
         -2.2646e-01,  5.5522e-01, -5.3136e-01, -4.3897e-01, -1.4480e-01,
         -1.0336e+00, -1.4188e-01, -1.4580e+00, -2.6605e-01, -4.3717e-01,
          7.2876e-01,  1.3341e-01, -2.0620e-01,  2.8914e-01, -7.8507e-01,
         -4.6990e-02, -1.1672e-01,  6.

In [21]:
# Collapse the element-wise difference between output and labels into one scalar.
loss = torch.sum(prediction - labels)
loss

tensor(-505.1766, grad_fn=<SumBackward0>)

In [22]:
# Check what kind of object the loss is (the cell output shows torch.Tensor).
type(loss)

torch.Tensor

In [23]:
# Backward pass: `loss` is a scalar, so no explicit `gradient` argument is required.
# Autograd computes the gradients and stores them in the `.grad` attribute of the tensors involved.
loss.backward()

In [28]:
# SGD with momentum, registered over every parameter of the model.
optim = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.95)
optim

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0.95
    nesterov: False
    weight_decay: 0
)

In [29]:
# Perform a single optimizer step on the parameters registered with `optim`,
# using the gradients produced by the earlier `loss.backward()` call.
optim.step()

# differentiation in autograd

In [42]:
import torch

# Two leaf tensors that autograd should track (requires_grad=True).
a = torch.tensor([3.0, 4.0], requires_grad=True)
b = torch.tensor([5.0, 7.0], requires_grad=True)
(a, b)

(tensor([3., 4.], requires_grad=True), tensor([5., 7.], requires_grad=True))

In [43]:
# Q = 3a^3 - b^2, evaluated element-wise on the two tracked tensors.
Q = 3 * a.pow(3) - b.pow(2)
Q

tensor([ 56., 143.], grad_fn=<SubBackward0>)

If we let `a` and `b` be the parameters of a neural network and `Q` be the error, then what we want are the gradients of the error, i.e. its partial derivatives with respect to the parameters: $\frac{\partial Q}{\partial a} = 9a^2$ and $\frac{\partial Q}{\partial b} = -2b$. When we call `.backward()` on `Q`, autograd automatically calculates these gradients and stores them in each tensor's `.grad` attribute. An explicit `gradient` argument is needed in `Q.backward()` because `Q` is a vector rather than a scalar. Alternatively, we can aggregate `Q` into a scalar using `.sum()` and then call `.backward()` on the result, i.e. `Q.sum().backward()`.

In [44]:
# Q is a vector, so backward() is given an explicit gradient of the same shape
# (all ones here).
external_grad = torch.tensor([1.0, 1.0])
Q.backward(gradient=external_grad)

In [45]:
# Display the inputs and Q again after the backward call.
a, b, Q

(tensor([3., 4.], requires_grad=True),
 tensor([5., 7.], requires_grad=True),
 tensor([ 56., 143.], grad_fn=<SubBackward0>))

In [46]:
# Gradients that Q.backward() stored on the leaf tensors.
a.grad, b.grad

(tensor([ 81., 144.]), tensor([-10., -14.]))

In [47]:
# Hand-derived dQ/da = 9a^2, for comparison with a.grad.
9 * a.pow(2)

tensor([ 81., 144.], grad_fn=<MulBackward0>)

In [49]:
# Hand-derived dQ/db = -2b, for comparison with b.grad.
-2 * b

tensor([-10., -14.], grad_fn=<MulBackward0>)

In [48]:
# Element-wise check that autograd's gradients match the hand-derived ones.
print(torch.eq(9 * a**2, a.grad))
print(torch.eq(-2 * b, b.grad))


tensor([True, True])
tensor([True, True])
