# A gentle introduction to autograd

In [1]:
import torch
from torchvision.models import resnet18, ResNet18_Weights


In [5]:
# Load ResNet-18 with its default pretrained weights; the bare `model`
# expression below makes the notebook display the module tree.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

random datapoint and random label

In [6]:
# Seed the global RNG so this cell (and the random labels drawn after it)
# produce the same tensors on every Restart & Run All.
torch.manual_seed(0)

# One random 3-channel 64x64 input in a batch of size 1.
data = torch.rand(1, 3, 64, 64)
data.shape

torch.Size([1, 3, 64, 64])

In [7]:
# Random target for the single sample: one value per model output (1000).
labels = torch.rand(1, 1000)
labels.shape[-1]

1000

In [8]:
# forward pass: run the random input through the model to get its raw outputs
prediction = model(data)
prediction

tensor([[-8.4540e-01, -5.7048e-01, -6.7406e-01, -1.8693e+00, -8.6959e-01,
         -2.5654e-01, -7.7244e-01,  4.8245e-01,  5.7862e-01, -6.7575e-01,
         -7.8480e-01, -8.4431e-01, -1.6934e-01, -9.1002e-01, -1.2424e+00,
         -5.0489e-01, -8.2507e-01, -3.1685e-01, -2.8090e-01, -5.0147e-01,
         -1.2400e+00, -6.6627e-01, -1.2883e+00,  1.3171e-01, -1.0650e+00,
         -1.3272e+00, -8.7364e-01, -1.2918e+00, -1.1954e+00, -4.2201e-01,
         -8.7971e-01, -8.4451e-01, -5.1901e-01, -9.6086e-01, -5.5852e-01,
         -5.7839e-01,  4.5073e-01, -9.6552e-01, -6.5845e-01,  6.7949e-02,
         -7.0722e-01, -8.6755e-01, -1.2081e+00, -2.3011e-01, -6.1143e-01,
         -4.7931e-01, -9.6323e-01, -2.4569e-01, -1.1342e+00, -1.1293e+00,
         -6.1079e-01,  4.1723e-01, -3.2438e-01, -7.2536e-01, -4.3572e-01,
         -1.3641e+00, -4.4534e-01, -1.5375e+00, -7.0851e-01, -5.2085e-01,
          7.3145e-01,  1.2916e-01, -1.7046e-01, -1.5844e-01, -1.0370e+00,
         -6.1310e-01, -4.0188e-01, -3.

In [9]:
# Check grad before backward (should be None: nothing has been backpropagated yet)
print(list(model.parameters())[0].grad)  # None until loss.backward() runs

None


In [10]:
# Toy scalar loss: the sum of the element-wise differences between the
# prediction and the random labels. Reducing to a scalar lets backward()
# be called without an explicit gradient argument.
loss = (prediction - labels).sum()
loss

tensor(-500.3949, grad_fn=<SumBackward0>)

In [11]:
type(loss)

torch.Tensor

In [12]:
loss.backward()

In [13]:
# After backward(), the first parameter tensor's .grad holds gradient values
first_param = next(iter(model.parameters()))
print(first_param.grad)

tensor([[[[-2.8034e-04, -4.3571e-04, -6.0502e-04,  ...,  1.4827e-04,
           -6.7896e-04,  6.9517e-04],
          [-2.6247e-04,  1.0883e-03, -1.1197e-03,  ..., -5.0035e-04,
            2.7686e-04, -4.4323e-04],
          [ 3.5893e-04,  4.2283e-04,  3.1428e-04,  ..., -7.0000e-04,
           -1.4655e-05,  2.6408e-04],
          ...,
          [ 1.3935e-05,  2.9207e-04,  1.1957e-03,  ...,  3.1835e-04,
           -7.7647e-04,  1.8239e-04],
          [ 7.5836e-06, -4.3775e-04, -6.2779e-04,  ...,  7.9577e-05,
           -5.7791e-04, -2.1012e-04],
          [ 1.6683e-04,  6.8008e-04, -9.2604e-04,  ..., -4.3892e-04,
           -5.1387e-04, -3.9908e-04]],

         [[-2.1648e-04, -3.3837e-04, -4.9625e-04,  ..., -1.0410e-03,
            1.8463e-04, -2.0342e-04],
          [-3.4355e-04,  3.1044e-04, -3.0627e-04,  ...,  9.9713e-05,
           -7.5910e-05,  8.9708e-04],
          [-1.9173e-04, -8.2307e-04,  5.1731e-04,  ..., -8.8237e-05,
            6.0596e-04,  4.1310e-04],
          ...,
     

In [14]:
# Register every model parameter with SGD (learning rate 0.01, momentum 0.95).
# NOTE(review): a later cell runs `from torch import nn, optim`, which rebinds
# the name `optim` from this optimizer instance to the torch.optim module.
optim = torch.optim.SGD(
    model.parameters(),
    lr=1e-2,
    momentum=0.95
)
optim

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    momentum: 0.95
    nesterov: False
    weight_decay: 0
)

In [15]:
# Apply one SGD update using the gradients that loss.backward() stored in .grad
optim.step()

# differentiation in autograd

In [16]:
# Two leaf tensors that autograd will track (requires_grad=True).
# torch is already imported in the notebook's first cell, so it is not
# re-imported here.
a = torch.tensor([3., 4.], requires_grad=True)
b = torch.tensor([5., 7.], requires_grad=True)
a, b

(tensor([3., 4.], requires_grad=True), tensor([5., 7.], requires_grad=True))

When we set `requires_grad=True` on a tensor, we are asking PyTorch to *track the operations on that tensor* in its computation graph so that gradients can be computed later.

In [17]:
# Element-wise Q = 3a^3 - b^2; the result carries a grad_fn because a and b require grad
Q = 3*a**3 - b**2
Q

tensor([ 56., 143.], grad_fn=<SubBackward0>)

If we let `a` and `b` be the parameters of a NN and `Q` be the error, then the gradients we want are the partial derivatives of the error with respect to the parameters. When we call `.backward()` on `Q`, autograd automatically calculates these partial derivatives (the gradients) and stores them in each tensor's `.grad` attribute. An explicit `gradient` argument is needed in `Q.backward()` because `Q` is a vector. Alternatively, we can aggregate `Q` into a scalar using `.sum()` and then call `.backward()` on the result without any argument.

In [18]:
# Q is a vector, so backward() needs an explicit upstream gradient of the
# same shape; a tensor of ones weights every element of Q equally.
external_grad = torch.ones_like(Q)
Q.backward(gradient=external_grad)

In [19]:
a, b, Q

(tensor([3., 4.], requires_grad=True),
 tensor([5., 7.], requires_grad=True),
 tensor([ 56., 143.], grad_fn=<SubBackward0>))

In [None]:
a.grad, b.grad

(tensor([ 81., 144.]), tensor([-10., -14.]))

In [None]:
# Analytic dQ/da = 9a^2, for comparison with a.grad
9*a**2

tensor([ 81., 144.], grad_fn=<MulBackward0>)

In [None]:
# Analytic dQ/db = -2b, for comparison with b.grad
-2*b

tensor([-10., -14.], grad_fn=<MulBackward0>)

In [None]:
# Element-wise check that autograd's gradients equal the analytic derivatives
print(9*a**2 == a.grad)
print(-2*b == b.grad)


tensor([True, True])
tensor([True, True])


# vector calculus with `autograd`


Mathematically speaking, `torch.autograd` is just an engine for computing the vector-Jacobian product, that is, $$J^T \cdot \overrightarrow{v}$$

Now if we take it a step further: if $\overrightarrow{v}$ just so happens to be the gradient of a scalar function $l = g(\overrightarrow{y})$, then by the chain rule, the vector-Jacobian product is the gradient of $l$ with respect to $\overrightarrow{x}$. In the example above, `external_grad` represents $\overrightarrow{v}$.

## computational graphs 

you can understand autograd as something that keeps a record of data/tensors and all executed operations in a DAG, consisting of function objects. the leaves are the input tensors, roots are the output tensors. trace the graph from the root to the leaves, and you can get the gradients using the chain rule. 

In the forward pass, `autograd` does two things:

- run the operation to compute a resulting tensor
- maintain the **gradient function** of the operation in the DAG

the backward pass then kicks off when you call `.backward()` (called on the DAG root). `autograd` will then

- compute the gradients from each `.grad_fn` (grad function)
- accumulate them in the respective tensor's `.grad` attribute, and 
- use the chain rule to propagate everything back to the leaf tensors. 



Visually:

![image.png](figures/dag_autograd.png)

> DAGs are dynamic in PyTorch! This means that the graph is recreated from scratch after each `.backward()` call; a new graph is built every time. This is what allows you to use control flow statements in the model and to change the shape, size, and operations at each iteration.

## exclusion from DAG

If a tensor's `requires_grad` is set to `False`, it is going to be excluded from the DAG. This is how you freeze parameters!

in this resnet example, you freeze the parameters by excluding all of the parameters from the DAG

In [20]:
# NOTE(review): this import rebinds `optim` (an SGD instance in an earlier
# cell) to the torch.optim module.
from torch import nn, optim

# Load a fresh pretrained ResNet-18, replacing the model that was updated earlier.
model = resnet18(weights=ResNet18_Weights.DEFAULT)



In [22]:
# Freeze the whole network by turning off gradient tracking for every parameter.
# Note: idx enumerates the tensors yielded by model.parameters(), which is not
# the same thing as counting layers, despite the printed label.
for idx, param in enumerate(model.parameters()):
    print(f"Layer: {idx}")
    param.requires_grad = False

Layer: 0
Layer: 1
Layer: 2
Layer: 3
Layer: 4
Layer: 5
Layer: 6
Layer: 7
Layer: 8
Layer: 9
Layer: 10
Layer: 11
Layer: 12
Layer: 13
Layer: 14
Layer: 15
Layer: 16
Layer: 17
Layer: 18
Layer: 19
Layer: 20
Layer: 21
Layer: 22
Layer: 23
Layer: 24
Layer: 25
Layer: 26
Layer: 27
Layer: 28
Layer: 29
Layer: 30
Layer: 31
Layer: 32
Layer: 33
Layer: 34
Layer: 35
Layer: 36
Layer: 37
Layer: 38
Layer: 39
Layer: 40
Layer: 41
Layer: 42
Layer: 43
Layer: 44
Layer: 45
Layer: 46
Layer: 47
Layer: 48
Layer: 49
Layer: 50
Layer: 51
Layer: 52
Layer: 53
Layer: 54
Layer: 55
Layer: 56
Layer: 57
Layer: 58
Layer: 59
Layer: 60
Layer: 61


Then say we want to finetune this model on a dataset with 10 new labels. For this particular model, the classifier is the last linear layer (`model.fc`). We can just replace it with a new linear layer (unfrozen by default) that acts as the classifier.

In [26]:
print(f"original layer: {model.fc}")

original layer: Linear(in_features=512, out_features=1000, bias=True)


In [None]:
# Swap the 1000-output classifier for a new 10-output linear layer. It is created
# after the freezing loop ran, so its parameters were never set to requires_grad=False.
model.fc = nn.Linear(512, 10)
print(f"updated layer: {model.fc}")

updated layer: Linear(in_features=512, out_features=10, bias=True)


In [30]:
# All parameters are registered here, including the frozen ones.
# NOTE(review): only parameters that receive gradients (the new model.fc) are
# expected to be updated by step() -- confirm against the torch.optim.SGD docs.
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

In [38]:
# New random input plus a 10-wide random target matching the replaced classifier,
# followed by a forward pass through the partially frozen model.
data = torch.rand(1, 3, 64, 64)
labels = torch.rand(1, 10)
prediction = model(data)

In [39]:
# Clear gradients left over from an earlier pass: backward() accumulates into
# .grad, so re-running the forward/backward cells without this would make
# step() use the sum of old and new gradients.
optimizer.zero_grad()

# Toy scalar loss, then backpropagate and apply one SGD update.
loss = (prediction - labels).sum()
loss.backward()
optimizer.step()