# torch.autograd for automatic differentiation (CPU implementation)



## General Usage

In [30]:
import torch
from torchvision.models import resnet18, ResNet18_Weights # trained on ImageNet dataset (1000 classes of objects)

In [31]:
# Pretrained ResNet-18 plus one random sample to push through it.
model = resnet18(weights=ResNet18_Weights.DEFAULT)
data = torch.rand((1, 3, 64, 64))  # dummy image batch: (batch_size, channels, height, width)
labels = torch.rand((1, 1000))  # random target with one value per each of the 1000 classes

In [32]:
# Forward pass: run the dummy batch through the model.
# The computational graph is built implicitly while this call executes.
prediction = model(data)

In [33]:
# Collapse the element-wise error between prediction and labels into a scalar loss.
loss = torch.sum(prediction - labels)
# Backward pass: autograd calculates the gradient for each model parameter
# and stores it in that parameter's .grad attribute.
loss.backward()

In [34]:
# SGD optimizer with every model parameter registered, learning rate 0.01 and momentum 0.9.
optim = torch.optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)
# Run one gradient-descent step: each parameter is adjusted using the gradient stored in its .grad.
optim.step()

## Differentiation in Autograd

In the directed acyclic graph (DAG) / computational graph, leaves are input tensors, roots are output tensors. 

By tracing the graph from roots to leaves, the gradients can be computed automatically using the chain rule.

In [35]:
import torch

# requires_grad=True tells autograd to track every operation on these tensors.
# Writing the values as float literals makes both tensors floating point.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)

In [36]:
# Q = 3a^3 - b^2, evaluated element-wise; Q is a new tensor created from a and b.
Q = 3 * a.pow(3) - b.pow(2)

`a` and `b` represent the parameters of a neural network, and `Q` represents the error.

**We need to explicitly pass a `gradient` argument to `Q.backward()` because `Q` is a vector rather than a scalar. `gradient` is a tensor of the same shape as `Q`, and it represents the gradient of `Q` w.r.t. itself.**

In [37]:
# Q is a vector, so backward() needs an explicit gradient tensor with Q's shape;
# a tensor of ones stands for the gradient of Q with respect to itself.
external_grad = torch.ones_like(Q)
Q.backward(gradient=external_grad)
# Note: after each backward call, autograd starts populating a new graph (rebuilt from scratch each time).

# Compare the collected gradients with the analytic ones: dQ/da = 9a^2 and dQ/db = -2b.
print(a.grad == 9 * a ** 2)
print(b.grad == -2 * b)

tensor([True, True])
tensor([True, True])


## Exclusion from the DAG

Parameters that don't compute gradients are usually called **frozen parameters**. Freezing parameters reduces the computational cost of the backward pass, and it is often done when fine-tuning a pretrained model.

In [38]:
# x and y are created without requires_grad (it is off by default); only z asks to be tracked.
x, y = torch.rand(5, 5), torch.rand(5, 5)
z = torch.rand((5, 5), requires_grad=True)

# Neither input requires gradients, so their sum does not either.
a = torch.add(x, y)
print(f"Does `a` require gradients?: {a.requires_grad}")
# A single input that requires gradients is enough for the result to require them too.
b = torch.add(x, z)
print(f"Does `b` require gradients?: {b.requires_grad}")

Does `a` require gradients?: False
Does `b` require gradients?: True


### Fine-Tuning Example

In [39]:
from torch import nn, optim

# Pretrained ResNet-18 with the default weights (IMAGENET1K_V1).
model = resnet18(weights=ResNet18_Weights.DEFAULT)

# Freeze the whole network: requires_grad_(False) switches off gradient
# tracking for every parameter of the module in a single call.
model.requires_grad_(False)

print(model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

**`bias=False` is used in the convolution layers mainly because each one is followed by BatchNorm: the purpose of a bias is to let a layer shift its output values up or down, and BatchNorm essentially does this already.**

Fine-tune model on new dataset with 10 labels.

In ResNet, the classifier is the last linear layer, `model.fc`. We simply replace it with a new linear layer (unfrozen by default) that acts as our classifier.


In [40]:
# Replace the classifier head with a fresh 10-class linear layer. The input width is
# read from the layer being replaced instead of hard-coding ResNet-18's 512, so the
# cell stays correct if the backbone changes.
# Parameters of a newly constructed nn.Linear require gradients by default (plain
# tensors, by contrast, default to requires_grad=False), so this layer is unfrozen.
model.fc = nn.Linear(model.fc.in_features, 10)

In [41]:
# Optimize only the classifier (last layer): hand SGD just the parameters that
# still require gradients, instead of registering every frozen parameter as well.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=1e-2, momentum=0.9)