
# Automatic Differentiation
- [2.5 Automatic Differentiation](https://d2l.ai/chapter_preliminaries/autograd.html)
- [Automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
- [A Gentle Introduction to torch.autograd](https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html)
- [Automatic Differentiation with torch.autograd](https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html)
- [Of VJPs and JVPs](https://maximerobeyns.com/of_vjps_and_jvps)
- [The Autodiff Cookbook](https://jax.readthedocs.io/en/latest/notebooks/autodiff_cookbook.html)

## Optimization
- [Optimization problem](https://en.wikipedia.org/wiki/Optimization_problem)
- [Gradient descent](https://en.wikipedia.org/wiki/Gradient_descent)
- [Optimization and Convergence](https://physicsbaseddeeplearning.org/overview-optconv.html)

## Matrix Multiplication
- [Inside the Matrix: Visualizing Matrix Multiplication, Attention and Beyond](https://pytorch.org/blog/inside-the-matrix/)

## Matrix Calculus
- [Layout conventions](https://en.wikipedia.org/wiki/Matrix_calculus#Layout_conventions)
- [Matrix Calculus for Machine Learning and Beyond](https://ocw.mit.edu/courses/18-s096-matrix-calculus-for-machine-learning-and-beyond-january-iap-2023/pages/syllabus/)
- [The Matrix Calculus You Need For Deep Learning](https://arxiv.org/abs/1802.01528)

## backward()

$$
\vec{a} = \begin{bmatrix}
           a_{1} \\
           a_{2} \\
     \end{bmatrix}
$$

$$
\vec{b} = \begin{bmatrix}
           b_{1} \\
           b_{2} \\
     \end{bmatrix}
$$

$$
L = \sum_{i=1}^{2} (3a_i^3 - b_i^2)
$$

$$
\frac{\partial L}{\partial \vec{a}} = \left[ \frac{\partial L}{\partial a_1} ; \frac{\partial L}{\partial a_2}\right] = \left[ 9a_1^2 ; 9a_2^2 \right]
$$

$$
\frac{\partial L}{\partial \vec{b}} = \left[ \frac{\partial L}{\partial b_1} ; \frac{\partial L}{\partial b_2}\right] = \left[ -2b_1 ; -2b_2 \right]
$$

---

If $\vec{a}^T=[2 ; 3]$ and $\vec{b}^T=[6 ; 4]$, then

$$
\frac{\partial L}{\partial \vec{a}} = \left[ 36;81 \right]
$$

$$
\frac{\partial L}{\partial \vec{b}} = \left[ -12;-8 \right]
$$

In [None]:
import torch

In [None]:
# Build a 1-D float tensor holding the example vector a = [2, 3].
a = torch.tensor([2., 3.])
# Print the shape (a single dimension of length 2).
print(a.shape)
# Last expression of the cell: displayed as the cell's output.
a

torch.Size([2])


tensor([2., 3.])

In [None]:
# Example inputs from the worked derivation: a = [2, 3], b = [6, 4].
# requires_grad is not passed here, so neither tensor tracks gradients yet.
a = torch.tensor([2., 3.])
b = torch.tensor([6., 4.])

In [None]:
# L = sum_i (3*a_i^3 - b_i^2): element-wise expression reduced to a scalar.
L = torch.sum((3*a**3 - b**2))
# Display the scalar value of L.
L

tensor(53.)

In [None]:
# Attempt to backpropagate from L. This raises the RuntimeError recorded in
# the cell output because L was built from tensors that do not require grad,
# so L has no grad_fn.
L.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
L.requires_grad

False

In [None]:
a.requires_grad, b.requires_grad

(False, False)

In [None]:
# Turn on gradient tracking for both inputs. Only tensors computed from a and
# b after this point get a grad_fn, so L has to be rebuilt before backward().
a.requires_grad = True
b.requires_grad = True

In [None]:
a.requires_grad, b.requires_grad

(True, True)

In [None]:
# Rebuild L now that a and b require grad; the previous L was created before
# tracking was enabled and reported requires_grad == False.
L = torch.sum((3*a**3 - b**2))
# Display L; the recorded output now carries grad_fn=<SumBackward0>.
L

tensor(53., grad_fn=<SumBackward0>)

In [None]:
L.requires_grad

True

In [None]:
# Backpropagate from the scalar L. This fills a.grad and b.grad, which the
# following cells inspect (expected 9*a^2 and -2*b from the derivation).
L.backward()

In [None]:
a.grad

tensor([36., 81.])

In [None]:
b.grad

tensor([-12.,  -8.])

## Optimizer

In [None]:
from torch.optim import SGD

In [None]:
# Fresh parameters for the SGD example, with the same values as a and b but
# created with gradient tracking enabled from the start.
c = torch.tensor([2., 3.], requires_grad=True)
d = torch.tensor([6., 4.], requires_grad=True)

In [None]:
# Keep copies of the initial values so the optimizer update can be checked by
# hand after step(). NOTE: clone() keeps the autograd history, which is why
# expressions built from c0/d0 later print with grad_fn=<SubBackward0>.
c0 = c.clone()
d0 = d.clone()

In [None]:
# Plain SGD over the two tensors with learning rate 0.1; every other option
# is left at the default shown in the recorded output of this cell.
lr = 0.1
optimizer = SGD([c, d], lr=lr)
# Last expression of the cell: displays the optimizer's parameter group.
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [None]:
# Same objective as L above, now expressed in the optimizer's parameters:
# loss = sum_i (3*c_i^3 - d_i^2).
loss = torch.sum((3*c**3 - d**2))
# Display the scalar loss.
loss

tensor(53., grad_fn=<SumBackward0>)

In [None]:
# No backward pass has run for c and d yet, so both .grad attributes are None.
print(c.grad)
print(d.grad)

None
None


In [None]:
# Backpropagate from the scalar loss to populate c.grad and d.grad.
loss.backward()

In [None]:
print(c.grad)
print(d.grad)

tensor([36., 81.])
tensor([-12.,  -8.])


In [None]:
# Apply one SGD update to c and d using the gradients currently stored in
# their .grad attributes (theta <- theta - lr * grad).
optimizer.step()

for t=1 to ... do
$$
g_t \leftarrow \nabla_\theta f_t(\theta_{t-1})
$$

$$
\theta_t \leftarrow \theta_{t-1} - \gamma g_t
$$

In [None]:
# Check the update for c: the optimizer's result should match the update
# computed by hand from the saved initial value, c0 - lr * grad.
print(c)
print(c0-lr*c.grad)

tensor([-1.6000, -5.1000], requires_grad=True)
tensor([-1.6000, -5.1000], grad_fn=<SubBackward0>)


In [None]:
# Same check for d: compare the optimizer's result with d0 - lr * grad.
print(d)
print(d0-lr*d.grad)

tensor([7.2000, 4.8000], requires_grad=True)
tensor([7.2000, 4.8000], grad_fn=<SubBackward0>)


In [None]:
from torch.optim import Adam

In [None]:
# Re-create c and d at their starting values for the Adam example; this
# discards the values produced by the SGD step above.
c = torch.tensor([2., 3.], requires_grad=True)
d = torch.tensor([6., 4.], requires_grad=True)

In [None]:
# Save the initial values (step t = 0) so Adam's first update can be
# reproduced by hand later.
c0 = c.clone()
d0 = d.clone()

In [None]:
# Adam over the same two tensors with learning rate 0.1; betas, eps and the
# remaining options stay at the defaults shown in the recorded output.
lr = 0.1
optimizer = Adam([c, d], lr=lr)
# Last expression of the cell: displays the optimizer's parameter group.
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    weight_decay: 0
)

In [None]:
# Objective for Adam step 1, evaluated at the initial c and d.
loss = torch.sum((3*c**3 - d**2))
# Display the scalar loss.
loss

tensor(53., grad_fn=<SumBackward0>)

In [None]:
# Backpropagate to obtain the step-1 gradients in c.grad and d.grad.
loss.backward()

In [None]:
print(c.grad)
print(d.grad)

tensor([36., 81.])
tensor([-12.,  -8.])


In [None]:
# First Adam update (t = 1). The recorded output of the next cell shows each
# element moving by lr = 0.1 against the sign of its gradient.
optimizer.step()

In [None]:
print(c)
print(d)

tensor([1.9000, 2.9000], requires_grad=True)
tensor([6.1000, 4.1000], requires_grad=True)


In [None]:
# Snapshot the parameters after the first Adam step; c1 is the starting point
# for the hand-computed second step further down.
c1 = c.clone()
d1 = d.clone()

for t=1 to ... do
$$
g_t \leftarrow \nabla_\theta f_t(\theta_{t-1})
$$

$$
m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1)g_t
$$

$$
v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2)g_t^2
$$

$$
\widehat{m_t} \leftarrow m_t/(1-\beta_1^t)
$$

$$
\widehat{v_t} \leftarrow v_t/(1-\beta_2^t)
$$


$$
\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/(\sqrt{\widehat{v_t}}+\epsilon)
$$

In [None]:
# Reproduce Adam's first update for c by hand, following the update rule
# written out above. The hyperparameters match the values printed for the
# optimizer (eps=1e-08, betas=(0.9, 0.999)).
eps = 1e-08
beta1 = 0.9
beta2 = 0.999

# Gradient of step 1 (c.grad has not been cleared or recomputed yet).
g1 = c.grad.clone()
# Both moment estimates start from zero.
m0 = 0
v0 = 0
# Exponential moving averages of the gradient and of the squared gradient.
m1 = beta1*m0 + (1-beta1)*g1
v1 = beta2*v0 + (1-beta2)*g1**2
# Bias correction for t = 1.
m1_hat = m1/(1-beta1**1)
v1_hat = v1/(1-beta2**1)

# Manual update from the saved initial value c0; the displayed result should
# match the value of c printed after optimizer.step().
c0 - lr*m1_hat/(torch.sqrt(v1_hat)+eps)

tensor([1.9000, 2.9000], grad_fn=<SubBackward0>)

In [None]:
# Clear the gradients of c and d before the next backward pass so the step-2
# gradients are not added on top of the step-1 gradients. In this run the
# .grad attributes are reset to None (see the next cell's output).
optimizer.zero_grad()

In [None]:
# Confirm that zero_grad() cleared both gradients.
print(c.grad)
print(d.grad)

None
None


In [None]:
# Re-evaluate the same objective at the parameters produced by Adam step 1.
loss = torch.sum((3*c**3 - d**2))
# Display the new scalar loss.
loss

tensor(39.7240, grad_fn=<SumBackward0>)

In [None]:
# Backpropagate again to obtain the step-2 gradients at the updated c and d.
loss.backward()

In [None]:
print(c.grad)
print(d.grad)

tensor([32.4900, 75.6900])
tensor([-12.2000,  -8.2000])


In [None]:
# Second Adam update (t = 2), using the freshly computed gradients together
# with the moment estimates the optimizer kept from step 1.
optimizer.step()

In [None]:
print(c)
print(d)

tensor([1.8004, 2.8002], requires_grad=True)
tensor([6.2000, 4.2001], requires_grad=True)


In [None]:
# Reproduce Adam's second update for c by hand. Relies on m1, v1, beta1,
# beta2, eps and lr from the manual step-1 cell and on c1, the copy of c
# taken after the first optimizer step.
g2 = c.grad.clone()
# Update the moving averages starting from the step-1 estimates.
m2 = beta1*m1 + (1-beta1)*g2
v2 = beta2*v1 + (1-beta2)*g2**2
# Bias correction for t = 2.
m2_hat = m2/(1-beta1**2)
v2_hat = v2/(1-beta2**2)

# Manual update from c1; the displayed result should match the value of c
# printed after the second optimizer.step().
c1 - lr*m2_hat/(torch.sqrt(v2_hat)+eps)

tensor([1.8004, 2.8002], grad_fn=<SubBackward0>)