# PyTorch

## Automatic Differentiation

$$
\vec{a} = \begin{bmatrix}
           a_{1} \\
           a_{2} \\
     \end{bmatrix}
$$

$$
\vec{b} = \begin{bmatrix}
           b_{1} \\
           b_{2} \\
     \end{bmatrix}
$$

$$
L = \sum_{i=1}^{2} (3a_i^3 - b_i^2)
$$

$$
\frac{\partial L}{\partial \vec{a}} = \left[ \frac{\partial L}{\partial a_1} ; \frac{\partial L}{\partial a_2}\right] = \left[ 9a_1^2 ; 9a_2^2 \right]
$$

$$
\frac{\partial L}{\partial \vec{b}} = \left[ \frac{\partial L}{\partial b_1} ; \frac{\partial L}{\partial b_2}\right] = \left[ -2b_1 ; -2b_2 \right]
$$

---

If $\vec{a}^T=[2 ; 3]$ and $\vec{b}^T=[6 ; 4]$, then

$$
\frac{\partial L}{\partial \vec{a}} = \left[ 36;81 \right]
$$

$$
\frac{\partial L}{\partial \vec{b}} = \left[ -12;-8 \right]
$$

In [None]:
import torch

In [None]:
def dLda(a):
    """Analytic gradient of L = sum_i (3*a_i**3 - b_i**2) with respect to a.

    Args:
        a: Tensor (or sequence of numbers) holding the entries of a; any length.

    Returns:
        Tensor with entries 9*a_i**2, detached from autograd.
    """
    # detach(): this is a reference value to compare against autograd's result,
    # so it must not be recorded in the graph itself.
    return 9 * torch.as_tensor(a).detach() ** 2


def dLdb(b):
    """Analytic gradient of L = sum_i (3*a_i**3 - b_i**2) with respect to b.

    Args:
        b: Tensor (or sequence of numbers) holding the entries of b; any length.

    Returns:
        Tensor with entries -2*b_i, detached from autograd.
    """
    return -2 * torch.as_tensor(b).detach()

In [None]:
# Two-element inputs; at this point neither tensor tracks gradients.
a, b = torch.tensor([2., 3.]), torch.tensor([6., 4.])

# Scalar loss L = sum_i (3*a_i^3 - b_i^2).
L = (3 * a**3 - b**2).sum()
L

tensor(53.)

https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html

In [None]:
try:
    L.backward()
except RuntimeError as e:
    # Expected failure: L was built from tensors with requires_grad=False,
    # so there is no graph to backpropagate through. Only this autograd
    # error is caught; anything else should still surface.
    print(e)

element 0 of tensors does not require grad and does not have a grad_fn


In [None]:
L.requires_grad

False

In [None]:
print(a.requires_grad)
print(b.requires_grad)

False
False


In [None]:
# Switch on gradient tracking in place for both input tensors.
for leaf in (a, b):
    leaf.requires_grad_(True)

In [None]:
print(a.requires_grad)
print(b.requires_grad)

True
True


In [None]:
# Rebuild the loss now that a and b track gradients; the earlier L carried no graph.
L = (3 * a**3 - b**2).sum()
L

tensor(53., grad_fn=<SumBackward0>)

In [None]:
L.requires_grad

True

In [None]:
L.backward()

In [None]:
print(a.grad)
print(b.grad)

tensor([36., 81.])
tensor([-12.,  -8.])


In [None]:
print(dLda(a))
print(dLdb(b))

tensor([36., 81.])
tensor([-12.,  -8.])


## Optimizer

### SGD

In [None]:
from torch.optim import SGD

In [None]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

In [None]:
# Keep copies of the starting values: optimizer.step() overwrites a and b,
# and the originals are needed later to reproduce the update by hand.
a0, b0 = a.clone(), b.clone()

In [None]:
lr = 0.005
optimizer = SGD([a, b], lr=lr)
optimizer

SGD (
Parameter Group 0
    dampening: 0
    differentiable: False
    foreach: None
    fused: None
    lr: 0.005
    maximize: False
    momentum: 0
    nesterov: False
    weight_decay: 0
)

In [None]:
loss = torch.sum((3*a**3 - b**2))
loss

tensor(53., grad_fn=<SumBackward0>)

In [None]:
# backward() has not been called for these tensors yet, so no gradient is stored.
print(a.grad)
print(b.grad)

None
None


In [None]:
loss.backward()

In [None]:
print(a.grad)
print(b.grad)

tensor([36., 81.])
tensor([-12.,  -8.])


In [None]:
print(a)
print(b)

tensor([2., 3.], requires_grad=True)
tensor([6., 4.], requires_grad=True)


In [None]:
optimizer.step()

In [None]:
print(a)
print(b)

tensor([1.8200, 2.5950], requires_grad=True)
tensor([6.0600, 4.0400], requires_grad=True)


https://pytorch.org/docs/stable/generated/torch.optim.SGD.html

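For $t = 1, 2, \ldots$ do (here $\theta$ stands for the parameters `a` and `b`, and $\gamma$ is the learning rate `lr`):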
$$
g_t \leftarrow \nabla_\theta f_t(\theta_{t-1})
$$

$$
\theta_t \leftarrow \theta_{t-1} - \gamma g_t
$$

In [None]:
# Redo the SGD update by hand from the saved starting value: a0 - lr * grad.
print(a)
print(a0-lr*a.grad)

tensor([1.8200, 2.5950], requires_grad=True)
tensor([1.8200, 2.5950], grad_fn=<SubBackward0>)


In [None]:
print(b)
print(b0-lr*b.grad)

tensor([6.0600, 4.0400], requires_grad=True)
tensor([6.0600, 4.0400], grad_fn=<SubBackward0>)


In [None]:
loss = torch.sum((3*a**3 - b**2))
loss

tensor(17.4649, grad_fn=<SumBackward0>)

In [None]:
# .grad was not cleared after the first backward(), so this call adds the new
# gradients to the ones already stored instead of replacing them.
loss.backward()

In [None]:
print(a.grad)
print(b.grad)

tensor([ 65.8116, 141.6062])
tensor([-24.1200, -16.0800])


In [None]:
print(dLda(a))
print(dLdb(b))

tensor([29.8116, 60.6062])
tensor([-12.1200,  -8.0800])


In [None]:
# Remove the first-pass gradients (taken at a0, b0) from the accumulated .grad;
# what remains is the gradient at the current a, b.
print(a.grad - dLda(a0))
print(b.grad - dLdb(b0))

tensor([29.8116, 60.6062])
tensor([-12.1200,  -8.0800])


https://pytorch.org/docs/stable/generated/torch.Tensor.backward.html

This function accumulates gradients in the leaves - you might need to zero `.grad` attributes or set them to `None` before calling it.

In [None]:
def backward_and_report(a, b):
    """Evaluate the loss at (a, b), backpropagate, and print the results.

    Prints the loss, then the autograd gradients stored in a.grad / b.grad,
    then the analytic gradients from dLda / dLdb for comparison.

    Args:
        a: Tensor with requires_grad=True.
        b: Tensor with requires_grad=True.

    Returns:
        The scalar loss tensor.
    """
    L = torch.sum(3 * a**3 - b**2)
    L.backward()
    print(L)
    print()
    print(a.grad)
    print(b.grad)
    print()
    print(dLda(a))
    print(dLdb(b))
    return L


a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

lr = 0.005
optimizer = SGD([a, b], lr=lr)

# 1) Fresh tensors: no gradients stored yet; the first backward pass fills them in.
print('===============')
print(a.grad)
print(b.grad)
L = backward_and_report(a, b)

# 2) Apply the SGD update to a and b.
print('===============')
optimizer.step()
print(a)
print(b)

# 3) Clear the stored gradients so the next backward pass does not accumulate onto them.
print('===============')
optimizer.zero_grad()
print(a.grad)
print(b.grad)

# 4) Second backward pass: autograd and analytic gradients agree again.
print('===============')
L = backward_and_report(a, b)

None
None
tensor(53., grad_fn=<SumBackward0>)

tensor([36., 81.])
tensor([-12.,  -8.])

tensor([36., 81.])
tensor([-12.,  -8.])
tensor([1.8200, 2.5950], requires_grad=True)
tensor([6.0600, 4.0400], requires_grad=True)
None
None
tensor(17.4649, grad_fn=<SumBackward0>)

tensor([29.8116, 60.6062])
tensor([-12.1200,  -8.0800])

tensor([29.8116, 60.6062])
tensor([-12.1200,  -8.0800])


In [None]:
L.item()

17.464889526367188

In [None]:
type(L)

torch.Tensor

In [None]:
type(L.item())

float

### Adam

In [None]:
from torch.optim import Adam

In [None]:
a = torch.tensor([2., 3.], requires_grad=True)
b = torch.tensor([6., 4.], requires_grad=True)

a0 = a.clone()
b0 = b.clone()

lr = 0.1
optimizer = Adam([a, b], lr=lr)
optimizer

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.1
    maximize: False
    weight_decay: 0
)

In [None]:
loss = torch.sum((3*a**3 - b**2))
loss

tensor(53., grad_fn=<SumBackward0>)

In [None]:
print(a.grad)
print(b.grad)

None
None


In [None]:
loss.backward()

In [None]:
print(a.grad)
print(b.grad)

tensor([36., 81.])
tensor([-12.,  -8.])


In [None]:
print(a)
print(b)

tensor([2., 3.], requires_grad=True)
tensor([6., 4.], requires_grad=True)


In [None]:
optimizer.step()

In [None]:
print(a)
print(b)

tensor([1.9000, 2.9000], requires_grad=True)
tensor([6.1000, 4.1000], requires_grad=True)


https://pytorch.org/docs/stable/generated/torch.optim.Adam.html

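For $t = 1, 2, \ldots$ do (here $\theta$ stands for the parameters, $\gamma$ is the learning rate `lr`, $\beta_1, \beta_2$ are the `betas`, $\epsilon$ is `eps`, and $m_0 = v_0 = 0$):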
$$
g_t \leftarrow \nabla_\theta f_t(\theta_{t-1})
$$

$$
m_t \leftarrow \beta_1 m_{t-1} + (1-\beta_1)g_t
$$

$$
v_t \leftarrow \beta_2 v_{t-1} + (1-\beta_2)g_t^2
$$

$$
\widehat{m_t} \leftarrow m_t/(1-\beta_1^t)
$$

$$
\widehat{v_t} \leftarrow v_t/(1-\beta_2^t)
$$


$$
\theta_t \leftarrow \theta_{t-1} - \gamma \widehat{m_t}/(\sqrt{\widehat{v_t}}+\epsilon)
$$

In [None]:
# Adam hyperparameters, matching the values shown in the optimizer repr.
beta1, beta2 = 0.9, 0.999
eps = 1e-08

# Step t = 1: both moment estimates start from zero.
m0, v0 = 0, 0
g1 = a.grad.clone()
m1 = beta1 * m0 + (1 - beta1) * g1
v1 = beta2 * v0 + (1 - beta2) * g1**2

# Bias correction with exponent t = 1.
m1_hat = m1 / (1 - beta1**1)
v1_hat = v1 / (1 - beta2**1)

# The hand-computed update from the saved starting point a0 should match
# the value of a produced by optimizer.step().
print(a0 - lr * m1_hat / (v1_hat.sqrt() + eps))
print(a)

tensor([1.9000, 2.9000], grad_fn=<SubBackward0>)
tensor([1.9000, 2.9000], requires_grad=True)


In [None]:
print(a.grad)

tensor([36., 81.])


In [None]:
optimizer.zero_grad()

In [None]:
print(a.grad)

None


In [None]:
loss = torch.sum((3*a**3 - b**2))
loss

tensor(39.7240, grad_fn=<SumBackward0>)

In [None]:
loss.backward()

In [None]:
print(a.grad)

tensor([32.4900, 75.6900])


In [None]:
print(a)

tensor([1.9000, 2.9000], requires_grad=True)


In [None]:
# Save a before the second optimizer.step() so that update can be reproduced by hand afterwards.
a1 = a.clone()

In [None]:
optimizer.step()

In [None]:
print(a)

tensor([1.8004, 2.8002], requires_grad=True)


In [None]:
# Step t = 2: continue from the first-step moments m1 and v1.
g2 = a.grad.clone()
m2 = beta1 * m1 + (1 - beta1) * g2
v2 = beta2 * v1 + (1 - beta2) * g2**2

# Bias correction now uses exponent t = 2.
m2_hat = m2 / (1 - beta1**2)
v2_hat = v2 / (1 - beta2**2)

# Apply the update to a1, the value of a saved just before the second optimizer.step().
print(a1 - lr * m2_hat / (v2_hat.sqrt() + eps))
print(a)

tensor([1.8004, 2.8002], grad_fn=<SubBackward0>)
tensor([1.8004, 2.8002], requires_grad=True)
