In [1]:
import torch

In [2]:
# Draw three samples from a standard normal distribution; autograd does not track this tensor.
x = torch.randn((3,))

In [3]:
# Inspect the values; the repr carries no requires_grad flag, so nothing is being tracked.
x

tensor([0.0349, 0.9773, 0.7097])

In [5]:
# Re-create x as a leaf tensor that autograd tracks, so gradients can later be read from x.grad.
x = torch.randn((3,), requires_grad=True)

In [6]:
# The repr now reports requires_grad=True, confirming that autograd tracks this tensor.
x

tensor([ 0.3224,  1.0536, -0.1895], requires_grad=True)

In [7]:
# Operations on a tracked tensor are recorded; the result remembers the addition that produced it.
y = torch.add(x, 2)

In [8]:
# grad_fn in the repr names the operation that created y (here, the addition).
y

tensor([2.3224, 3.0536, 1.8105], grad_fn=<AddBackward0>)

In [17]:
# Element-wise 2 * y**2; the result is still a vector with one entry per element of y.
z = 2 * (y * y)
z

tensor([10.7874, 18.6489,  6.5561], grad_fn=<MulBackward0>)

In [18]:
# Collapse the vector to its mean so that z becomes a scalar and backward() needs no arguments.
z = torch.mean(z)
z

tensor(11.9975, grad_fn=<MeanBackward0>)

In [19]:
# Back-propagate from the scalar z; this writes dz/dx into x.grad,
# adding to any gradient that is already stored there.
z.backward()


In [21]:
# For z = mean(2 * y**2) the gradient dz/dx is 4 * y / 3 per element.
# NOTE(review): the recorded values are twice that, which suggests backward() had
# already run once before this output was captured (gradients accumulate) —
# confirm by re-running on a fresh kernel.
x.grad

tensor([6.1932, 8.1429, 4.8281])

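`backward()` can be called without arguments only on a scalar output, so what if z is not a scalar?
In that case we need to pass a `gradient` vector with the same shape as z; autograd then computes the vector-Jacobian product (v^T · dz/dx) instead of the full Jacobian. Here z has the same shape as x, so the vector also matches the size of x.
Wish I remembered more from my linear algebra class.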

In [24]:
# z is a vector again, so backward() needs a `gradient` tensor with z's shape;
# autograd then computes the vector-Jacobian product v^T · dz/dx.
v = torch.tensor([0.1, 1, 0.001], dtype=torch.float32)
z = 2 * (y * y)
z.backward(gradient=v)
# x.grad was not reset beforehand, so the value shown is the previously stored
# gradient plus this new contribution.
x.grad

tensor([ 7.1221, 20.3573,  4.8354])

Ways to disable the auto grad:

In [25]:
# Option 1 - switch tracking off on the tensor itself, in place
# (the trailing underscore marks an in-place operation):
# x.requires_grad_(False)
# Option 2 - get a new tensor that shares the data but is cut off from the autograd graph:
# x.detach()

Or use the 'with' statement with torch.no_grad()

In [27]:
# Operations executed inside this context manager are not recorded by autograd:
# with torch.no_grad():
#     y = x + 2  # y would have no grad_fn

Grad values are accumulative: each call to backward() adds to the existing .grad instead of overwriting it.

In [32]:
# Four tracked weights, all initialised to one.
weights = torch.ones((4,), requires_grad=True)
weights

tensor([1., 1., 1., 1.], requires_grad=True)

In [35]:
# No reset between iterations: every backward() adds 3 per weight on top of whatever
# weights.grad already holds, so the printed gradient keeps growing.
for epoch in range(3):
    model_output = torch.sum(3 * weights)
    model_output.backward()
    print(weights.grad)

tensor([15., 15., 15., 15.])
tensor([18., 18., 18., 18.])
tensor([21., 21., 21., 21.])


So the key thing to remember is to zero out the grad attribute when you want fresh grad values.
Note that the syntax to zero it out looks funny: `grad` is an attribute that holds a tensor, and `zero_()` is an in-place method called on that tensor.

In [36]:
# Reset the accumulated gradient at the end of every iteration so the next backward()
# starts from zero. The first line printed still contains what was accumulated before
# this cell ran, because the reset only happens after the print.
for epoch in range(3):
    model_output = torch.sum(3 * weights)
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([24., 24., 24., 24.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
