In [1]:
# https://github.com/patrickloeber/pytorchTutorial/blob/master/03_autograd.py

import torch

In [2]:
# torch.autograd implements automatic differentiation for Tensor operations.

# A tensor created with requires_grad=True has every operation applied to it
# recorded, which is what later makes gradient computation possible.
x = torch.randn((3,), requires_grad=True)
y = torch.add(x, 2)
y

tensor([3.2765, 1.6360, 1.0300], grad_fn=<AddBackward0>)

In [3]:
# y was created as the result of an operation, so it has a grad_fn attribute.
# grad_fn: references the Function that created the Tensor
print(x) # created by the user -> grad_fn is None
print(y)
print(y.grad_fn)

tensor([ 1.2765, -0.3640, -0.9700], requires_grad=True)
tensor([3.2765, 1.6360, 1.0300], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x000002A93A3BAD40>


In [4]:
# Extend the graph on top of y: square element-wise and scale by 3,
# then reduce to a single scalar with mean().
z_per_element = y * y * 3
print(z_per_element)
z = z_per_element.mean()
z

tensor([32.2070,  8.0299,  3.1824], grad_fn=<MulBackward0>)


tensor(14.4731, grad_fn=<MeanBackward0>)

In [5]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient is accumulated into the .grad attribute of the tensor it is taken w.r.t. (here x)
# It is the partial derivative of the function w.r.t. the tensor

z.backward()
x.grad # dz/dx

tensor([6.5531, 3.2721, 2.0599])

In [6]:
# In general, torch.autograd is an engine for computing vector-Jacobian products:
# it obtains partial derivatives by applying the chain rule.

# Model with non-scalar output:
# when a Tensor has more than one element, backward() cannot be called bare.
# It needs a `gradient` argument -- a tensor of matching shape that plays the
# role of the vector in the vector-Jacobian product.

x = torch.randn(3, requires_grad=True)

# one doubling, then ten more: overall y = x * 2**11
y = 2 * x
for _ in range(10):
    y = 2 * y

print(y, y.shape, sep="\n")

v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
x.grad

tensor([  599.3216, -1282.8489,   319.4081], grad_fn=<MulBackward0>)
torch.Size([3])


tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])

In [7]:
# How to stop a tensor from tracking history.
# Typical case: in a training loop the weight update itself must not become
# part of the gradient computation. Options:
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

# .requires_grad_(...) changes the flag of an existing tensor in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = a * 3 / (a - 1)
print(b.grad_fn)

a.requires_grad_()  # the argument defaults to True
print(a.requires_grad)
b = torch.sum(a * a)
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x000002A9426D5840>


In [8]:
# 'with torch.no_grad():' -- operations executed inside the block are not tracked,
# so their results have requires_grad=False even though the input requires grad.
# (a.detach() is the alternative: it returns a new Tensor with the same content
# but no gradient computation.)
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    # operate on `a`, the tensor created in this cell, so the cell is self-contained
    print((a ** 2).requires_grad)

True
False


In [9]:
# backward() adds to whatever is already stored in .grad -- gradients accumulate.
# !!! Be careful with this during optimization !!!
# Reset them with .zero_() before starting a new optimization step!
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # stand-in for a real forward pass
    model_output = torch.sum(weights * 3)
    model_output.backward()

    print(weights.grad)

    # optimizer step: adjust the weights without recording the update in the graph
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # essential: without this reset the gradients pile up across epochs,
    # which changes the final weights & output
    weights.grad.zero_()

print(weights, model_output, sep="\n")

# Optimizers provide a zero_grad() method for the same purpose
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)
