In [1]:
import torch

- The autograd package provides automatic differentiation for all operations on Tensors

In [2]:
# Setting requires_grad=True makes autograd record every operation applied to x.
x = torch.randn(3, requires_grad=True)
# y is derived from x, so it becomes part of the recorded history.
y = x.add(2)

In [3]:
# x is a leaf created directly by the user, so its grad_fn is None.
# y is the result of an operation, so its grad_fn references the Function
# that created it.
for shown in (x, y, y.grad_fn):
    print(shown)

tensor([-1.0429, -1.6254,  0.4558], requires_grad=True)
tensor([0.9571, 0.3746, 2.4558], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x0000027A56DE7BE0>


In [6]:
# Apply further operations to y; every result carries its own grad_fn.
z_elementwise = y * y * 3
print(z_elementwise)
# Reduce to a scalar so backward() can later be called without arguments.
z = z_elementwise.mean()
print(z)

tensor([ 2.7482,  0.4209, 18.0934], grad_fn=<MulBackward0>)
tensor(7.0875, grad_fn=<MeanBackward0>)


- Let's compute the gradients with backpropagation
- When we finish our computation we can call .backward() and have all the gradients computed automatically.
- The gradient for this tensor will be accumulated into the .grad attribute.
- It is the partial derivative of the function w.r.t. the tensor

In [7]:
# Backpropagate from the scalar z; the gradient is accumulated into x.grad.
z.backward()
# With y = x + 2 and z = mean(3 * y * y) over 3 elements, dz/dx_i = 2 * y_i.
print(x.grad) # dz/dx

tensor([1.9142, 0.7491, 4.9117])


- Generally speaking, torch.autograd is an engine for computing vector-Jacobian products
- It computes partial derivatives while applying the chain rule

In [8]:
# Same computation as before, but the input does not track gradients:
# none of the results below carries a grad_fn.
x = torch.randn(3, requires_grad=False)
print(x)
y = x.add(2)
print(y)
z_elementwise = y * y * 3
print(z_elementwise)
z = z_elementwise.mean()
print(z)

tensor([-0.2425, -0.1758, -0.7688])
tensor([1.7575, 1.8242, 1.2312])
tensor([9.2660, 9.9826, 4.5474])
tensor(7.9320)


In [None]:
# z was computed from a tensor created with requires_grad=False, so it has no
# grad_fn and backward() raises a RuntimeError. Catch and display the error
# (it is the point of this cell) so the notebook still runs top to bottom.
try:
    z.backward()
except RuntimeError as err:
    print(f"RuntimeError: {err}")

### Model with non-scalar output:
- If a Tensor is non-scalar (has more than one element), we need to specify arguments for backward()
- Specify a gradient argument that is a tensor of matching shape.
- This argument is needed for the vector-Jacobian product.

In [10]:
x = torch.randn(3, requires_grad=True)

# Double x eleven times in total, i.e. y = x * 2**11; y stays a 3-element vector.
y = x
for _ in range(11):
    y = y * 2

print(y, y.shape, sep="\n")

tensor([-1052.9594, -3035.4553,  4437.3286], grad_fn=<MulBackward0>)
torch.Size([3])


In [11]:
# y is not a scalar, so backward() must be given a tensor of matching shape;
# autograd then computes the vector-Jacobian product with v.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


### Stop a tensor from tracking history:
- For example during our training loop when we want to update our weights
- then this update operation should not be part of the gradient computation

In [12]:
# There are 3 options to stop tracking history
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

In [13]:
# requires_grad_(...) flips the flag on the existing tensor in place.
a = torch.randn(3, requires_grad=True)
print(a, a.requires_grad, sep="\n")
a.requires_grad_(False)
print(a, a.requires_grad, sep="\n")

tensor([ 1.4354, -1.1359, -0.4562], requires_grad=True)
True
tensor([ 1.4354, -1.1359, -0.4562])
False


In [14]:
# A freshly created tensor does not track history unless asked to.
a = torch.randn(2, 2)
print(a.requires_grad)
# Computed before tracking is enabled, so no grad_fn is attached.
untracked = a * 3 / (a - 1)
print(untracked.grad_fn)
# After flipping the flag in place, new results are tracked.
a.requires_grad_(True)
print(a.requires_grad)
b = a.mul(a).sum()
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x0000027A58E01AF0>


In [15]:
# detach() returns a new tensor with the same values that is excluded from
# gradient computation.
x = torch.randn(2, 2, requires_grad=True)
print(x, x.requires_grad, sep="\n")
y = x.detach()
print(y, y.requires_grad, sep="\n")

tensor([[-1.4638, -0.6436],
        [-0.9414,  0.9236]], requires_grad=True)
True
tensor([[-1.4638, -0.6436],
        [-0.9414,  0.9236]])
False


In [16]:
# Operations executed inside a torch.no_grad() block are not tracked,
# even when their input requires gradients.
x = torch.randn(2, 2, requires_grad=True)
print(x, x.requires_grad, sep="\n")
with torch.no_grad():
    print(x.pow(2).requires_grad)

tensor([[ 0.5258,  1.4379],
        [ 0.4293, -0.7375]], requires_grad=True)
True
False


In [18]:
# Outside the no_grad block, operations on x are tracked again.
y = x.add(2)
print(y, y.requires_grad, sep="\n")

tensor([[2.5258, 3.4379],
        [2.4293, 1.2625]], grad_fn=<AddBackward0>)
True


### -------------
- backward() accumulates the gradient for this tensor into the .grad attribute.
- !!! We need to be careful during optimization !!!
- Use .grad.zero_() to reset the gradients before a new optimization step!

In [19]:
# A single backward pass: d/dw of sum(3 * w) is 3 for every weight.
weights = torch.ones(4, requires_grad=True)

for epoch in range(1):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])


In [20]:
# Two backward passes without resetting: the second gradient is added
# on top of the first one in weights.grad.
weights = torch.ones(4, requires_grad=True)

for epoch in range(2):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])


In [21]:
# Three backward passes without resetting: weights.grad keeps growing
# by 3 per pass.
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [22]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

    # Reset the accumulated gradient so the next pass starts from zero;
    # skipping this changes the final weights & output.
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


In [23]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

    # Gradient-descent step; no_grad keeps the update itself untracked.
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # weights.grad.zero_() is deliberately left out in this cell, so the
    # gradient keeps accumulating and affects the final weights & output.

print(weights, model_output, sep="\n")

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])
tensor([-0.8000, -0.8000, -0.8000, -0.8000], requires_grad=True)
tensor(1.2000, grad_fn=<SumBackward0>)


In [24]:
weights = torch.ones(4, requires_grad=True)

for epoch in range(3):
    # Dummy forward pass standing in for a real model.
    model_output = weights.mul(3).sum()
    model_output.backward()

    print(weights.grad)

    # Gradient-descent step; no_grad keeps the update itself untracked.
    with torch.no_grad():
        weights.sub_(0.1 * weights.grad)

    # Reset the gradient before the next pass; without this the gradients
    # accumulate and the final weights & output come out differently.
    weights.grad.zero_()

print(weights, model_output, sep="\n")

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


In [None]:
# Optimizer has zero_grad() method
# optimizer = torch.optim.SGD([weights], lr=0.1)
# During training:
# optimizer.step()
# optimizer.zero_grad()