- The autograd package provides automatic differentiation for all operations on Tensors

In [15]:
import torch
torch.manual_seed(123)

<torch._C.Generator at 0x24a11f11070>

In [26]:
# requires_grad = True -> tracks all operations on the tensor. 
x = torch.randn(3, requires_grad=True)
y = x + 2

In [17]:
# y was created as a result of an operation, so it has a grad_fn attribute.
# grad_fn: references a Function that has created the Tensor
print(x) # created by the user -> grad_fn is None
print(y)

tensor([-0.1115,  0.1204, -0.3696], requires_grad=True)
tensor([1.8885, 2.1204, 1.6304], grad_fn=<AddBackward0>)


In [18]:
print(y.grad_fn)

<AddBackward0 object at 0x0000024A15D222C0>


In [19]:
# y.backward()

In [20]:
# Do more operations on y
z = y * y * 3
print(z)
z = z.mean()
print(z)

tensor([10.6997, 13.4878,  7.9743], grad_fn=<MulBackward0>)
tensor(10.7206, grad_fn=<MeanBackward0>)


In [21]:
# Let's compute the gradients with backpropagation
# When we finish our computation we can call .backward() and have all the gradients computed automatically.
# The gradient for this tensor will be accumulated into .grad attribute.
# It is the partial derivate of the function w.r.t. the tensor
z.backward()
print(x.grad) # dz/dx

tensor([3.7771, 4.2407, 3.2607])


In [None]:
# -------------
# Model with non-scalar output:
# If a Tensor is non-scalar (more than 1 elements), we need to specify arguments for backward() 
# specify a gradient argument that is a tensor of matching shape.
# needed for vector-Jacobian product
# x is a tensor of shape (3,) with random values.
# requires_grad=True means PyTorch will track all operations involving x to build the computation graph, allowing gradients to be computed during backpropagation.
x = torch.randn(3, requires_grad=True)
# A new tensor y is created by element-wise multiplication of x with 2.
# The computation graph now includes the operation 
# 𝑦 = 2𝑥

y = x * 2
# This loop multiplies y by 2 in each iteration, 10 times in total.
# After the loop, 𝑦 becomes: 𝑦 = 2^11*𝑥
# because 𝑥 was first multiplied by 2 (outside the loop), then 10 additional multiplications were performed in the loop.
for _ in range(10):
    y = y * 2

In [30]:
print(y)
print(y.shape)

tensor([-1991.3831, -1546.3331,   663.3528], grad_fn=<MulBackward0>)
torch.Size([3])


In [None]:
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
# y.backward(v) computes the vector-Jacobian product, which allows you to specify how the gradient is weighted along the output tensor y.
# v is a vector of the same shape as y (i.e., (3,)), and it scales the gradients for each element of y.
y.backward(v)
print(x.grad)

tensor([2.0480e+02, 2.0480e+03, 2.0480e-01])


In [32]:
# Step 5: Gradient Computation
# y with respect to x:
# From the computation graph, we know 𝑦=2^11*𝑥 so the derivative of y with respect to x is:
# dy/dx = 2^11
# The gradient at each element of x is scaled by 
# 𝑣 during the backward pass.
# Scaling by v:
# The gradient of y is multiplied element-wise by the vector v during backpropagation.
# Final Gradient for x:
# For each element 
#  the gradient becomes: grad xi = vi*2^11
x = torch.tensor([1.0, -2.0, 3.0], requires_grad=True)
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
# After the loop, 𝑦=2^11*𝑥
2**11*x


tensor([ 2048., -4096.,  6144.], grad_fn=<MulBackward0>)

In [33]:
# Derivative of 𝑦 with respect to x
# Multiply by v, x*grad = v*2**1*x
2**11*x*v

tensor([ 2.0480e+02, -4.0960e+03,  6.1440e-01], grad_fn=<MulBackward0>)

In [34]:
# -------------
# Stop a tensor from tracking history:
# For example during our training loop when we want to update our weights
# then this update operation should not be part of the gradient computation
# - x.requires_grad_(False)
# - x.detach()
# - wrap in 'with torch.no_grad():'

In [35]:
# .requires_grad_(...) changes an existing flag in-place.
a = torch.randn(2, 2)
print(a.requires_grad)
b = ((a * 3) / (a - 1))
print(b.grad_fn)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x0000024A189BB340>


In [36]:
# .detach(): get a new Tensor with the same content but no gradient computation:
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
b = a.detach()
print(b.requires_grad)

True
False


In [39]:
# wrap in 'with torch.no_grad():'
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    print((x ** 2).requires_grad)

print(a)

True
False
tensor([[-0.4087,  1.0764],
        [-0.4015, -0.7291]], requires_grad=True)


In [42]:
# -------------
# backward() accumulates the gradient for this tensor into .grad attribute.
# !!! We need to be careful during optimization !!!
# Use .zero_() to empty the gradients before a new optimization step!
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

tensor([3., 3., 3., 3.])
tensor([6., 6., 6., 6.])
tensor([9., 9., 9., 9.])


In [45]:
weights = torch.ones(4, requires_grad=True)
for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

    # optimize model, i.e. adjust weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad
    print(weights)

tensor([3., 3., 3., 3.])
tensor([0.7000, 0.7000, 0.7000, 0.7000], requires_grad=True)
tensor([6., 6., 6., 6.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor([9., 9., 9., 9.])
tensor([-0.8000, -0.8000, -0.8000, -0.8000], requires_grad=True)


In [46]:
for epoch in range(3):
    # just a dummy example
    model_output = (weights*3).sum()
    model_output.backward()
    
    print(weights.grad)

    # optimize model, i.e. adjust weights...
    with torch.no_grad():
        weights -= 0.1 * weights.grad

    # this is important! It affects the final weights & output
    weights.grad.zero_()

print(weights)
print(model_output)

tensor([12., 12., 12., 12.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([-2.6000, -2.6000, -2.6000, -2.6000], requires_grad=True)
tensor(-27.6000, grad_fn=<SumBackward0>)
