In [1]:
## Gradients are essential for model optimization

In [2]:
import torch

In [3]:
# Draw a 1-D tensor of three samples from the standard normal distribution.
a = torch.randn((3,))
print(a)

tensor([ 0.1738, -0.4091,  2.1841])


In [4]:
# Draw a 1-D tensor of three samples from the uniform distribution on [0, 1).
b = torch.rand((3,))
print(b)

tensor([0.6425, 0.2787, 0.4399])


rand() returns random values in the half-open interval [0, 1). The values follow a uniform distribution, so their expected mean is 0.5.

randn() returns random values that can lie anywhere between -infinity and +infinity. The values follow a normal distribution with mean 0 and standard deviation 1.

In [5]:
# requires_grad defaults to False; enabling it makes autograd track
# every operation performed on x.
x = torch.randn((3,), requires_grad=True)
print(x)

tensor([ 0.7121, -0.2480, -1.8401], requires_grad=True)


In [6]:
# Element-wise addition; because x tracks gradients, the result records an
# add node (grad_fn) in the autograd graph.
y = torch.add(x, 2)
print(y)

tensor([2.7121, 1.7520, 0.1599], grad_fn=<AddBackward0>)


In [7]:
# Adding the untracked tensor a to the tracked tensor y: the result is still
# part of the autograd graph because one operand requires gradients.
z_add = torch.add(a, y)
z_add

tensor([2.8858, 1.3429, 2.3440], grad_fn=<AddBackward0>)

In [8]:
# Element-wise y * y, then scaled by 2 (same left-to-right order as y*y*2).
z_mul = torch.mul(torch.mul(y, y), 2)
print(z_mul)

tensor([14.7107,  6.1389,  0.0511], grad_fn=<MulBackward0>)


In [10]:
# Reduce y to a single scalar so that backward() can be called without arguments.
z_mean = torch.mean(y)
print(z_mean)

tensor(1.5413, grad_fn=<MeanBackward0>)


#### Calculate the gradient
##### If requires_grad is set to False, calling .backward() raises an error
##### Behind the scenes, .backward() computes vector-Jacobian products (the chain rule) to obtain the final gradients.
#### NB: grad can be implicitly created only for scalar outputs; for a non-scalar output, pass a gradient tensor of the same shape to .backward()

In [11]:
# Backpropagate from the scalar z_mean; autograd stores d(z_mean)/dx in x.grad.
z_mean.backward() #dz_mean/dx

In [12]:
# After backward(), x.grad holds d(z_mean)/dx for every element of x.
# z_mean is the mean of the three elements of x + 2, so each entry is 1/3.
print(x.grad)

tensor([0.3333, 0.3333, 0.3333])


### Ways to prevent pytorch from tracking the gradients
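    * call x.requires_grad_(False) (note the trailing underscore: it modifies the tensor in place)
    * call x.detach(), which returns a new tensor that is not tracked
    * wrap the computation in a `with torch.no_grad():` block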

In [13]:
# Show x before and after switching gradient tracking off in place.
print(x)
x.requires_grad_(requires_grad=False)
print(x)

tensor([ 0.7121, -0.2480, -1.8401], requires_grad=True)
tensor([ 0.7121, -0.2480, -1.8401])


In [14]:
# detach() returns a tensor with the same values as x that is cut off from
# the autograd graph; x itself keeps requires_grad=True.
x = torch.randn((5,), requires_grad=True)
y = x.detach()
print(x, y, sep="\n")

tensor([ 1.0601,  0.6534,  1.3868, -0.2363, -1.5331], requires_grad=True)
tensor([ 1.0601,  0.6534,  1.3868, -0.2363, -1.5331])


In [15]:
x = torch.randn((4,), requires_grad=True)
print(x)
# Operations executed inside no_grad() are not recorded by autograd,
# so y carries no grad_fn even though x requires gradients.
with torch.no_grad():
    y = torch.add(x, 2)
    print(y)

tensor([-0.5113,  0.3730,  1.0853, -0.8524], requires_grad=True)
tensor([1.4887, 2.3730, 3.0853, 1.1476])


#### Very important -> gradients accumulate across backward() calls, so keep a close eye on them

In [20]:
# Five trainable "weights", all initialised to 1 and tracked by autograd.
weights = torch.ones((5,), requires_grad=True)
print(weights)

tensor([1., 1., 1., 1., 1.], requires_grad=True)


In [21]:
# Deliberately do NOT reset the gradients: each backward() call adds its
# result to weights.grad, so the printed values grow every iteration.
for epoch in range(3):
    model_output = torch.sum(torch.mul(weights, 3))
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3., 3.])
tensor([6., 6., 6., 6., 6.])
tensor([9., 9., 9., 9., 9.])


##### As we can clearly see, the gradients keep accumulating in each epoch, so we must be careful to zero the gradients in every epoch

In [23]:
for epoch in range(3):
    # Clear any gradients left over from an earlier backward() call *before*
    # computing new ones. Zeroing only after the print would let stale
    # gradients leak into the first iteration when the notebook is run top to
    # bottom. This mirrors calling optimizer.zero_grad() at the start of a
    # training step.
    if weights.grad is not None:
        weights.grad.zero_()
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)

tensor([3., 3., 3., 3., 3.])
tensor([3., 3., 3., 3., 3.])
tensor([3., 3., 3., 3., 3.])


In [None]:
# This is how a typical training loop looks in PyTorch.
# Illustrative sketch only: `dataset`, `model`, `loss_fn` and `optimizer` are
# placeholders that are not defined in this notebook.
# The loop variable is named `inputs` so it does not shadow the builtin input().
for inputs, target in dataset:
    optimizer.zero_grad()           # reset gradients from the previous step
    output = model(inputs)
    loss = loss_fn(output, target)
    loss.backward()                 # compute fresh gradients for this step
    optimizer.step()                # let the optimizer apply its update

In [None]:
!jupyter nbconvert --execute --to script 02-gradient_with_autograd.ipynb

[NbConvertApp] Converting notebook 02-gradient_with_autograd.ipynb to script
[NbConvertApp] Executing notebook with kernel: python3
[NbConvertApp] Writing 2375 bytes to 02-gradient_with_autograd.py
