# Automatic Differentiation with torch.autograd

In [4]:
# Source: https://pytorch.org/tutorials/beginner/basics/autogradqs_tutorial.html

# Back propagation is the most frequently used algorithm for training neural
# networks: each model weight is adjusted according to the gradient of the
# loss function with respect to that parameter.

# `torch.autograd` is what we use to compute those gradients.

import torch

# A one-layer network. `w` and `b` are the parameters we need to optimize, so
# they are created with `requires_grad=True`; that is what lets us compute the
# gradients of the loss function with respect to them.
x = torch.ones(5)   # input tensor
y = torch.zeros(3)  # expected output
w = torch.randn(5, 3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# Forward pass: affine transform followed by the loss against the target.
z = x @ w + b
loss = torch.nn.functional.binary_cross_entropy_with_logits(z, y)

# Tensors computed from the tracked parameters expose the backward function
# that produced them through `grad_fn`.
print(f"Gradient function for z = {z.grad_fn}")
print(f"Gradient function for loss = {loss.grad_fn}")

Gradient function for z = <AddBackward0 object at 0x0000024C73D67D90>
Gradient function for loss = <BinaryCrossEntropyWithLogitsBackward0 object at 0x0000024C73D67910>


# Computing Gradients

In [5]:
# Optimizing the weights of the network requires the derivatives of the loss
# function with respect to its parameters. Calling `loss.backward()` computes
# them; afterwards they can be read from each parameter's `.grad` attribute.

loss.backward()
print(w.grad, b.grad, sep="\n")

tensor([[0.0785, 0.0115, 0.0034],
        [0.0785, 0.0115, 0.0034],
        [0.0785, 0.0115, 0.0034],
        [0.0785, 0.0115, 0.0034],
        [0.0785, 0.0115, 0.0034]])
tensor([0.0785, 0.0115, 0.0034])


# Disabling Gradient Tracking

In [None]:
# By default, every tensor computed from inputs with `requires_grad=True`
# tracks its computational history and supports gradient computation.
#
# There are some reasons why you might want to disable gradient tracking:
# - To mark some parameters in your neural network as frozen parameters
# - To speed up computations when you are only doing a forward pass

# Computed normally: the result is tracked.
z = x @ w + b
print(z.requires_grad)

# Computed inside `torch.no_grad()`: the result is not tracked.
with torch.no_grad():
    z = x @ w + b
print(z.requires_grad)

True
False


# More on Computational Graphs

Conceptually, autograd keeps a record of data (tensors) and all executed operations (along with the resulting new tensors) in a 
directed acyclic graph (DAG) consisting of Function objects. In this DAG, leaves are the input tensors and roots are the output tensors.
By tracing this graph from roots to leaves, you can automatically compute the gradients using the chain rule.