In [2]:
import torch

# The `autograd` package provides automatic differentiation for all operations on tensors

### `requires_grad = True` -> tracks all operations on the tensor. 

In [3]:
# Leaf tensor created directly by the user; requires_grad=True asks autograd
# to record the operations applied to it.
x = torch.randn(3, requires_grad=True)
# y is the result of an operation on x, so it carries a grad_fn.
y = 2 + x

## NOTE: `y` was created as a result of an operation $\therefore$ it has a `grad_fn` attribute

## `grad_fn` -> references a Function that has created the Tensor


In [6]:
# x was created by the user, so its grad_fn is None; y remembers the
# operation that produced it.
print(x, y, y.grad_fn, sep="\n")

tensor([-0.2455,  1.5564,  1.3749], requires_grad=True)
tensor([1.7545, 3.5564, 3.3749], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x10d1cd0d0>


### Operating some more on `y`

In [9]:
# Element-wise 3 * y^2: still a 3-element tensor that tracks its history.
y_squared = y * y
z = 3 * y_squared
print(z)
# Reduce to a scalar so backward() can be called on it without arguments.
z = z.mean()
print(z)

tensor([ 9.2344, 37.9442, 34.1698], grad_fn=<MulBackward0>)
tensor(27.1161, grad_fn=<MeanBackward0>)


# Computing the gradients with `backpropagation`

### When we finish our computation we can call `.backward()` and have all the gradients computed automatically.

* The gradient for this tensor will be accumulated into its `.grad` attribute.
  * It is the partial derivative of the function w.r.t. the tensor

In [10]:
# Backpropagate from the scalar z; autograd accumulates dz/dx into x.grad.
z.backward()
dz_dx = x.grad
print(dz_dx)

tensor([3.5089, 7.1128, 6.7498])


* Generally speaking, `torch.autograd` is an engine for computing vector-Jacobian products

* It computes partial derivatives while applying the chain rule

# Model with non-scalar output:

 <u>If a Tensor is non-scalar</u> (has more than one element), we need to specify arguments for `backward()`
 
 * specify a gradient argument that is a tensor of matching shape needed for vector-Jacobian product


In [11]:
# Fresh leaf tensor for the non-scalar example.
x = torch.randn(3, requires_grad=True)

# First doubling; y is tracked because it is computed from x.
y = 2 * x

In [12]:
# Double y ten more times. NOTE: y is rebound from its own previous value, so
# re-running this cell on its own keeps doubling it further.
num_doublings = 10
for _ in range(num_doublings):
    y = 2 * y

# y is still a 3-element (non-scalar) tensor.
print(y, y.shape, sep="\n")

tensor([2530.4995, -730.6630,  812.3691], grad_fn=<MulBackward0>)
torch.Size([3])


In [None]:
# y is non-scalar, so backward() is given a tensor with the same shape as y;
# it is the vector in the vector-Jacobian product whose result lands in x.grad.
v = torch.tensor([0.1, 1.0, 0.0001], dtype=torch.float32)
y.backward(gradient=v)
print(x.grad)

# Stop a tensor from tracking history:

 For example, during the training loop the weight-update operation should not be part of the gradient computation. There are three ways to stop tracking:
*  `x.requires_grad_(False)`
*  `x.detach()`
*  wrap the code in `with torch.no_grad():`

# `.requires_grad_(...)` changes an existing flag in-place.

In [13]:
# A tensor does not track gradients unless asked to.
a = torch.randn(2, 2)
print(a.requires_grad)

# Computed while tracking is off, so the result has no grad_fn.
numerator, denominator = a * 3, a - 1
b = numerator / denominator
print(b.grad_fn)

# Flip the flag in place; operations from here on are recorded.
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)

False
None
True
<SumBackward0 object at 0x10d1d3910>


In [14]:
# .detach() returns a new tensor with the same content that takes no part in
# gradient computation.
a = torch.randn(2, 2, requires_grad=True)
b = a.detach()
print(a.requires_grad, b.requires_grad, sep="\n")

True
False


In [15]:
# Operations executed inside `with torch.no_grad():` are not tracked, even
# when their input requires gradients.
a = torch.randn(2, 2, requires_grad=True)
print(a.requires_grad)
with torch.no_grad():
    # Use `a`, the tensor defined in this cell, rather than a tensor left over
    # from an earlier cell, so the cell runs on its own and shows the effect
    # on the tensor whose flag was just printed.
    print((a ** 2).requires_grad)

True
False


# `backward()` accumulates the gradient for this tensor into its `.grad` attribute.

# NOTE: We need to be careful during optimization!

* Use `.zero_()` to empty the gradients before a new optimization step!

In [16]:
weights = torch.ones(4, requires_grad=True)
learning_rate = 0.1

for epoch in range(3):
    # Dummy forward pass: a scalar output computed from the weights.
    scaled = weights * 3
    model_output = scaled.sum()
    model_output.backward()

    print(weights.grad)

    # Gradient-descent step, done under no_grad so the update itself is not
    # recorded by autograd.
    with torch.no_grad():
        weights -= learning_rate * weights.grad

    # Reset the accumulated gradient. Without this, the next backward() would
    # add to the old value and change the final weights and output.
    weights.grad.zero_()

print(weights, model_output, sep="\n")

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([0.1000, 0.1000, 0.1000, 0.1000], requires_grad=True)
tensor(4.8000, grad_fn=<SumBackward0>)


## Optimizer has `zero_grad()` method

In [None]:
# optimizer = torch.optim.SGD([weights], lr=0.1)

# During training:

In [17]:
# optimizer.step()
# optimizer.zero_grad()