### autograd explanation

Pipleline:

1. forward (d(cur_var)/d(prev_var) is calculated at first)

2. backward (grads are put into vars at backward by chain rules)


In [2]:
import torch
import numpy

1. x = Variable(torch.rand(3,3))

       Variable() is not needed in the latest version of pytorch
   
2. three key attributes in Variable()

   2.1 x.data 
       represents its tensor value
   2.2 x.grad 
       represents its gradient value 
       only available in starting nodes in computing graph
       intermediate nodes have empty .grad attributes
       x.grad is (partial loss)/(partial x)
   2.3 x.grad_fn 
       represents the create function type of intermediate nodes
       c = a*b, c has grad_fn=<MulBackward0>, gradients are calculated based on the grad_fn types
    

In [2]:
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
e = torch.sum(d)
e.backward()

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)
print(e.grad_fn)

tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([  8.,  64., 216.]) tensor([  4.,  32., 108.])
<MulBackward0 object at 0x7fd4e56f44e0>
<PowBackward0 object at 0x7fd4e65162b0>
<SumBackward0 object at 0x7fd4e56f44e0>


3. x.backward()

   3.1 ratain_graph = True
       use x.backward( retain_graph=True ) at the first time of backward() allows using backward functions for several times 
       we can only use c.backward() for only one time without retain_graph since nodes of the computing graph would be released after backward() and no buffer is available for next backward()
       
   3.2 tensor backward
       if x is a tensor instead of a scalar, we should use x.backward(z) (x.shape = z.shape)
       we actually do torch.sum(x*z).backward()
       if we want to backward from middle of the computing graph (backward from y), we can do x.backward(dy/dx)

In [3]:
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
e = torch.sum(d)
e.backward(retain_graph=True)
e.backward()

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)
print(e.grad_fn)

tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([ 16., 128., 432.]) tensor([  8.,  64., 216.])
<MulBackward0 object at 0x7fd4e6ebfc18>
<PowBackward0 object at 0x7fd4e6eb8a20>
<SumBackward0 object at 0x7fd4e6ebfc18>


In [4]:
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
aux = torch.ones(c.shape[0])
d.backward(aux)

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)

tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([  8.,  64., 216.]) tensor([  4.,  32., 108.])
<MulBackward0 object at 0x7fd4e56f4668>
<PowBackward0 object at 0x7fd4e67d27f0>


4. x.detach()

   cut variable from the computing graph
   cannot backward when backward pass this variable
   x = x.detach() can do x splitting
   
   1) grad_fn == None
   2) requires_grad == False

In [5]:
# a.detach() would split one tensor from computing graph and make a copy of this a
# a.detach() variable has grad_fn == False and requires_grad == False
# b = a.detach() would change b into a and split backward
one = torch.tensor((1.,1.,1.), requires_grad=True)
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
c = one.detach()
d = c ** 2
e = torch.sum(d)
e.backward()

print(c.requires_grad, c.grad_fn)

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

5. x.register_hook(func)

       def func(grad):
           /* grad operations 
           */
           return grad
    
   can modify gradient when backward by binding variable with func by register_hook

In [6]:
# can freely change backward procedure during backward
# can freely change any variable and bind with it
def modify_grad(grad):
    print(grad)
    return -grad
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
d.register_hook(modify_grad)
e = torch.sum(d)
e.backward()

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)

tensor([1., 1., 1.])
tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([  -8.,  -64., -216.]) tensor([  -4.,  -32., -108.])
<MulBackward0 object at 0x7fd4e6ebfb70>
<PowBackward0 object at 0x7fd4e6ebf898>


6. custom_function

       class custom_function(torch.autograd.Function):
           @staticmethod
           def forward(ctx,inpu1t,input2):
               /* do forwarding function
                  ctx can be stored by variables and used at backward function
                  ctx.save_for_backward(input1,input2)
               */
               return output // shoulde be in one variable     
           @staticmethod
           def backward(ctx,grad_output):
               /* do backwarding function
                  ctx can be used for backward function
                  input1, input2 = ctx.saved_tensors
               */
               return input // shoulde be the same size with forward input


In [6]:
class custom_function(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, y):
        # store input var
        ctx.x = x
        ctx.y = y
        
        return x*y
    
    @staticmethod
    def backward(ctx, grad_output):
        print("bp_grad : {}".format(grad_output))
        # get backward output via input var
        grad_x = -ctx.x * grad_output
        grad_y = -ctx.y * grad_output
        return grad_x, grad_y

a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)   
c = custom_function.apply(a,b)
e = torch.sum(c)
e.backward()

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)

bp_grad : tensor([1., 1., 1.])
tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([-1., -2., -3.]) tensor([-2., -4., -6.])
<torch.autograd.function.custom_functionBackward object at 0x7fbb15535c78>
<PowBackward0 object at 0x7fbb1559be80>


7. torch.no_grad()

       a = b + c
   
       with torch.no_grad():
           a.mul_(2)
   operations within torch.no_grad() woulde not be tracked, grad_fn would not change
   
   it is often the case that in order to save memory, we run our evaluation and test code under torch.no_grad() to avoid backward memory cost

In [8]:
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
with torch.no_grad():
    d.mul_(2)
# grad not tracked under torch.no_grad()
print(d.grad_fn)
e = torch.sum(d)
e.backward()

print(a.data, b.data)
print(a.grad, b.grad)
print(c.grad_fn)
print(d.grad_fn)

<PowBackward0 object at 0x7fd4e6f10be0>
tensor([1., 2., 3.]) tensor([2., 4., 6.])
tensor([  8.,  64., 216.]) tensor([  4.,  32., 108.])
<MulBackward0 object at 0x7fd4e6f10ba8>
<PowBackward0 object at 0x7fd4e6f10be0>


8. y.retain_grad()

   store the grad of intermediate vars
   cost more memory

In [10]:
a = torch.tensor((1., 2., 3.), requires_grad=True)
b = torch.tensor((2., 4., 6.), requires_grad=True)
c = a * b
d = c ** 2
d.retain_grad()
print(d.grad)
e = torch.sum(d)
e.backward()

print(d.grad)

None
tensor([1., 1., 1.])
