# PyTorch Internals

In [1]:
import torch

### Backward errors with requires_grad set to False

In [2]:
p = torch.randn(6)
p.size()

torch.Size([6])

In [3]:
q = torch.randn(6)
q.size()

torch.Size([6])

In [4]:
m = p + q

In [5]:
m.requires_grad = False

In [6]:
m.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [10]:
print(p.grad_fn)

None


In [11]:
print(q.grad_fn)

None


In [12]:
print(m.grad_fn)

None


### With requires_grad set to True

In [13]:
a = torch.rand(5, 3, requires_grad=True)

In [14]:
a.requires_grad

True

In [15]:
b = 3.0 * a + torch.randn(5, 3)

In [16]:
b.grad_fn

<AddBackward0 at 0x7f61ebb7d048>

In [17]:
b.backward()

RuntimeError: grad can be implicitly created only for scalar outputs

In [18]:
b.backward(torch.ones(5, 3))

In [19]:
a.grad

tensor([[3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.],
        [3., 3., 3.]])

In [20]:
b.grad

In [21]:
b.backward(torch.ones(5, 3))

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

In [22]:
c = b + 5

### Backward clears the graph
Once `backward()` has run, the intermediate buffers needed to compute gradients are freed. Any later `backward()` call that passes through the same graph (including one started from a new tensor built on top of it, such as `c = b + 5`) fails with the error shown below.

This problem can be solved by passing `retain_graph=True` to every `backward()` call on the same graph except the last one.

In [23]:
# c was built on top of b, and b.backward() has already run once without
# retain_graph=True, so backpropagating from c fails with the error shown below.
c.backward(torch.ones(5, 3))

RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.

### Gradient accumulation

In [24]:
a = torch.randn(5, requires_grad=True)
b = torch.randn(5, requires_grad=True)

In [25]:
c = 3 * a + 5 * b

In [26]:
c.backward(torch.ones(5), retain_graph=True)

In [27]:
a.grad

tensor([3., 3., 3., 3., 3.])

In [28]:
b.grad

tensor([5., 5., 5., 5., 5.])

### The gradients have doubled: the second `backward()` call added its gradients to the values already stored in `.grad`. This is known as gradient accumulation, which is desirable when parameters are shared, as in RNNs.

In [29]:
c.backward(torch.ones(5))

In [30]:
a.grad

tensor([6., 6., 6., 6., 6.])

In [31]:
b.grad

tensor([10., 10., 10., 10., 10.])

### Using torch.no_grad() and detach()

### With no_grad()

In [32]:
# Results of operations computed inside a no_grad block have requires_grad=False.
# Existing tensors keep their flag, and a tensor created explicitly with
# requires_grad=True still reports True (compare the four printed values).
with torch.no_grad():
    print(a.requires_grad)      # existing tensor: flag is unchanged
    print(b.requires_grad)      # existing tensor: flag is unchanged
    print((3*b).requires_grad)  # result of an operation: not tracked
    
    # explicitly requested requires_grad=True is honoured even inside no_grad
    z = torch.ones(1, requires_grad=True)
    print(z.requires_grad)

True
True
False
True


### With detach() 

In [33]:
a = torch.randn(5, 3, requires_grad=True)
b = torch.randn(5, 3, requires_grad=True)
c = 3 * a.detach() + 2 * b
# c = 3 * a + 2 * b

In [34]:
c.backward(torch.ones(5, 3))

In [35]:
# Because a was detached when building c, no gradient was computed for a
# (a.grad stays None), while b.grad holds d(c)/d(b) = 2.
print(a.grad)
print(b.grad)

None
tensor([[2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.],
        [2., 2., 2.]])


# Deep into nn.Module

In [36]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [42]:
class Net(nn.Module):
    """Small convolutional classifier: two conv + max-pool stages, then three linear layers.

    Args:
        in_channels: Number of channels in the input image. Defaults to 1,
            which reproduces the original single-channel network.

    ``fc1`` expects 16 * 6 * 6 flattened features. The two 3x3 convolutions
    (no padding) and 2x2 poolings produce a 6x6 map for e.g. 32x32 inputs
    (32 -> 30 -> 15 -> 13 -> 6).
    """

    def __init__(self, in_channels=1):
        super(Net, self).__init__()
        # `in_channels` input image channels, 6 output channels,
        # 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(in_channels, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch ``x`` and return a (batch, 10) tensor."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough to specify it
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension before the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [38]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [39]:
net.parameters()

<generator object Module.parameters at 0x7f61ebb8e4c0>

In [40]:
# Materialize the parameter generator so it can be indexed.
params = list(net.parameters())
# NOTE: only the last expression of a cell is displayed, so the value of
# len(params) is computed here but never shown.
len(params)
# Shape of the first parameter tensor (displayed below).
params[0].size()

torch.Size([6, 1, 3, 3])

In [41]:
params[0]

Parameter containing:
tensor([[[[-0.1369, -0.1977, -0.3144],
          [ 0.1588,  0.2970, -0.2408],
          [ 0.2835,  0.0907, -0.2538]]],


        [[[ 0.2823,  0.0224, -0.1275],
          [ 0.0322,  0.0405,  0.2065],
          [ 0.0587, -0.2864,  0.0359]]],


        [[[-0.0062,  0.2356,  0.1773],
          [ 0.2435,  0.1075, -0.2533],
          [-0.3113, -0.0684,  0.2189]]],


        [[[ 0.0852, -0.1470,  0.1616],
          [ 0.1421,  0.2172, -0.2870],
          [-0.0116,  0.1703, -0.0591]]],


        [[[-0.2026,  0.0159, -0.1561],
          [-0.2744, -0.1268, -0.3268],
          [-0.1646,  0.0113,  0.2688]]],


        [[[ 0.2179,  0.1531,  0.2704],
          [-0.1295, -0.2900, -0.2129],
          [ 0.1495,  0.1606, -0.1538]]]], requires_grad=True)

In [46]:
class Net2(nn.Module):
    """Same architecture as the single-channel network, but for multi-channel input.

    Args:
        in_channels: Number of channels in the input image. Defaults to 3,
            which reproduces the original three-channel network.

    ``fc1`` expects 16 * 6 * 6 flattened features. The two 3x3 convolutions
    (no padding) and 2x2 poolings produce a 6x6 map for e.g. 32x32 inputs
    (32 -> 30 -> 15 -> 13 -> 6).
    """

    def __init__(self, in_channels=3):
        super(Net2, self).__init__()
        # NOTICE THE CHANGE IN NO OF INPUT CHANNELS:
        # `in_channels` (3 by default) input image channels, 6 output channels,
        # 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(in_channels, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch ``x`` and return a (batch, 10) tensor."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough to specify it
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension before the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

In [47]:
net2 = Net2()
print(net2)

Net2(
  (conv1): Conv2d(3, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [49]:
params2 = list(net2.parameters())
print(len(params2))
print(params2[0].size())

10
torch.Size([6, 3, 3, 3])


In [50]:
params2[0]

Parameter containing:
tensor([[[[-1.2761e-01, -1.5749e-01, -1.3595e-01],
          [ 1.4525e-01, -1.0746e-02,  9.3678e-02],
          [ 1.8122e-01, -8.9776e-02, -1.0035e-01]],

         [[-1.7083e-01,  5.7872e-02, -1.5934e-01],
          [-1.8231e-01,  5.1538e-02,  1.6096e-01],
          [-1.2909e-01, -1.7455e-01,  2.6304e-02]],

         [[ 1.4798e-01,  9.5521e-02,  8.7700e-02],
          [ 7.3327e-02,  2.5389e-02,  1.2276e-01],
          [-1.8083e-01, -5.6925e-02,  7.8782e-02]]],


        [[[ 8.8384e-02, -7.8293e-02,  1.7724e-02],
          [ 1.5216e-01,  1.5532e-01,  6.6445e-02],
          [ 1.4861e-01,  1.2747e-01,  5.6395e-02]],

         [[ 4.0030e-02,  3.1046e-02,  5.7953e-02],
          [-1.4433e-01,  7.3532e-03, -1.8850e-02],
          [ 1.1188e-01,  1.9380e-02, -8.7861e-02]],

         [[-3.6651e-03,  1.3366e-01,  1.1445e-01],
          [-4.3057e-02, -2.3421e-03,  1.3645e-01],
          [ 6.5544e-02, -9.9577e-02, -1.1401e-01]]],


        [[[ 6.8335e-02,  6.3177e-02,  1.7513

In [52]:
type(params[0])

torch.nn.parameter.Parameter

In [None]:
# NOTE(review): unexecuted scratch cell. Elsewhere in this notebook Conv2d is
# always constructed with (in_channels, out_channels, kernel_size); a call with
# no arguments presumably raises TypeError. Confirm the intent or remove the cell.
nn.Conv2d()