In [1]:
import warnings
warnings.filterwarnings("ignore")

## Construct a simple model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional network: two conv/pool stages feeding three linear layers.

    The first linear layer expects 16 * 6 * 6 features per sample, which is what
    the two conv/pool stages produce for a single-channel 32x32 input. The
    network emits 10 values per sample.
    """

    def __init__(self):
        """Create the layers; registration order is conv1, conv2, fc1, fc2, fc3."""
        super().__init__()
        # Convolutional stages: unpadded 3x3 kernels, channels 1 -> 6 -> 16.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)

        # Fully connected stages (each computes y = Wx + b): 576 -> 120 -> 84 -> 10.
        # 16 channels * 6 * 6 spatial positions remain after the second pooling.
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run a batch through the network and return the fc3 output (no final activation)."""
        # Stage 1: convolution -> ReLU -> max pooling with an explicit (2, 2) window.
        pooled1 = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # Stage 2: same pattern; a single int is shorthand for a square window.
        pooled2 = F.max_pool2d(F.relu(self.conv2(pooled1)), 2)

        # Flatten every non-batch dimension so the linear layers can consume it.
        flat = pooled2.view(-1, self.num_flat_features(pooled2))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        total = 1
        for extent in x.shape[1:]:
            total *= extent
        return total

Take care that the dimensions match between the last `Conv2d` layer and the first `Linear` layer.

Layer Dimension change:

    W1 * H1 * D1 --> W2 * H2 * D2

For `conv` layer:

    W2 = (W1 - F + 2P)/S + 1
    H2 = (H1 - F + 2P)/S + 1
    D2 = K
    
For `maxpool` layer:
    
    W2 = (W1 - F)/S + 1
    H2 = (H1 - F)/S + 1
    D2 = D1
    
In this example:
- `input`: 32 * 32 * 1.
- `conv1`: 30 * 30 * 6.
- `max_pool2d`: 15 * 15 * 6.
- `conv2`: 13 * 13 * 16.
- `max_pool2d`: **6 * 6 * 16** (the division is rounded down: (13 - 2)/2 + 1 = 6.5 → 6).

Therefore, the input dimension of FC layer is _16 * 6 * 6_:

```python
self.fc1 = nn.Linear(16 * 6 * 6, 120)
```

Also, remember to flatten the tensor inside the `forward()` method before passing it to the first `Linear` layer. Here, the helper method `num_flat_features()` computes the flattened size.

In [3]:
nn.Conv2d.__init__

<function torch.nn.modules.conv.Conv2d.__init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')>

In [4]:
nn.Linear.__init__

<function torch.nn.modules.linear.Linear.__init__(self, in_features, out_features, bias=True)>

In [5]:
F._max_pool2d

<function torch.nn.functional._max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False, return_indices=False)>

### Note:

In `MaxPool2d`, the default value for `stride` is the `kernel_size`. Check [here](https://pytorch.org/docs/stable/nn.html#torch.nn.MaxPool2d).

In [6]:
F.relu

<function torch.nn.functional.relu(input, inplace=False)>

### NOTE: `forward()` must be implemented to allow forward propagation.

In [7]:
print(nn.Module.__doc__)

Base class for all neural network modules.

    Your models should also subclass this class.

    Modules can also contain other Modules, allowing to nest them in
    a tree structure. You can assign the submodules as regular attributes::

        import torch.nn as nn
        import torch.nn.functional as F

        class Model(nn.Module):
            def __init__(self):
                super(Model, self).__init__()
                self.conv1 = nn.Conv2d(1, 20, 5)
                self.conv2 = nn.Conv2d(20, 20, 5)

            def forward(self, x):
               x = F.relu(self.conv1(x))
               return F.relu(self.conv2(x))

    Submodules assigned in this way will be registered, and will have their
    parameters converted too when you call :meth:`to`, etc.
    


In [8]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## Check parameters

In [9]:
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

10
torch.Size([6, 1, 3, 3])


In [10]:
for p in params:
    print(p.size())

torch.Size([6, 1, 3, 3])
torch.Size([6])
torch.Size([16, 6, 3, 3])
torch.Size([16])
torch.Size([120, 576])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


The format is weight of layer 1, bias of layer 1, weight of layer 2, bias of layer 2, etc.

In [11]:
p = params[0]

In [12]:
repr(p)

'Parameter containing:\ntensor([[[[-0.0032, -0.2884,  0.1934],\n          [-0.2924,  0.2555,  0.0155],\n          [ 0.2796,  0.2962, -0.2466]]],\n\n\n        [[[ 0.2415,  0.1273,  0.1293],\n          [ 0.1940,  0.1790,  0.2177],\n          [ 0.2358, -0.0839,  0.3071]]],\n\n\n        [[[ 0.1891,  0.2000,  0.1162],\n          [ 0.0717,  0.1210, -0.2494],\n          [ 0.3034, -0.0455,  0.1981]]],\n\n\n        [[[ 0.2080, -0.0486,  0.0419],\n          [-0.1845, -0.0011,  0.1465],\n          [ 0.1239, -0.0538, -0.1986]]],\n\n\n        [[[-0.0278,  0.0943,  0.2926],\n          [-0.3070,  0.3168, -0.0244],\n          [ 0.0730,  0.0077,  0.1895]]],\n\n\n        [[[-0.0435, -0.0272, -0.0296],\n          [-0.1941, -0.2457,  0.0320],\n          [-0.2812, -0.2611,  0.2281]]]], requires_grad=True)'

## Forwarding

In [13]:
# One random sample shaped (nSamples=1, nChannels=1, Height=32, Width=32).
# NOTE: the name `input` shadows Python's built-in input() from here on.
input = torch.randn(1, 1, 32, 32)
out = net(input) 
print(out)

tensor([[-0.0270,  0.1041, -0.1182,  0.0681, -0.0702, -0.0923, -0.0845,  0.1088,
         -0.0210,  0.0042]], grad_fn=<AddmmBackward>)


Here, `out` is a tensor, the output of the net.

In [14]:
print(input)

tensor([[[[ 0.1903, -0.3203,  0.4022,  ..., -1.2648,  1.3155, -0.2763],
          [ 0.5091, -1.6100, -1.8171,  ...,  2.0692,  0.1607, -0.1866],
          [ 0.0499,  0.3511, -0.4415,  ...,  0.3035,  3.5271,  0.3674],
          ...,
          [ 1.0345, -0.1714, -1.4335,  ...,  0.7592,  0.4028,  0.6199],
          [-0.3137, -0.0210, -0.4815,  ..., -0.0518,  0.2046, -0.2769],
          [-1.7634, -1.0415,  0.6456,  ..., -0.2587, -0.0298,  0.8212]]]])


In [15]:
net.forward(input)

tensor([[-0.0270,  0.1041, -0.1182,  0.0681, -0.0702, -0.0923, -0.0845,  0.1088,
         -0.0210,  0.0042]], grad_fn=<AddmmBackward>)

`net(input)` is equivalent to `net.forward(input)`.

### Note: 
`torch.nn` only supports mini-batches. For example, `nn.Conv2d` will take in a 4D Tensor of `nSamples x nChannels x Height x Width`.

If you have a single sample, just use `input.unsqueeze(0)` to add a fake batch dimension.

In [16]:
single_input = torch.randn(1, 32, 32)

In [17]:
single_input.shape

torch.Size([1, 32, 32])

In [18]:
single_input.unsqueeze(0).shape

torch.Size([1, 1, 32, 32])

In [19]:
net(single_input.unsqueeze(0))

tensor([[-0.0209,  0.1042, -0.1215,  0.0354, -0.0676, -0.0959, -0.0522,  0.0996,
         -0.0114, -0.0009]], grad_fn=<AddmmBackward>)

### Calling backward

In [20]:
net.zero_grad()

In [21]:
out.backward(torch.randn(1, 10))

## Loss function

In [22]:
# Get a dummy target, for example
target = torch.randn(10)

In [23]:
target

tensor([ 0.5903,  0.6047,  0.7664,  0.5518, -0.3725,  1.2649, -0.6915, -1.5780,
         0.0733,  0.2612])

In [24]:
target.shape

torch.Size([10])

In [25]:
# make it the same shape as output
target = target.view(1, -1) 

In [26]:
target

tensor([[ 0.5903,  0.6047,  0.7664,  0.5518, -0.3725,  1.2649, -0.6915, -1.5780,
          0.0733,  0.2612]])

In [27]:
target.shape

torch.Size([1, 10])

Official doc for [`Tensor.view`](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view)

In [28]:
# Get an output by forward propagation
output = net(input)

In [29]:
output

tensor([[-0.0270,  0.1041, -0.1182,  0.0681, -0.0702, -0.0923, -0.0845,  0.1088,
         -0.0210,  0.0042]], grad_fn=<AddmmBackward>)

In [30]:
criterion = nn.MSELoss()

### Note: 

A full list of loss functions in pytorch can be found [here](https://pytorch.org/docs/stable/nn.html#loss-functions).

In [31]:
loss = criterion(output, target)
print(loss)

tensor(0.6870, grad_fn=<MseLossBackward>)


In [32]:
# Walk the autograd graph backwards from the loss. The node types named in the
# comments are the ones shown in the recorded output of this cell.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear (fc3), shown as AddmmBackward
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # AccumulateGrad leaf, not ReLU -- presumably fc3's bias (verify)

print(loss.grad_fn.next_functions[0][0].next_functions[1][0])  # ReLU applied to fc2's output
print(loss.grad_fn.next_functions[0][0].next_functions[2][0])  # transpose node (TBackward) -- presumably fc3's weight (verify)

<MseLossBackward object at 0x7f2ace1db9e8>
<AddmmBackward object at 0x7f2ace1dba20>
<AccumulateGrad object at 0x7f2ace1db9e8>
<ReluBackward0 object at 0x7f2ace1dba90>
<TBackward object at 0x7f2ace1dba58>


In [33]:
print(loss.grad_fn.next_functions[0][0].next_functions[1][0].next_functions)

((<AddmmBackward object at 0x7f2ace1d8e80>, 0),)


## Backpropagation

In [34]:
net.zero_grad()     # zeroes the gradient buffers of all parameters
# NOTE(review): the recorded output shows zeros here; newer PyTorch releases may
# reset gradients to None instead (zero_grad's set_to_none default) -- confirm
# against the installed version.

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate from the scalar loss so gradients are written to the parameters' .grad.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0019,  0.0030,  0.0087,  0.0046, -0.0154, -0.0070])


In [35]:
net.conv2.weight.shape

torch.Size([16, 6, 3, 3])

In [36]:
net.conv2.weight.grad.shape

torch.Size([16, 6, 3, 3])

In [37]:
net.conv2.bias.shape

torch.Size([16])

## Update the weights

In [38]:
import torch.optim as optim

In [39]:
# create your optimizer
# NOTE(review): lr=10 is far larger than typical SGD step sizes; it appears to be
# chosen so that a single step visibly changes the printed parameters -- confirm
# before reusing this value for real training.
optimizer = optim.SGD(net.parameters(), lr=10)

Check full optim list ([here](https://pytorch.org/docs/stable/optim.html)).

In [40]:
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)     # forward pass on the same sample as before
loss = criterion(output, target)  # MSE between the prediction and the dummy target

In [41]:
net.conv1.bias

Parameter containing:
tensor([ 0.1756,  0.0457,  0.0422,  0.1254, -0.1126,  0.0537],
       requires_grad=True)

In [42]:
net.conv1.bias.grad

tensor([0., 0., 0., 0., 0., 0.])

### Perform backpropagation

In [43]:
loss.backward()

In [44]:
net.conv1.bias

Parameter containing:
tensor([ 0.1756,  0.0457,  0.0422,  0.1254, -0.1126,  0.0537],
       requires_grad=True)

In [45]:
net.conv1.bias.grad

tensor([ 0.0019,  0.0030,  0.0087,  0.0046, -0.0154, -0.0070])

The parameters are the same but the gradients are updated.

### Update the parameters

In [46]:
optimizer.step()    # Does the update

In [47]:
net.conv1.bias

Parameter containing:
tensor([ 0.1570,  0.0157, -0.0448,  0.0790,  0.0409,  0.1240],
       requires_grad=True)

In [48]:
net.conv1.bias.grad

tensor([ 0.0019,  0.0030,  0.0087,  0.0046, -0.0154, -0.0070])

The parameters are updated and the gradients stay the same.