In [1]:
import warnings
# Silence every Python warning for the rest of the session so cell outputs stay clean.
# NOTE(review): this blanket filter also hides deprecation warnings — consider narrowing it.
warnings.filterwarnings("ignore")

## Construct a simple model

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small LeNet-style CNN: two conv/ReLU/max-pool stages, then three linear layers.

    ``fc1`` is hard-wired to ``16 * 6 * 6`` input features, which is the
    flattened size the two conv/pool stages produce for a 1 x 32 x 32 input.
    """

    def __init__(self):
        """Create the two convolutional layers and the three linear layers."""
        super().__init__()

        # Feature extractor: 1 -> 6 -> 16 channels, 3x3 kernels, default (zero) padding.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=3, stride=1)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3)

        # Classifier head: affine maps y = Wx + b.
        # 16 channels * 6 * 6 spatial positions remain after the second pooling.
        self.fc1 = nn.Linear(in_features=16 * 6 * 6, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=10)

    def forward(self, x):
        """Run the network on a batch and return one row of 10 scores per sample.

        ``x`` is expected to be a 4D ``(N, 1, H, W)`` tensor; ``fc1`` is sized
        for ``H = W = 32``.
        """
        # Stage 1: convolution, ReLU, then max pooling over a (2, 2) window.
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, (2, 2))

        # Stage 2: same pattern; a square window may be given as a single number.
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)

        # Collapse everything except the batch dimension before the linear layers.
        flat = x.view(-1, self.num_flat_features(x))

        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one."""
        total = 1
        for axis in range(1, x.dim()):
            total *= x.size(axis)
        return total

Take care that the output dimensions of the last `Conv2d` (plus pooling) stage match the input dimension of the first `Linear` layer.

Layer Dimension change:

    W1 * H1 * D1 --> W2 * H2 * D2

For `conv` layer:

    W2 = (W1 - F + 2P)/S + 1
    H2 = (H1 - F + 2P)/S + 1
    D2 = K
    
For `maxpool` layer:
    
    W2 = (W1 - F)/S + 1
    H2 = (H1 - F)/S + 1
    D2 = D1
    
In this example:
- `input`: 32 * 32 * 1.
- `conv1`: 30 * 30 * 6.
- `max_pool2d`: 15 * 15 * 6.
- `conv2`: 13 * 13 * 16.
- `max_pool2d`: **6 * 6 * 16** (the division is floored: (13 - 2)/2 + 1 = 6.5 → 6).

Therefore, the input dimension of FC layer is _16 * 6 * 6_:

```python
self.fc1 = nn.Linear(16 * 6 * 6, 120)
```

Also, remember to flatten the tensor inside the `forward()` method before passing it to the first `Linear` layer. Here, the helper function `num_flat_features()` computes the flattened size.

In [3]:
nn.Conv2d.__init__

<function torch.nn.modules.conv.Conv2d.__init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros')>

In [4]:
nn.Linear.__init__

<function torch.nn.modules.linear.Linear.__init__(self, in_features, out_features, bias=True)>

In [5]:
F._max_pool2d

<function torch.nn.functional._max_pool2d(input, kernel_size, stride=None, padding=0, dilation=1, ceil_mode=False, return_indices=False)>

### Note:

In `MaxPool2d`, the default value for `stride` is the `kernel_size`. Check [here](https://pytorch.org/docs/stable/nn.html#torch.nn.MaxPool2d).

In [6]:
F.relu

<function torch.nn.functional.relu(input, inplace=False)>

### NOTE: `forward()` must be implemented to allow forward propagation.

In [7]:
print(nn.Module.__doc__)

Base class for all neural network modules.

    Your models should also subclass this class.

    Modules can also contain other Modules, allowing to nest them in
    a tree structure. You can assign the submodules as regular attributes::

        import torch.nn as nn
        import torch.nn.functional as F

        class Model(nn.Module):
            def __init__(self):
                super(Model, self).__init__()
                self.conv1 = nn.Conv2d(1, 20, 5)
                self.conv2 = nn.Conv2d(20, 20, 5)

            def forward(self, x):
               x = F.relu(self.conv1(x))
               return F.relu(self.conv2(x))

    Submodules assigned in this way will be registered, and will have their
    parameters converted too when you call :meth:`to`, etc.
    


In [8]:
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


## Check parameters

In [9]:
params = list(net.parameters())
print(len(params))
print(params[0].size())  # conv1's .weight

10
torch.Size([6, 1, 3, 3])


In [10]:
for p in params:
    print(p.size())

torch.Size([6, 1, 3, 3])
torch.Size([6])
torch.Size([16, 6, 3, 3])
torch.Size([16])
torch.Size([120, 576])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


The format is weight of layer 1, bias of layer 1, weight of layer 2, bias of layer 2, etc.

## Forwarding

In [11]:
# One random sample shaped (batch=1, channels=1, height=32, width=32).
# NOTE(review): the name `input` shadows the Python builtin; it is kept because later cells reuse it.
input = torch.randn(1, 1, 32, 32)
out = net(input)  # calling the module runs the forward pass
print(out)

tensor([[-0.0590,  0.0326,  0.0151, -0.0278,  0.0290, -0.0590,  0.0264, -0.0979,
          0.0484,  0.1259]], grad_fn=<AddmmBackward>)


Here, `out` is a tensor, the output of the net.

In [12]:
print(input)

tensor([[[[-0.2681,  0.4807, -0.6466,  ..., -0.1946, -0.0474,  1.2671],
          [ 0.5506,  0.8510, -0.1146,  ...,  0.6029, -0.2611,  1.2922],
          [ 1.2703, -1.2346, -0.7000,  ...,  1.0226, -0.7621,  0.2494],
          ...,
          [ 1.9969,  0.5349, -0.8191,  ..., -0.8485,  0.3072, -0.1953],
          [-0.3355, -0.7920, -0.0837,  ...,  0.0290,  0.6604,  1.6055],
          [-1.3876,  0.8811, -0.4657,  ..., -0.5167,  0.1634,  0.8793]]]])


In [13]:
net.forward(input)

tensor([[-0.0590,  0.0326,  0.0151, -0.0278,  0.0290, -0.0590,  0.0264, -0.0979,
          0.0484,  0.1259]], grad_fn=<AddmmBackward>)

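`net(input)` returns the same result as `net.forward(input)` here. Prefer `net(input)`: calling the module also runs any registered hooks, whereas calling `forward()` directly skips them.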

### Note: 
`torch.nn` only supports mini-batches. For example, `nn.Conv2d` will take in a 4D Tensor of `nSamples x nChannels x Height x Width`.

If you have a single sample, just use `input.unsqueeze(0)` to add a fake batch dimension.

In [14]:
# A single random sample with no batch dimension: 3D, shape (1, 32, 32).
single_input = torch.randn(1, 32, 32)

In [15]:
single_input.shape

torch.Size([1, 32, 32])

In [16]:
single_input.unsqueeze(0).shape

torch.Size([1, 1, 32, 32])

In [17]:
net(single_input.unsqueeze(0))

tensor([[-0.0408,  0.0480,  0.0079,  0.0126,  0.0540, -0.0716,  0.0116, -0.0941,
          0.0815,  0.1049]], grad_fn=<AddmmBackward>)

### Calling backward

In [18]:
net.zero_grad()

In [19]:
# `out` has shape (1, 10) rather than being a scalar, so backward() is given an
# explicit gradient of the same shape; a random one is used here purely as a demo.
out.backward(torch.randn(1, 10))

## Loss function

In [20]:
# Get a dummy target, for example
target = torch.randn(10)

In [21]:
target

tensor([ 0.4007,  0.7389,  2.0597,  0.2588, -0.4709,  0.9864, -0.1973, -0.4430,
        -0.7187,  0.8494])

In [22]:
target.shape

torch.Size([10])

In [23]:
# make it the same shape as output
target = target.view(1, -1) 

In [24]:
target

tensor([[ 0.4007,  0.7389,  2.0597,  0.2588, -0.4709,  0.9864, -0.1973, -0.4430,
         -0.7187,  0.8494]])

In [25]:
target.shape

torch.Size([1, 10])

Official doc for [`Tensor.view`](https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view)

In [26]:
# Get an output by forward propagation
output = net(input)

In [27]:
output

tensor([[-0.0590,  0.0326,  0.0151, -0.0278,  0.0290, -0.0590,  0.0264, -0.0979,
          0.0484,  0.1259]], grad_fn=<AddmmBackward>)

In [28]:
criterion = nn.MSELoss()

### Note: 

A full list of loss functions in pytorch can be found [here](https://pytorch.org/docs/stable/nn.html#loss-functions).

In [29]:
loss = criterion(output, target)
print(loss)

tensor(0.7597, grad_fn=<MseLossBackward>)


In [30]:
# Walk the autograd graph backwards from the loss. The node types named here are the
# ones shown in the saved output below; they may differ in other PyTorch versions.
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear (recorded as Addmm)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])  # AccumulateGrad (a leaf parameter; presumably fc3.bias)

print(loss.grad_fn.next_functions[0][0].next_functions[1][0])  # ReLU
print(loss.grad_fn.next_functions[0][0].next_functions[2][0])  # transpose (TBackward; presumably of fc3.weight)

<MseLossBackward object at 0x7fe53dbecdd8>
<AddmmBackward object at 0x7fe53dbece80>
<AccumulateGrad object at 0x7fe53dbecdd8>
<ReluBackward0 object at 0x7fe53dbecef0>
<TBackward object at 0x7fe53dbeceb8>


In [31]:
print(loss.grad_fn.next_functions[0][0].next_functions[1][0].next_functions)

((<AddmmBackward object at 0x7fe53dbe8c18>, 0),)


## Backpropagation

In [32]:
# Gradients accumulate across backward() calls, so clear them first.
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

# Backpropagate from the scalar loss; this populates .grad on the network's parameters.
loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0097, -0.0107, -0.0066,  0.0141, -0.0073, -0.0014])


In [33]:
net.conv2.weight.shape

torch.Size([16, 6, 3, 3])

In [34]:
net.conv2.weight.grad.shape

torch.Size([16, 6, 3, 3])

In [35]:
net.conv2.bias.shape

torch.Size([16])

## Update the weights

In [36]:
import torch.optim as optim

In [37]:
# create your optimizer
# NOTE(review): lr=10 is unusually large; presumably chosen so the parameter change
# after optimizer.step() is easy to see in the cells below — confirm before reuse.
optimizer = optim.SGD(net.parameters(), lr=10)

In [38]:
print(optim.SGD.__doc__)

Implements stochastic gradient descent (optionally with momentum).

    Nesterov momentum is based on the formula from
    `On the importance of initialization and momentum in deep learning`__.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float): learning rate
        momentum (float, optional): momentum factor (default: 0)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        dampening (float, optional): dampening for momentum (default: 0)
        nesterov (bool, optional): enables Nesterov momentum (default: False)

    Example:
        >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
        >>> optimizer.zero_grad()
        >>> loss_fn(model(input), target).backward()
        >>> optimizer.step()

    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

    .. note::
        The implementation of SGD with Momentum/Nesterov subtly dif

Here, the PyTorch optimizer takes an iterable of the parameters to optimize, `net.parameters()`.

Check full optim list ([here](https://pytorch.org/docs/stable/optim.html)).

In [39]:
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)  # forward pass
loss = criterion(output, target)  # MSE between the prediction and the target
# loss.backward() and optimizer.step() are run in the following cells.

In [40]:
net.conv1.bias

Parameter containing:
tensor([-0.2270, -0.2612, -0.0787,  0.2637, -0.0495,  0.1804],
       requires_grad=True)

In [41]:
net.conv1.bias.grad

tensor([0., 0., 0., 0., 0., 0.])

### Perform backpropagation

In [42]:
loss.backward()

In [43]:
net.conv1.bias

Parameter containing:
tensor([-0.2270, -0.2612, -0.0787,  0.2637, -0.0495,  0.1804],
       requires_grad=True)

In [44]:
net.conv1.bias.grad

tensor([ 0.0097, -0.0107, -0.0066,  0.0141, -0.0073, -0.0014])

The parameters are the same but the gradients are updated.

### Update the parameters

In [45]:
optimizer.step()    # Does the update

In [46]:
net.conv1.bias

Parameter containing:
tensor([-0.3237, -0.1547, -0.0128,  0.1222,  0.0238,  0.1943],
       requires_grad=True)

In [47]:
net.conv1.bias.grad

tensor([ 0.0097, -0.0107, -0.0066,  0.0141, -0.0073, -0.0014])

The parameters are updated and the gradients stay the same.