In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

In [2]:
# Report whether PyTorch can use a CUDA device in this environment
print(torch.cuda.is_available())

False


# Linear Layer ([Documentation](http://pytorch.org/docs/stable/generated/torch.nn.Linear.html))
``torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)``


Let us start with a single neuron having 10 inputs and just 1 output, but no bias

## Single Neuron

We create this neuron as a linear layer with 10 input features, 1 output feature, and ``bias=False``.

In [3]:
# One neuron: 10 input features, 1 output feature, no bias term
single_neuron = nn.Linear(in_features=10, out_features=1, bias=False)

PyTorch automatically initializes the weights of all layers. We can inspect them through the layer's ``weight`` and ``bias`` attributes:

In [4]:
# Show the weight, then the bias (None, because the layer was built with bias=False)
print(single_neuron.weight, single_neuron.bias, sep="\n")

Parameter containing:
tensor([[ 0.2796,  0.0697, -0.0067, -0.2011,  0.2039, -0.1499, -0.1548, -0.0609,
         -0.2994,  0.0562]], requires_grad=True)
None


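Without a bias, an all-zero input always evaluates to 0. Let us verify this by creating suitable input tensors of shape ``(b, input_features) == (1, 10)``, with ``b`` being the batch size and ``input_features`` being the number of input features. We also create an all-ones input and a copy of it scaled by 1000: for these, the output is the sum of the weights (times the scale).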

In [37]:
# Three (batch=1, features=10) inputs: all zeros, all ones, and all ones scaled by 1000
input_0 = torch.zeros(1, 10)
input_1 = torch.ones(1, 10)
input_1000 = torch.ones(1, 10) * 1000

In [38]:
# Evaluate the neuron on each input; one result is printed per line
print(
    single_neuron(input_0),
    single_neuron(input_1),
    single_neuron(input_1000),
    sep="\n",
)

tensor([[0.]], grad_fn=<MmBackward0>)
tensor([[0.3221]], grad_fn=<MmBackward0>)
tensor([[322.0690]], grad_fn=<MmBackward0>)


In [39]:
# The all-ones input should produce exactly the sum of the weights
print(single_neuron.weight.sum())

tensor(0.3221, grad_fn=<SumBackward0>)


## Linear Layer with 10 input features and 5 output features

In [40]:
# Fully connected layer: 10 input features -> 5 output features, with a bias vector
linear_layer = nn.Linear(in_features=10, out_features=5, bias=True)

In [41]:
# Random input with batch size 1 and 10 features
linear_input = torch.rand(1, 10)

In [42]:
# Apply the layer to the (1, 10) input; the result holds one value per output feature
linear_layer(linear_input)

tensor([[0.6386, 0.6646, 0.1082, 0.4698, 0.2296]], grad_fn=<AddmmBackward0>)

A linear layer applies a linear transformation of the form $y = xA^T + b$ to its input tensor, where $A$ is the layer's weight matrix and $b$ its bias. This can easily be verified by evaluating the equation manually:

In [43]:
# Manual evaluation of y = x A^T + b: (1, 10) @ (10, 5) + (5,) -> (1, 5)
torch.mm(linear_input, linear_layer.weight.T)+linear_layer.bias

tensor([[0.6386, 0.6646, 0.1082, 0.4698, 0.2296]], grad_fn=<AddBackward0>)

In [44]:
# Easy to produce errors with the wrong layout.
# Shapes involved: weight is (5, 10), input is (1, 10), bias is (5,).
print(linear_layer.bias.shape)

# Pitfall 1: transposing the 1-D bias changes nothing, so the (5,) bias is broadcast
# across the columns of the (5, 1) product. This runs, but silently yields a (5, 5) matrix.
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias.T)

# Pitfall 2: the same silent broadcast to (5, 5), without the transpose.
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias)

# Correct column layout: reshaping the bias to (5, 1) matches the (5, 1) product.
print(torch.mm(linear_layer.weight, linear_input.T) + linear_layer.bias.reshape(5, 1))

# Pitfall 3: forgetting to transpose the input makes the matrix shapes incompatible.
# The error is the point of the demonstration, so catch and display it instead of
# letting it abort a "Restart & Run All".
try:
    print(torch.mm(linear_layer.weight, linear_input) + linear_layer.bias)
except RuntimeError as err:
    print(f"RuntimeError: {err}")



torch.Size([5])
tensor([[0.6386, 0.5418, 0.2708, 0.4533, 0.4407],
        [0.7613, 0.6646, 0.3936, 0.5761, 0.5634],
        [0.4759, 0.3791, 0.1082, 0.2906, 0.2780],
        [0.6551, 0.5583, 0.2873, 0.4698, 0.4572],
        [0.4275, 0.3307, 0.0598, 0.2423, 0.2296]], grad_fn=<AddBackward0>)
tensor([[0.6386, 0.5418, 0.2708, 0.4533, 0.4407],
        [0.7613, 0.6646, 0.3936, 0.5761, 0.5634],
        [0.4759, 0.3791, 0.1082, 0.2906, 0.2780],
        [0.6551, 0.5583, 0.2873, 0.4698, 0.4572],
        [0.4275, 0.3307, 0.0598, 0.2423, 0.2296]], grad_fn=<AddBackward0>)
tensor([[0.6386],
        [0.6646],
        [0.1082],
        [0.4698],
        [0.2296]], grad_fn=<AddBackward0>)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (5x10 and 1x10)

In [45]:
# Cross-check the column-layout result with plain NumPy arrays
(
    linear_layer.weight.detach().numpy() @ linear_input.detach().numpy().T
    + linear_layer.bias.detach().numpy().reshape(5, 1)
)

array([[0.6385689 ],
       [0.664554  ],
       [0.10815333],
       [0.46982756],
       [0.22959767]], dtype=float32)

## Multi Layer Perceptron (MLP)

A Neural Network consisting of more than one linear layer (also referred to as Fully Connected Layer) is also called a Multi Layer Perceptron (MLP).

In [46]:
# often a neural network is defined as object derived from "nn.Module"
# we would like to define an "__init__" and "forward" function

class ThreeLayerMLP(nn.Module):
    """Three fully connected layers applied one after another.

    No activation function is placed between the layers.

    Args:
        input_features: number of features of each input sample.
        hidden_features: number of features produced by the first two layers.
        output_features: number of features of each output sample.
    """

    def __init__(self, input_features, hidden_features, output_features):
        super().__init__()
        # Layers are created in input -> output order.
        self.layer_0 = nn.Linear(input_features, hidden_features)
        self.layer_1 = nn.Linear(hidden_features, hidden_features)
        self.layer_2 = nn.Linear(hidden_features, output_features)

    def forward(self, x):
        """Pass ``x`` through ``layer_0``, ``layer_1`` and ``layer_2`` in turn."""
        for layer in (self.layer_0, self.layer_1, self.layer_2):
            x = layer(x)
        return x

In [47]:
# 10 input features, 5 hidden features, 1 output feature
my_mlp = ThreeLayerMLP(input_features=10, hidden_features=5, output_features=1)

In [48]:
# Random input with batch size 1 and 10 features
mlp_input = torch.rand(1, 10)

In [49]:
my_mlp(mlp_input) # calling the module goes through nn.Module.__call__, which runs forward() (plus any registered hooks); multi-GPU distribution would need a wrapper such as nn.DataParallel

tensor([[-0.2671]], grad_fn=<AddmmBackward0>)

In [50]:
# add activation functions
# 

class ThreeLayerMLPNew(nn.Module):
    """Three fully connected layers, each followed by a ReLU activation.

    The ReLU is also applied after the last layer, so the output is never negative.

    Args:
        input_features: number of features of each input sample.
        hidden_features: number of features produced by the first two layers.
        output_features: number of features of each output sample.
    """

    def __init__(self, input_features, hidden_features, output_features):
        super().__init__()
        self.layer_0 = nn.Linear(input_features, hidden_features)
        self.layer_1 = nn.Linear(hidden_features, hidden_features)
        self.layer_2 = nn.Linear(hidden_features, output_features)
        # A single ReLU instance is reused after every layer. This is a matter of
        # coding style; defining one activation module per use is the alternative.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply linear layer -> ReLU three times and return the result."""
        x = self.relu(self.layer_0(x))
        x = self.relu(self.layer_1(x))
        return self.relu(self.layer_2(x))

In [51]:
# Re-run this cell several times: the final ReLU keeps the output from ever dropping below 0

my_mlpnew = ThreeLayerMLPNew(input_features=3, hidden_features=3, output_features=1)
mlp_input = torch.rand(1, 3)
my_mlpnew(mlp_input)

tensor([[0.0154]], grad_fn=<ReluBackward0>)

# Conv2d Layer ([Documentation](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html))
``torch.nn.Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1, bias=True, padding_mode='zeros', device=None, dtype=None)``

In [52]:
# Layout is (batch_size, num_channels, height, width) == (1, 3, 9, 9)
conv_input = torch.randn((1, 3, 9, 9))

In [53]:
# Square 3x3 kernel with stride 1; padding 1 keeps the spatial size unchanged
m = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=1, padding=1)

In [54]:
# Run the convolution, then show the result's shape followed by its values
output = m(conv_input)
print(output.shape, output, sep="\n")

torch.Size([1, 1, 9, 9])
tensor([[[[-0.0688,  1.0080, -0.1952, -0.4213,  1.2057,  0.0352,  0.3721,
           -0.7106,  0.6480],
          [ 0.3263, -1.1588,  0.7259, -0.0271,  1.2648, -0.8364,  0.0496,
            1.6218, -0.6744],
          [-0.2796,  0.4528, -0.0030,  0.3925,  0.7096, -0.0309,  0.0199,
            0.3052,  0.7400],
          [ 0.2252,  0.1637, -0.1927,  0.8032,  0.8487, -0.1268, -0.1083,
           -0.5677, -0.1207],
          [-0.1011, -0.3054,  1.5335,  0.0322, -0.6163,  0.1893,  0.5967,
           -0.3229, -0.3054],
          [ 0.3376,  0.0649, -0.5177,  0.4741,  0.7551, -0.4433, -0.2757,
            0.1929,  0.1562],
          [ 0.8507, -0.1239,  0.6604,  0.1699, -1.7184,  0.3322,  0.9810,
            0.6598, -0.1032],
          [ 0.4704,  1.1027,  0.0681, -0.8049, -0.2732,  0.2082,  0.4457,
           -0.2880, -0.4796],
          [ 0.2129, -0.7431, -1.0015,  0.9351,  0.2950,  0.4925,  0.3716,
            0.2472,  0.0209]]]], grad_fn=<ConvolutionBackward0>)


In [55]:
# Square 3x3 kernel with stride 2 and padding 1: a 9x9 input becomes 5x5
m = nn.Conv2d(in_channels=3, out_channels=1, kernel_size=3, stride=2, padding=1)

In [56]:
# Run the strided convolution, then show the result's shape followed by its values
output = m(conv_input)
print(output.shape, output, sep="\n")

torch.Size([1, 1, 5, 5])
tensor([[[[ 0.7016, -0.4116,  0.5285, -0.2563, -0.1253],
          [ 0.2424,  0.0719,  0.1447,  0.5472, -0.0517],
          [-0.5323,  0.9849, -0.1615,  0.7256,  0.7983],
          [ 0.1683,  1.2420, -1.2064,  1.2024,  0.5533],
          [ 0.0993, -0.6590,  0.4505,  0.6603,  0.5297]]]],
       grad_fn=<ConvolutionBackward0>)


## Convolutional Neural Network

A neural network that consists only of convolutional layers is referred to as a (fully) convolutional neural network. Its big advantage is that the input size can vary: the input only needs the right number of channels, while the spatial height and width are free.

In [57]:
class ConvNet(nn.Module):
    """Three 3x3 convolutions (stride 1, padding 1), each followed by its own ReLU.

    With this kernel size, stride and padding, every convolution keeps the spatial
    height and width of its input; only the channel count changes.

    Args:
        input_channels: number of channels of the input tensor.
        hidden_channels: number of channels produced by the first two convolutions.
        output_channels: number of channels of the output tensor.
    """

    def __init__(self, input_channels, hidden_channels, output_channels):
        super().__init__()
        # Settings shared by all three convolutions.
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)

        self.layer_0 = nn.Conv2d(input_channels, hidden_channels, **conv_kwargs)
        self.relu0 = nn.ReLU()
        self.layer_1 = nn.Conv2d(hidden_channels, hidden_channels, **conv_kwargs)
        self.relu1 = nn.ReLU()
        self.layer_2 = nn.Conv2d(hidden_channels, output_channels, **conv_kwargs)
        self.relu2 = nn.ReLU()

    def forward(self, x):
        """Apply convolution -> ReLU for each of the three stages, in order."""
        stages = (
            (self.layer_0, self.relu0),
            (self.layer_1, self.relu1),
            (self.layer_2, self.relu2),
        )
        for conv, activation in stages:
            x = activation(conv(x))
        return x

In [58]:
# Instantiate the ConvNet defined in this section (3 input, 3 hidden, 3 output channels)
# rather than a single nn.Conv2d layer, so the cells that follow exercise the full network.
conv_net = ConvNet(input_channels=3, hidden_channels=3, output_channels=3)

In [59]:
# Two inputs that differ only in spatial size; layout is (batch_size, num_channels, height, width)
conv_input_small = torch.rand(1, 3, 16, 16)  # 16x16 image
conv_input_large = torch.rand(1, 3, 32, 32)  # 32x32 image

In [60]:
# The same network accepts both resolutions; print one output shape per line
print(
    conv_net(conv_input_small).shape,
    conv_net(conv_input_large).shape,
    sep="\n",
)

torch.Size([1, 3, 16, 16])
torch.Size([1, 3, 32, 32])


## Conv-Net with FC Layer

In practice, neural networks often combine different layer types such as Conv2d and Linear. A linear layer is often used at the very end of a classifier network to map the features to the right number of image classes. Such a network can therefore handle only one particular input image resolution. Let us define a classifier that takes RGB images (3 channels) of size 32x32 as input.

In [61]:
class Classifier(nn.Module):
    """Convolutional feature extractor followed by one fully connected output layer.

    The fully connected layer is sized for one fixed input resolution, so the
    classifier only accepts images of ``input_height`` x ``input_width``.

    Args:
        input_channels: number of channels of the input images.
        hidden_channels: number of channels used inside, and produced by, the
            convolutional part.
        num_output_classes: number of values in the output, one per class.
        input_height: height of the input images (default 32).
        input_width: width of the input images (default 32).
    """

    def __init__(self, input_channels, hidden_channels, num_output_classes,
                 input_height=32, input_width=32):
        super(Classifier, self).__init__()

        self.conv_part = ConvNet(input_channels, hidden_channels, hidden_channels)

        # conv_part returns `hidden_channels` feature maps at the input resolution,
        # so the flattened size must be derived from hidden_channels. A hard-coded
        # 3 * 32 * 32 only works when hidden_channels happens to be 3.
        flattened_features = hidden_channels * input_height * input_width
        self.fc_layer = nn.Linear(in_features=flattened_features, out_features=num_output_classes)

    def forward(self, x):
        """Return a ``(batch_size, num_output_classes)`` tensor for a batch of images."""
        x = self.conv_part(x)
        # Collapse channels, height and width into one feature vector per sample.
        x = torch.flatten(x, start_dim=1)
        x = self.fc_layer(x)
        return x

In [62]:
# 3-channel input, 3 hidden channels, 10 output classes; the input is a single 32x32 RGB image
classifier = Classifier(input_channels=3, hidden_channels=3, num_output_classes=10)
good_input = torch.rand(1, 3, 32, 32)

In [63]:
# The output holds one value per class for the single input image
print(classifier(good_input).shape)

torch.Size([1, 10])


In [64]:
bad_input = torch.rand((1, 3, 16, 16))
# This does not work because the Classifier contains a fully-connected layer:
# a 16x16 image flattens to 3 * 16 * 16 = 768 features, but fc_layer expects 3072.
# The error is the point of the demonstration, so catch and display it instead of
# letting it abort a "Restart & Run All".
try:
    print(classifier(bad_input).shape)
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x768 and 3072x10)

# TODO
* Randomly initialise the weights
* Implement forward propagation to get the output $y$ for any input $x$
* Implement backprop to compute the partial derivatives of the cost function with respect to the weights
* For all the samples, perform forward propagation and
backpropagation
* Use a numerical estimate of the gradient to check the gradient
calculation; disable the check afterwards
* Use gradient descent or an advanced optimization method with
backpropagation to try to minimize the cost function