In [1]:
import torch                     # for all things PyTorch
import torch.nn as nn            # for torch.nn.Module, the parent object for PyTorch models
import torch.nn.functional as F  # for the activation function

## torch.nn.Module

### PyTorch base class meant to encapsulate behaviors specific to PyTorch Models and their components.


In [2]:
class TinyModel(torch.nn.Module):
    """Small two-layer fully connected classifier.

    Pipeline: Linear(100 -> 200) -> ReLU -> Linear(200 -> 10) -> Softmax.
    The input's last dimension must hold 100 features; the output has the
    same leading dimensions and 10 values along the last dimension that
    sum to 1.
    """

    def __init__(self):
        super(TinyModel, self).__init__()

        self.linear1 = torch.nn.Linear(100, 200) # 100 input features, 200 output features
        self.activation = torch.nn.ReLU()        # activation function
        self.linear2 = torch.nn.Linear(200, 10)  # 200 input features, 10 output features
        # Softmax over the last (class) dimension. Leaving `dim` unset is
        # deprecated: it emits a warning on every call and, for 3-D inputs,
        # normalizes over dim 0 instead of the class dimension. dim=-1 gives
        # the same result as the implicit choice for 1-D and 2-D inputs.
        self.softmax = torch.nn.Softmax(dim=-1)

    # Forward pass is where the computation happens
    # we first pass the input through the first linear layer, then apply the activation function
    # then pass it through the second linear layer and apply the softmax function
    # the output is the prediction
    def forward(self, x):
        """Return class probabilities for `x` (last dimension of size 100)."""
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x

# Instantiate the model defined above.
tinymodel = TinyModel()

# Printing a module lists the sub-modules it contains.
print('The model:', tinymodel, sep='\n')

# Individual layers are plain attributes and can be printed on their own.
print('\n\nJust one layer:', tinymodel.linear2, sep='\n')

# Iterating over the model's parameters() prints every learnable tensor it holds.
print('\n\nModel params:')
for param in tinymodel.parameters():
    print(param)

# The parameters are the weights and biases of the linear layers: they are
# initialized randomly and are what gets learned during training.
# Calling parameters() on a single layer restricts the listing to that layer.
print('\n\nLayer params:')
for param in tinymodel.linear2.parameters():
    print(param)

The model:
TinyModel(
  (linear1): Linear(in_features=100, out_features=200, bias=True)
  (activation): ReLU()
  (linear2): Linear(in_features=200, out_features=10, bias=True)
  (softmax): Softmax(dim=None)
)


Just one layer:
Linear(in_features=200, out_features=10, bias=True)


Model params:
Parameter containing:
tensor([[ 0.0636, -0.0642,  0.0836,  ...,  0.0769,  0.0700, -0.0863],
        [-0.0265,  0.0723, -0.0642,  ..., -0.0276, -0.0171, -0.0007],
        [-0.0558, -0.0938,  0.0804,  ..., -0.0170, -0.0043,  0.0638],
        ...,
        [ 0.0080,  0.0462,  0.0293,  ...,  0.0191,  0.0253, -0.0472],
        [ 0.0822,  0.0016, -0.0180,  ...,  0.0374,  0.0943,  0.0608],
        [-0.0666,  0.0545,  0.0786,  ...,  0.0687,  0.0679, -0.0440]],
       requires_grad=True)
Parameter containing:
tensor([-0.0062,  0.0739, -0.0933,  0.0564,  0.0409,  0.0467,  0.0459, -0.0539,
        -0.0150,  0.0497, -0.0374,  0.0742,  0.0079, -0.0923, -0.0961, -0.0704,
         0.0828, -0.0573,  0.0010, -0.06

## Common Layer Types


## Linear Layers

### The most basic type of neural network layer is a linear or fully connected layer. This is a layer where every input influences every output of the layer to a degree specified by the layer’s weights. If a model has m inputs and n outputs, the weights form an m x n matrix (PyTorch’s `nn.Linear` stores it transposed, with shape n x m, i.e. out_features x in_features).


In [3]:
# A tiny fully connected layer: 3 input features mapped to 2 output features.
lin = torch.nn.Linear(in_features=3, out_features=2)
x = torch.rand(1, 3)  # one sample with 3 features
print('Input:', x, sep='\n')

# The layer's weights and biases are initialized randomly.
print('\n\nWeight and Bias parameters:')
for param in lin.parameters():
    print(param)

# Calling the layer applies it to the input.
y = lin(x)
print('\n\nOutput:', y, sep='\n')

# Multiplying x by the linear layer's weights and adding the biases
# reproduces the output vector y.

Input:
tensor([[0.9332, 0.9424, 0.9149]])


Weight and Bias parameters:
Parameter containing:
tensor([[ 0.1983, -0.3024,  0.5127],
        [-0.0189,  0.4520,  0.3182]], requires_grad=True)
Parameter containing:
tensor([0.5734, 0.4034], requires_grad=True)


Output:
tensor([[0.9425, 1.1028]], grad_fn=<AddmmBackward0>)


## Convolutional Layers


### Convolutional layers are built to handle data with a high degree of spatial correlation. They are very commonly used in computer vision, where they detect close groupings of features which they compose into higher-level features. They pop up in other contexts too - for example, in NLP applications, where a word’s immediate context (that is, the other words nearby in the sequence) can affect the meaning of a sentence.


In [4]:
class LeNet(nn.Module):
    """LeNet-style convolutional network producing 10 raw (un-normalized) scores.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. `fc1` expects 16 * 5 * 5 features, which is
    what a 1-channel 32x32 image yields (32 -> 28 -> 14 -> 10 -> 5).
    """

    def __init__(self):
        super(LeNet, self).__init__()
        # 1 input image channel (black & white), 6 output channels, 5x5 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 5 * 5, 120)  # 5*5 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Map a batch of images `x` to a tensor of 10 scores per image."""
        # Max pooling over a (2, 2) window
        # The max pooling layer takes features near each other in the activation map and groups them together.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough to specify it
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Collapse everything except the batch dimension. Unlike
        # view(-1, n), flatten also handles an empty batch, where the -1
        # cannot be inferred from zero elements.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample in `x`.

        This is the product of all dimensions except the first (batch)
        dimension; it is 1 when `x` has no further dimensions.
        """
        return x.size()[1:].numel()

In [5]:
# Build the network and print it to see which layers it is made of.
net = LeNet()
print(net)

# Random stand-in for one 32x32 black & white image, with batch size 1 and 1 channel.
input = torch.rand(1, 1, 32, 32)
print('\nImage batch shape:', input.shape, sep='\n')

# Call the module itself rather than invoking forward() directly.
output = net(input)
print('\nRaw output:', output, output.shape, sep='\n')

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

Image batch shape:
torch.Size([1, 1, 32, 32])

Raw output:
tensor([[ 0.0284, -0.0938, -0.1077, -0.0768,  0.0210, -0.0295,  0.0396, -0.0268,
         -0.0914,  0.1199]], grad_fn=<AddmmBackward0>)
torch.Size([1, 10])


## Recurrent Layers

### Recurrent neural networks (or RNNs) are used for sequential data - anything from time-series measurements from a scientific instrument to natural language sentences to DNA nucleotides. An RNN does this by maintaining a hidden state that acts as a sort of memory for what it has seen in the sequence so far.


In [6]:
class LSTMTagger(torch.nn.Module):
    """Sequence tagger: embedding lookup -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: size of each word embedding vector.
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of distinct word indices the embedding table holds.
        tagset_size: number of tags scored for every word.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Lookup table turning word indices into dense vectors; embeddings are
        # numerical representations in which, e.g., words with similar
        # meanings sit close together.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )

        # Consumes the word embeddings and emits hidden states of size hidden_dim.
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)

        # Projects each hidden state from hidden-state space to tag space.
        self.hidden2tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence):
        """Return log-probabilities over the tag set, one row per entry of `sentence`."""
        seq_len = len(sentence)

        # Word indices -> embeddings, reshaped to (seq_len, 1, -1) for the LSTM.
        embeds = self.word_embeddings(sentence)
        lstm_in = embeds.view(seq_len, 1, -1)

        # Only the per-step outputs are needed; the final state is discarded.
        lstm_out, _ = self.lstm(lstm_in)

        # One row of tag scores per sequence position.
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))

        # Log-softmax across the tag dimension yields log-probabilities for each tag.
        return F.log_softmax(tag_space, dim=1)

## Transformers

### PyTorch’s `torch.nn.Transformer` class allows you to define the overall parameters of a transformer model - the number of attention heads, the number of encoder & decoder layers, dropout and activation functions, etc.


## Other layers

### Data Manipulation Layers

#### Max pooling (and its twin, min pooling) reduces a tensor by combining cells and assigning the maximum value of the input cells (the minimum, for min pooling) to the output cell.


In [7]:
# Random 6x6 grid of values with a leading dimension of size 1.
my_tensor = torch.rand(1, 6, 6)
print(my_tensor)

# A pooling window of size 3: each output cell is the maximum of a 3x3 block
# of input cells, so the 6x6 grid shrinks to 2x2.
maxpool_layer = torch.nn.MaxPool2d(kernel_size=3)
print(maxpool_layer(my_tensor))

tensor([[[0.8401, 0.6217, 0.3480, 0.3335, 0.1779, 0.5798],
         [0.5546, 0.5826, 0.5134, 0.3894, 0.1720, 0.8062],
         [0.5262, 0.5287, 0.7235, 0.1462, 0.0269, 0.5949],
         [0.4697, 0.5153, 0.3973, 0.7915, 0.5233, 0.7882],
         [0.0255, 0.6345, 0.3991, 0.1427, 0.9327, 0.3483],
         [0.6112, 0.4589, 0.5478, 0.2161, 0.3562, 0.0699]]])
tensor([[[0.8401, 0.8062],
         [0.6345, 0.9327]]])


### Normalization layers re-center and normalize the output of one layer before feeding it to another. Centering and scaling the intermediate tensors has a number of beneficial effects, such as letting you use higher learning rates without exploding/vanishing gradients.


In [8]:
# Random values scaled by 20 and shifted by 5, so they lie between 5 and 25
# and the tensor's mean is far from zero.
my_tensor = torch.rand(1, 4, 4) * 20 + 5
print(my_tensor, my_tensor.mean(), sep='\n')

# Pass the tensor through a batch-norm layer with 4 features and compare:
# the normalized tensor's mean comes out approximately zero.
norm_layer = torch.nn.BatchNorm1d(num_features=4)
normed_tensor = norm_layer(my_tensor)
print(normed_tensor, normed_tensor.mean(), sep='\n')

tensor([[[19.4567, 14.6704, 19.9919, 13.8026],
         [18.2412, 11.4850, 16.4616,  7.6896],
         [ 9.0689, 12.4020, 22.6494, 24.1114],
         [17.0753,  8.1510,  8.0968, 11.2819]]])
tensor(14.6647)
tensor([[[ 0.8948, -0.8347,  1.0882, -1.1483],
         [ 1.1484, -0.4775,  0.7201, -1.3909],
         [-1.2382, -0.7216,  0.8666,  1.0932],
         [ 1.6207, -0.8208, -0.8356,  0.0357]]],
       grad_fn=<NativeBatchNormBackward0>)
tensor(1.6391e-07, grad_fn=<MeanBackward0>)
