In [2]:
import os
import torch

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Seed PyTorch's random number generator so the randomly initialised weights
# and the torch.rand inputs used below are repeatable after
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# Prefer an available accelerator (e.g. CUDA) and fall back to the CPU.
# torch.accelerator is only present in recent PyTorch releases; on older
# versions use the CUDA check instead of failing with AttributeError.
if hasattr(torch, 'accelerator'):
    device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else 'cpu'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

Using cuda device


### Define neural network

The neural network is defined by subclassing `nn.Module`. Every `nn.Module` subclass implements the operations on input data in the `forward` method. The `__init__` method initializes the neural network layers.

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> Linear/ReLU -> Linear/ReLU -> Linear.

    With the default arguments the network is identical to the hard-coded
    28x28-image / 10-class version (784 -> 512 -> 512 -> 10).

    Args:
        in_features: number of values in one flattened sample (default 28 * 28).
        hidden_features: width of both hidden layers (default 512).
        num_classes: number of logits produced per sample (default 10).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Keeps dimension 0 (the batch) and flattens the remaining dimensions,
        # e.g. a (N, 28, 28) batch becomes (N, 784).
        self.flatten = nn.Flatten()
        # Ordered container: the output of each module feeds the next one.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=hidden_features),  # linear transform using stored weights and biases
            nn.ReLU(),  # non-linear activation function
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return the raw class scores (logits) for a batch of inputs.

        The input is flattened per sample and passed through the linear/ReLU
        stack; no softmax is applied here.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
    
# Create the network and move its parameters to the selected device.
model = NeuralNetwork().to(device)
model  # bare last expression: the notebook displays the module structure

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

### Using the model

In [6]:
# One random 28x28 "image" (a batch of size 1), created directly on the model's device.
X = torch.rand(1, 28, 28, device=device)
logits = model(X)

# Normalise the logits over the class axis (dim=1), then take the index of the
# largest probability as the predicted class.
pred_values = nn.functional.softmax(logits, dim=1)
y_pred = torch.argmax(pred_values, dim=1)

print('Predicted class: ', y_pred.item())

Predicted class:  3


### Layer by layer

 We define a minibatch of 3 images of size 28*28:

In [15]:
# Minibatch of 3 random 28x28 "images"; dimension 0 is the batch axis.
input_image = torch.rand((3, 28, 28))
print('Input image size: ', input_image.shape)

Input image size:  torch.Size([3, 28, 28])


`Flatten` converts the image to an array of 784 pixel values. The minibatch dimension is maintained:

In [20]:
# With default arguments nn.Flatten leaves dimension 0 (the batch) alone and
# merges the remaining dimensions, so (3, 28, 28) becomes (3, 784).
flatten = nn.Flatten()
flat_image = flatten(input_image)
print('Flat image size: ', flat_image.shape)

Flat image size:  torch.Size([3, 784])


`Linear` applies a linear transformation on the input using its stored weights and biases:

In [23]:
# Map each flattened 784-value sample to 20 features; the layer owns the
# weights and biases it applies.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print('Size after linear transformation: ', hidden1.shape)

Size after linear transformation:  torch.Size([3, 20])


`ReLU` applies a non-linear transformation:

In [24]:
# Take the pre-activation values straight from layer1 rather than from
# `hidden1`, which this cell overwrites. That keeps the "Before ReLU" printout
# correct when the cell is run more than once.
pre_activation = layer1(flat_image)
print('Before ReLU: ', pre_activation)
# ReLU replaces every negative entry with 0 and leaves the rest unchanged.
hidden1 = nn.ReLU()(pre_activation)
print('After ReLU: ', hidden1)

Before ReLU:  tensor([[-0.3923, -0.1192,  0.1887, -0.1081, -0.3248, -0.0498,  0.1780,  0.6743,
         -0.9071,  0.7844, -0.3542, -0.1603, -0.0810, -0.0984, -0.1905,  0.1194,
         -0.2885,  0.7343,  0.1375, -0.2281],
        [-0.4743, -0.1854,  0.0558, -0.5121, -0.2195,  0.0453,  0.1522, -0.0744,
         -0.9550,  0.4418, -0.4480, -0.3307, -0.3327, -0.0805, -0.3292, -0.2323,
         -0.2693,  0.5589, -0.0204, -0.1819],
        [-0.5580, -0.4201,  0.1537, -0.1968, -0.2932,  0.1267, -0.0822,  0.4221,
         -0.6883,  0.5122, -0.5465, -0.2546, -0.1018, -0.0270, -0.4054, -0.2970,
         -0.5301,  0.6463, -0.1101, -0.1448]], grad_fn=<AddmmBackward0>)
After ReLU:  tensor([[0.0000, 0.0000, 0.1887, 0.0000, 0.0000, 0.0000, 0.1780, 0.6743, 0.0000,
         0.7844, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1194, 0.0000, 0.7343,
         0.1375, 0.0000],
        [0.0000, 0.0000, 0.0558, 0.0000, 0.0000, 0.0453, 0.1522, 0.0000, 0.0000,
         0.4418, 0.0000, 0.0000, 0.0000, 0.0000, 0.00

`Sequential` is an ordered container of modules:

In [26]:
# Data flows through the modules in the order given: the flatten and linear
# stages built above, then a fresh activation and a 20 -> 10 output layer.
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(in_features=20, out_features=10))

# New random minibatch of 3 images, pushed through the whole pipeline at once.
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)
logits

tensor([[ 0.1844,  0.1351,  0.3032, -0.0150,  0.0045, -0.0107,  0.0497, -0.4107,
          0.3296, -0.0487],
        [ 0.1987,  0.1266,  0.3192, -0.0276, -0.0069, -0.0661,  0.0207, -0.3959,
          0.3263, -0.0914],
        [ 0.1601,  0.0806,  0.2718,  0.0573,  0.0143, -0.0337,  0.0384, -0.4255,
          0.2913, -0.0039]], grad_fn=<AddmmBackward0>)

The last linear layer of the network returns logits (raw values in [-infinity, infinity]). `Softmax` scales these to [0, 1], representing the "model's predicted probabilities" for each class.

In [None]:
# dim=1 normalises across the class axis, so the values of each row
# (one row per image) sum to 1.
softmax = nn.Softmax(dim=1)
pred_values = softmax(logits)
pred_values # each row sums to 1

tensor([[0.1120, 0.1066, 0.1261, 0.0917, 0.0936, 0.0921, 0.0979, 0.0618, 0.1295,
         0.0887],
        [0.1148, 0.1068, 0.1295, 0.0916, 0.0935, 0.0881, 0.0961, 0.0633, 0.1304,
         0.0859],
        [0.1103, 0.1019, 0.1234, 0.0996, 0.0954, 0.0909, 0.0977, 0.0614, 0.1258,
         0.0936]], grad_fn=<SoftmaxBackward0>)

`argmax` returns the index of the greatest value for each image, i.e. the predicted class:

In [32]:
# Index of the largest probability in each row, i.e. the predicted class per image.
pred_values.argmax(dim=1)

tensor([8, 8, 8])

### Model parameters

We can check the model parameters and structure:

In [34]:
# Display the module tree: every submodule together with its configuration.
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [36]:
# Walk over every learnable tensor (weights and biases) of the model and show
# its qualified name, its size, and only its first two entries along dim 0.
for layer_name, tensor in model.named_parameters():
    size, preview = tensor.size(), tensor[:2]
    print(f'Layer : {layer_name} | Size : {size} | Values : {preview}\n')

Layer : linear_relu_stack.0.weight | Size : torch.Size([512, 784]) | Values : tensor([[-0.0225, -0.0345, -0.0100,  ..., -0.0295,  0.0084,  0.0331],
        [ 0.0316,  0.0332, -0.0155,  ..., -0.0191,  0.0252,  0.0142]],
       device='cuda:0', grad_fn=<SliceBackward0>)

Layer : linear_relu_stack.0.bias | Size : torch.Size([512]) | Values : tensor([-0.0192, -0.0015], device='cuda:0', grad_fn=<SliceBackward0>)

Layer : linear_relu_stack.2.weight | Size : torch.Size([512, 512]) | Values : tensor([[-0.0352,  0.0106, -0.0177,  ...,  0.0342,  0.0129, -0.0091],
        [ 0.0375, -0.0296, -0.0333,  ..., -0.0047,  0.0271, -0.0410]],
       device='cuda:0', grad_fn=<SliceBackward0>)

Layer : linear_relu_stack.2.bias | Size : torch.Size([512]) | Values : tensor([ 0.0187, -0.0039], device='cuda:0', grad_fn=<SliceBackward0>)

Layer : linear_relu_stack.4.weight | Size : torch.Size([10, 512]) | Values : tensor([[-0.0054,  0.0228, -0.0079,  ...,  0.0115,  0.0318, -0.0103],
        [ 0.0312, -0.0431, -0