# Building Neural Networks
## Importing libraries

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# What device for training?
Use a GPU (CUDA) or Apple's MPS backend if one is available; otherwise fall back to the CPU. The `device` variable is set accordingly.

In [3]:
# Pick the best available backend: CUDA first, then Apple's MPS,
# and finally the CPU when no accelerator is present.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


# Defining the nn class
Initialise the neural network layers in `__init__`. Every `nn.Module` subclass implements the operations on input data in the `forward` method.

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> two hidden ReLU layers -> logits.

    With the default arguments this is the 28x28-input, 512-unit, 10-class
    network used throughout the notebook; the sizes are parameters so the
    same class can be reused for other input shapes or class counts.

    Args:
        in_features: Number of values per sample once flattened.
        hidden_features: Width of each of the two hidden layers.
        num_classes: Number of output logits (one per class).
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Collapses every dimension except the leading batch dimension.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Flatten ``x`` per sample and return the raw class scores (logits).

        No softmax is applied here; callers normalise the logits themselves.
        """
        x = self.flatten(x)
        return self.linear_relu_stack(x)

Create an instance of the class, move it to the device, and print its structure.

In [5]:
# Build the network first, then move its parameters to the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


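As a test, call the model on some random input data. The model returns raw logits, which are passed through `nn.Softmax` to get the predicted probabilities; the predicted class is the one with the highest probability.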

In [6]:
# One random 28x28 "image" (batch of 1), created directly on the target device.
X = torch.rand((1, 28, 28), device=device)
logits = model(X)
# Normalise the logits over the class dimension, then pick the likeliest class.
pred_probab = nn.functional.softmax(logits, dim=1)
y_pred = pred_probab.argmax(dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([1])


# Layers
Break down the layers in the FashionMNIST model. To illustrate, take a sample minibatch of 3 images of size $28 \times 28$ and see what happens to it as it passes through each layer.

In [6]:
# A minibatch of three random 28x28 "images".
input_image = torch.rand((3, 28, 28))
print(input_image.shape)

torch.Size([3, 28, 28])


# `nn.Flatten`
The `nn.Flatten` layer converts each 2D 28x28 image into a contiguous array of 784 pixel values
(the minibatch dimension, at dim = 0, is maintained).

In [7]:
# Defaults spelled out: keep dim 0 (the minibatch) and collapse everything after it.
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


# `nn.Linear`
Applies a linear transformation to the input using its stored weights and biases.

In [8]:
# Map each flattened 784-value image to 20 hidden features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


# `nn.ReLU`
Non-linear activations are what create the complex mappings between the model's inputs and outputs. They are applied after linear transformations to introduce nonlinearity.
NB: we use `nn.ReLU` between the linear layers.

In [9]:
print(f"Before ReLU: {hidden1}\n\n")
# Functional form of ReLU: negative activations become 0, the rest pass through.
hidden1 = nn.functional.relu(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.3271,  0.6551, -0.0294,  0.2055,  0.7999, -0.1869,  0.1115, -0.0953,
          0.2291,  0.0556,  0.0776, -0.1421,  0.2513, -0.3737,  0.5474, -0.1280,
         -0.3800, -0.3561,  0.0320, -0.2831],
        [ 0.0485,  0.4195,  0.1331,  0.5253,  0.4441,  0.2325, -0.1172, -0.2073,
          0.0944,  0.2236, -0.0412, -0.3232,  0.0824, -0.0556,  0.1232, -0.0487,
         -0.4097,  0.1007,  0.2564, -0.6745],
        [ 0.2458,  0.1071,  0.3514,  0.1873,  0.5955,  0.1518,  0.1190, -0.0096,
         -0.0644, -0.0054,  0.2538, -0.3848,  0.2111, -0.0580,  0.0679, -0.2022,
         -0.2319,  0.0022,  0.1452, -0.5868]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.3271, 0.6551, 0.0000, 0.2055, 0.7999, 0.0000, 0.1115, 0.0000, 0.2291,
         0.0556, 0.0776, 0.0000, 0.2513, 0.0000, 0.5474, 0.0000, 0.0000, 0.0000,
         0.0320, 0.0000],
        [0.0485, 0.4195, 0.1331, 0.5253, 0.4441, 0.2325, 0.0000, 0.0000, 0.0944,
         0.2236, 0.0000, 0.0000, 0.0824, 0.0000, 0.12

# `nn.Sequential`
An ordered container of modules. The data is passed through all the modules in the order in which they are defined. Sequential containers are a quick way to put together a network such as `seq_modules`.

In [10]:
# Chain the layers built in the earlier cells with a fresh ReLU and a
# final 20 -> 10 linear layer that produces one logit per class.
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(in_features=20, out_features=10),
)
# New random minibatch of three 28x28 images, pushed through the whole stack.
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)

# `nn.Softmax`
The last layer returns logits: raw values in $(-\infty, \infty)$. These are passed to the `nn.Softmax` module, which scales them to values in $[0,1]$ representing the model's predicted probabilities for each class. The `dim` parameter indicates the dimension along which the values must sum to 1.

In [11]:
# Softmax over dim=1, so the values along that dimension sum to 1 for each
# sample in the minibatch (dim 0 is the minibatch dimension).
softmax = nn.Softmax(dim=1)
# `logits` is the output of `seq_modules` from the previous cell.
pred_probab = softmax(logits)
print(pred_probab)

tensor([[0.0957, 0.1340, 0.0760, 0.0952, 0.1268, 0.1045, 0.1037, 0.0930, 0.1001,
         0.0710],
        [0.0994, 0.1463, 0.0690, 0.1025, 0.1261, 0.0977, 0.0820, 0.1054, 0.1022,
         0.0694],
        [0.0878, 0.1439, 0.0793, 0.0925, 0.1285, 0.1042, 0.0861, 0.1084, 0.1002,
         0.0691]], grad_fn=<SoftmaxBackward0>)


# Model Parameters
Many layers inside a neural network are parameterized, i.e. they have associated weights and biases. Subclassing `nn.Module` automatically tracks all fields defined inside the model object and makes all parameters accessible through the model's `parameters()` or `named_parameters()` methods.

In [12]:
# Show the module tree of `model`, followed by two blank lines.
print(f"Model structure: {model}\n\n")

# Walk every registered parameter as a (name, tensor) pair and print its
# name, its size, and a preview limited to the first two entries along dim 0.
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values: {param[:2]}")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values: tensor([[ 0.0325, -0.0293, -0.0028,  ...,  0.0166, -0.0259, -0.0345],
        [-0.0055, -0.0312, -0.0004,  ...,  0.0318,  0.0341, -0.0024]],
       device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values: tensor([-0.0100, -0.0237], device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values: tensor([[-0.0078,  0.0349, -0.0391,  ..., -0.0145,  0.0326,  0.0190],
        [ 0.0028, -0.0051,  0.0187,  ...,  0.0012, -0.0079,  0.0357]],
       device='cuda:0', grad_fn=<SliceBack