# Building Neural Networks
## Importing libraries

In [2]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# What device for training?
Use the GPU (CUDA) or MPS if either is available; otherwise fall back to the CPU. The `device` variable is set accordingly.

In [4]:
# Pick the first available backend in priority order: CUDA, then MPS,
# and finally plain CPU when no accelerator is found.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


# Defining the nn class
Initialise the neural network layers in `__init__`. Every `nn.Module` subclass implements the operations on input data in the `forward` method.

In [19]:
class NeuralNetwork(nn.Module):
    """Small fully connected classifier: flatten, two hidden ReLU layers, linear output.

    With the default arguments the network is identical to the original
    hard-coded version (784 -> 512 -> 512 -> 10), so ``NeuralNetwork()``
    keeps working unchanged.

    Args:
        in_features: Number of values per sample once flattened.
            Defaults to ``28 * 28`` (one 28x28 image).
        hidden_features: Width of both hidden layers. Defaults to 512.
        num_classes: Number of logits produced per sample. Defaults to 10.
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden_features: int = 512,
        num_classes: int = 10,
    ) -> None:
        super().__init__()
        # Flattens everything after the batch dimension into one axis.
        self.flatten = nn.Flatten()
        # Attribute name and layer order are kept so parameter names
        # (e.g. ``linear_relu_stack.0.weight``) stay the same.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Flatten ``x`` per sample and return the raw logits.

        No softmax is applied here; the output is unnormalised scores with
        one row per sample and ``num_classes`` columns.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

Create an instance of the class, move it to the device, and print its structure.

In [20]:
# Build the network, then move its parameters onto the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


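As a test, call the model with some random input data. The model returns raw logits; passing them through `nn.Softmax` gives the predicted class probabilities.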

In [22]:
# A single random 28x28 input, created directly on the target device.
X = torch.rand((1, 28, 28), device=device)
logits = model(X)

# Normalise the logits over the class dimension, then take the index of
# the most probable class for each sample.
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = torch.argmax(pred_probab, dim=1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([5], device='cuda:0')


# Layers
Let's break down the layers in the FashionMNIST model. To illustrate, we take a sample minibatch of 3 images of size $28 \times 28$ and see what happens to it as it passes through the network.

In [24]:
# Random stand-in for a minibatch of three 28x28 images.
input_image = torch.rand((3, 28, 28))
print(input_image.shape)

torch.Size([3, 28, 28])


# `nn.Flatten`
The `nn.Flatten` layer converts each 2D 28x28 image into a contiguous array of 784 pixel values
(the minibatch dimension, at `dim=0`, is maintained).

In [25]:
# Collapse every dimension after the batch axis into a single axis.
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(flat_image.shape)

torch.Size([3, 784])


# `nn.Linear`
Applies a linear transformation to the input using its stored weights and biases.

In [26]:
# Map each flattened 784-value image to 20 hidden features.
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_image)
print(hidden1.shape)

torch.Size([3, 20])


# `nn.ReLU`
Non-linear activations are what create the complex mappings between the model's inputs and outputs. They are applied after linear transformations to introduce nonlinearity.
NB: we use `nn.ReLU` between the linear layers.

In [27]:
# Show the activations before and after ReLU: negative entries become zero.
print(f"Before ReLU: {hidden1}\n\n")
activation = nn.ReLU()
hidden1 = activation(hidden1)
print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.5523,  0.0487,  0.6696,  0.0483, -0.2111, -0.4048,  0.1154,  0.2000,
         -0.2358,  0.0392,  0.9795,  0.2493, -0.4088,  0.0634, -0.1928,  0.1731,
          0.3302,  0.2490,  0.0813,  0.3022],
        [ 0.8684,  0.0963,  0.7463,  0.2036, -0.0080, -0.1548,  0.1826,  0.2404,
         -0.2113, -0.3431,  0.9179, -0.3393, -0.3398,  0.1019, -0.0288, -0.1276,
          0.4030,  0.5859, -0.3332,  0.3952],
        [ 0.6744,  0.2774,  0.3268, -0.2159,  0.0605, -0.0430,  0.4705,  0.1292,
         -0.2732, -0.0534,  1.1249, -0.1643,  0.0082,  0.0472,  0.0759, -0.1230,
          0.6870,  0.3086,  0.0776,  0.1862]], grad_fn=<AddmmBackward0>)


After ReLU: tensor([[0.5523, 0.0487, 0.6696, 0.0483, 0.0000, 0.0000, 0.1154, 0.2000, 0.0000,
         0.0392, 0.9795, 0.2493, 0.0000, 0.0634, 0.0000, 0.1731, 0.3302, 0.2490,
         0.0813, 0.3022],
        [0.8684, 0.0963, 0.7463, 0.2036, 0.0000, 0.0000, 0.1826, 0.2404, 0.0000,
         0.0000, 0.9179, 0.0000, 0.0000, 0.1019, 0.00

# `nn.Sequential`
An ordered container of modules. The data is passed through all the modules in the order in which they are defined. Sequential containers are handy for putting together a quick network like `seq_modules`.

In [30]:
# Chain the layers built earlier with a fresh 20 -> 10 output layer.
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))

# Run a new random minibatch of three 28x28 inputs through the whole stack.
input_image = torch.rand((3, 28, 28))
logits = seq_modules(input_image)

# `nn.Softmax`
The last layer returns logits, raw values in $(-\infty, \infty)$. These are passed to the `nn.Softmax` module, which scales them to values in $[0, 1]$ representing the model's predicted probabilities for each class. The `dim` parameter indicates the dimension along which the values must sum to 1.

In [34]:
# dim=1 normalises across the class axis, so each row of the result sums to 1.
softmax = nn.Softmax(dim=1)
pred_probab = softmax(logits)
print(pred_probab)

tensor([[0.1042, 0.1252, 0.1229, 0.0744, 0.0679, 0.1198, 0.0876, 0.0953, 0.1096,
         0.0932],
        [0.1078, 0.1407, 0.1065, 0.0854, 0.0594, 0.1033, 0.0972, 0.1181, 0.1055,
         0.0760],
        [0.1142, 0.1131, 0.1159, 0.0756, 0.0674, 0.1233, 0.0992, 0.1042, 0.1018,
         0.0855]], grad_fn=<SoftmaxBackward0>)


# Model Parameters
Many layers inside a neural network are parameterized, i.e. they have associated weights and biases. Subclassing `nn.Module` automatically tracks all fields defined inside the model object and makes all parameters accessible through the model's `parameters()` or `named_parameters()` methods.

In [35]:
print(f"Model structure: {model}\n\n")

# Walk over every registered parameter; only the first two entries of each
# tensor are printed to keep the output short.
named_params = model.named_parameters()
for name, param in named_params:
    print(f"Layer: {name} | Size: {param.size()} | Values: {param[:2]}")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values: tensor([[ 0.0060, -0.0352, -0.0151,  ...,  0.0220,  0.0199,  0.0259],
        [-0.0048, -0.0066,  0.0035,  ..., -0.0191, -0.0110,  0.0138]],
       device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values: tensor([-0.0187, -0.0302], device='cuda:0', grad_fn=<SliceBackward0>)
Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values: tensor([[-0.0361,  0.0078,  0.0154,  ...,  0.0156,  0.0415, -0.0432],
        [-0.0098, -0.0143,  0.0079,  ...,  0.0245, -0.0013,  0.0203]],
       device='cuda:0', grad_fn=<SliceBack