# Build a Simple Neural Network

Build the neural network: https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html

This script demonstrates how to build a simple neural network using PyTorch.
In this case, we will train a network to classify images from the Fashion MNIST dataset.

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

### Get device for training

In [2]:
# Pick the compute device: use an accelerator (e.g. CUDA) when one is
# available, otherwise run on the CPU.
if hasattr(torch, "accelerator"):
    device = (
        torch.accelerator.current_accelerator().type
        if torch.accelerator.is_available()
        else "cpu"
    )
else:
    # Older PyTorch builds have no `torch.accelerator` namespace; fall back to
    # the classic CUDA probe instead of failing with AttributeError.
    device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device\n")


Using cuda device



In [3]:
class NeuralNetwork(nn.Module):
    """
    Neural network class that inherits from nn.Module.

    The input is flattened and passed through a stack of three linear layers
    with a ReLU activation after the first and the second one. The default
    sizes give the 784 -> 512 -> 512 -> 10 network used for 28x28 images and
    10 classes.
    --------
    Parameters:
    in_features : int
        Number of values per sample after flattening (default: 28*28)
    hidden_features : int
        Width of the two hidden linear layers (default: 512)
    num_classes : int
        Number of output scores per sample (default: 10)
    """

    def __init__(
        self,
        in_features: int = 28 * 28,
        hidden_features: int = 512,
        num_classes: int = 10,
    ) -> None:
        super().__init__()  # Required so nn.Module can register the submodules below

        # nn.Flatten keeps dim 0 (the batch) and merges the remaining dims
        self.flatten = nn.Flatten()

        # Layers are created in input -> output order; nn.Sequential applies
        # them in that same order.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),      # Input layer
            nn.ReLU(),                                    # Activation function #1
            nn.Linear(hidden_features, hidden_features),  # Hidden layer
            nn.ReLU(),                                    # Activation function #2
            nn.Linear(hidden_features, num_classes),      # Output layer
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Define the forward pass of the neural network, i.e., how the input
        data flows through the network.
        --------
        Parameters:
        x : torch.Tensor
            Input data (batch of images); everything after dim 0 is flattened
            and must total `in_features` values per sample
        -------
        Returns:
        torch.Tensor
            Logits, i.e. the raw output scores before softmax, with one row
            per sample and `num_classes` columns
        """
        x = self.flatten(x)  # (batch, ...) -> (batch, in_features)

        # No softmax here: the network returns raw scores (logits)
        logits = self.linear_relu_stack(x)

        return logits

### Create an instance of the neural network and move it to the device


In [4]:
# Instantiate the network, then move it to `device`
model = NeuralNetwork()
model = model.to(device)

# Printing a module shows its layer structure
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


### Pass a random input through the model to test it

In [7]:
# Smoke test: push a single random "image" through the model.
# X shape: (batch_size, height, width)
X = torch.rand((1, 28, 28), device=device)

# Calling the model runs its forward pass.
# logits shape: (batch_size, num_classes)
logits = model(X)
print(f"logits: {logits}")

# Softmax over the class dimension turns the raw scores into probabilities.
# pred_prob shape: (batch_size, num_classes)
pred_prob = torch.softmax(logits, dim=1)
print(f"pred_prob: {pred_prob}")

# Index of the most probable class for every sample.
# y_pred shape: (batch_size,)
y_pred = pred_prob.argmax(dim=1)
print(f"Predicted class: {y_pred}")

logits: tensor([[ 0.0294, -0.0777, -0.0191, -0.0095, -0.0175, -0.0071,  0.0227,  0.0506,
         -0.0304,  0.0218]], device='cuda:0', grad_fn=<AddmmBackward0>)
pred_prob: tensor([[0.1033, 0.0928, 0.0984, 0.0994, 0.0986, 0.0996, 0.1026, 0.1055, 0.0973,
         0.1025]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Predicted class: tensor([7], device='cuda:0')


### Break down the Layers in this FashionMNIST model

In [8]:
# A toy minibatch: 3 random "images", each 28x28
input_img = torch.rand((3, 28, 28))
print(input_img.shape)

torch.Size([3, 28, 28])


#### `nn.Flatten`
Convert each 2D 28x28 image into a contiguous array of 784 pixel values. The minibatch dimension at dim=0 is maintained.

In [9]:
# start_dim=1 / end_dim=-1 are nn.Flatten's defaults, spelled out here:
# dim 0 (the minibatch) is kept, the remaining dims are merged.
flatten = nn.Flatten(start_dim=1, end_dim=-1)
flat_img = flatten(input_img)
print(flat_img.shape)

torch.Size([3, 784])


#### `nn.Linear`
The linear layer applies a linear transformation on the input using its stored weights and biases.

In [10]:
# Linear layer mapping each flattened 28*28 row to 20 features
layer1 = nn.Linear(28 * 28, 20)
hidden1 = layer1(flat_img)
print(hidden1.shape)

torch.Size([3, 20])


#### `nn.ReLU`
Non-linear activations are what create the complex mappings between the model's inputs and outputs. They are applied after each linear transformation to introduce non-linearity, helping *NNs* learn a wide variety of phenomena.

In this example, we use the ReLU activation function, which is defined as:
$f(x) = \max(0, x)$

However, there are many other activation functions available in PyTorch, such as Sigmoid, Tanh, and Softmax.

In [11]:
print(f"Before ReLU: {hidden1}\n")

# Functional form of the ReLU activation: negative entries become 0,
# non-negative entries pass through unchanged.
hidden1 = torch.relu(hidden1)

print(f"After ReLU: {hidden1}")

Before ReLU: tensor([[ 0.0499, -0.1482, -0.1505, -0.5916,  0.1293, -0.6092,  0.1681, -0.2550,
         -0.5491, -0.2488, -0.3891, -0.1776,  0.3471,  0.1496, -0.3765,  0.0129,
         -0.1207,  0.0455, -0.3863, -0.1577],
        [ 0.2076, -0.3712, -0.0198, -0.4936, -0.4130, -0.4532,  0.5078, -0.3953,
         -0.8068, -0.1551, -0.6726, -0.8051,  0.7614, -0.0762, -0.1813, -0.2228,
          0.0135, -0.1229, -0.6482,  0.2798],
        [ 0.2676, -0.0479, -0.2583, -0.2283,  0.0872, -0.6923,  0.3567, -0.5788,
         -0.9188, -0.4029, -0.3801, -0.5342,  0.3593, -0.4055, -0.4157,  0.0733,
         -0.0943, -0.1593, -0.3489, -0.2983]], grad_fn=<AddmmBackward0>)

After ReLU: tensor([[0.0499, 0.0000, 0.0000, 0.0000, 0.1293, 0.0000, 0.1681, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.3471, 0.1496, 0.0000, 0.0129, 0.0000, 0.0455,
         0.0000, 0.0000],
        [0.2076, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.5078, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.7614, 0.0000, 0.000

#### `nn.Sequential`
`nn.Sequential` is an ordered container of modules. The data is passed through all the modules in the same order as defined.

In [12]:
# Chain the existing `flatten` and `layer1` modules with a fresh ReLU and a
# new 20 -> 10 linear layer; data flows through them in the order listed.
seq_modules = nn.Sequential(
    flatten,
    layer1,
    nn.ReLU(),
    nn.Linear(20, 10),
)

# New random minibatch of 3 images, 28x28 each
input_img = torch.rand((3, 28, 28))

# Run the minibatch through the whole pipeline
logits = seq_modules(input_img)

#### `nn.Softmax`
The last linear layer of the *NN* returns logits -- raw values in the range of $(-\infty, \infty)$. To convert these logits into probabilities, we apply the softmax function. The softmax function is defined as:
$$
\mathrm{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j=1}^{K} e^{x_j}}
$$
where $K$ is the number of classes and $x_i$ is the logit for class $i$. 

The softmax function normalizes the logits into a probability distribution over the classes, ensuring that the sum of all probabilities equals 1.

In [15]:
# dim=1 normalizes across the class scores of each image (dim 0 is the minibatch)
softmax = nn.Softmax(dim=1)
pred_prob = softmax(logits)  # Turn the logits into probabilities
print(f"Predicted probabilities: {pred_prob}")
# Sanity check: summing along dim=1 should give 1 for every image
print(f"Sum of probs for each image: {pred_prob.sum(dim=1)}")

Predicted probabilities: tensor([[0.1222, 0.1069, 0.0824, 0.1375, 0.0820, 0.0881, 0.1000, 0.0850, 0.1014,
         0.0946],
        [0.1242, 0.0998, 0.0753, 0.1422, 0.0821, 0.0944, 0.1005, 0.0951, 0.0971,
         0.0893],
        [0.1251, 0.1019, 0.0872, 0.1273, 0.0846, 0.1005, 0.0975, 0.0843, 0.1036,
         0.0879]], grad_fn=<SoftmaxBackward0>)
Sum of probs for each image: tensor([1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)


### Model Parameters

In [16]:
# Show the module tree of the model
print(f"Model structure: {model}\n\n")

# Walk over every (name, parameter tensor) pair of the model.
# param[:2] limits the printout to the first two entries along dim 0.
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()} | Values: {param[:2]} \n")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Layer: linear_relu_stack.0.weight | Size: torch.Size([512, 784]) | Values: tensor([[-0.0178,  0.0171,  0.0287,  ...,  0.0027, -0.0306, -0.0193],
        [ 0.0144, -0.0280, -0.0281,  ..., -0.0159,  0.0232,  0.0346]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.0.bias | Size: torch.Size([512]) | Values: tensor([ 0.0236, -0.0293], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer: linear_relu_stack.2.weight | Size: torch.Size([512, 512]) | Values: tensor([[-0.0071, -0.0361,  0.0435,  ..., -0.0114,  0.0295, -0.0391],
        [-0.0044,  0.0296, -0.0325,  ...,  0.0058,  0.0262,  0.0212]],
       device='cuda:0', grad_fn=<Slice

print()  # Emit an empty line