# Basics of PyTorch - Neural Network
### Date: 02/20/2025
### by Malik N. Mohammed

## Objectives
- Build neural networks using PyTorch
- Analyze the parameters of the model
- Dive into activation functions


In [19]:
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch import nn
import torch

In [20]:
device = 'cpu'

if torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator()

device

device(type='cuda')

In [69]:
# Keep printed tensors compact: show 2 decimal places and summarize
# (elide with "...") tensors that have more than 10 elements.
torch.set_printoptions(
    threshold=10,
    precision=2,
)

In [28]:
# Load the MNIST dataset from the local 'data' directory (download=True fetches it if needed).
# torchvision.transforms.ToTensor() converts each PIL image into a tensor.
data = datasets.MNIST(root='data', download=True, transform=transforms.ToTensor())

In [22]:
# Display the dataset summary (size, root location, split and transform).
data

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: ToTensor()

In [23]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: Flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.

    With the default arguments this is the 784 -> 512 -> 512 -> 10 network
    used for MNIST; the sizes can be overridden for other inputs.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        """
        Initialize the NeuralNetwork module class.

        Args:
            in_features: Number of values per sample after flattening
                (28*28 for an MNIST image).
            hidden_features: Width of both hidden layers.
            num_classes: Number of output nodes, one logit per class.
        """
        super().__init__()

        # Flatten layer - collapses every dimension after the first (batch)
        # dimension into a single one.
        self.flatten = nn.Flatten()

        # Linear layers with ReLU (max(0, x)) activations in between.
        # The final layer has one node per class in the dataset.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """
        Run a batch through the network.

        Args:
            x: Batch of inputs; everything after the first dimension is
                flattened and must total ``in_features`` values per sample.

        Returns:
            The logits - raw, unnormalized scores output by the last layer,
            one per class for each sample. Apply softmax to turn them into a
            probability distribution over the classes.
        """
        # Flatten each sample before passing it through the linear stack.
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)

In [43]:
# Build the network, then move its parameters to the selected device
# (the accelerator if one is available, otherwise the CPU).
model = NeuralNetwork()
model = model.to(device)
model

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

In [45]:
# Wrap the MNIST dataset in a DataLoader that yields shuffled batches of 32 samples.
data_loader = DataLoader(dataset=data, shuffle=True, batch_size=32)
data_loader

<torch.utils.data.dataloader.DataLoader at 0x75fe70726ba0>

In [46]:
# Pull one batch (images and their labels) out of the data loader.
image, label = next(iter(data_loader))
image.shape

torch.Size([32, 1, 28, 28])

In [34]:
# Move the image and label tensors to the selected device (accelerator or CPU).
X, y = image.to(device), label.to(device)

y

tensor([9], device='cuda:0')

In [49]:
# Run the forward pass: the last layer returns 10 logits (one per class) for each input.
logits = model(X)

logits, logits.shape

(tensor([[ 0.0207, -0.0332,  0.0160, -0.0336,  0.0548, -0.0132, -0.0141,  0.0242,
          -0.0277, -0.0407]], device='cuda:0', grad_fn=<AddmmBackward0>),
 torch.Size([1, 10]))

In [50]:
# The class logits lie along dimension 1, so softmax is applied over that
# dimension to turn each row into a probability distribution.
probability = torch.softmax(logits, dim=1)
probability

tensor([[0.1025, 0.0971, 0.1020, 0.0971, 0.1061, 0.0991, 0.0990, 0.1029, 0.0977,
         0.0964]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [51]:
# The predicted label is the index of the largest probability in each row.
torch.argmax(probability, dim=1)

tensor([4], device='cuda:0')

## Investigating what happens at each layer.
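- In this section we explore how each layer is applied without using the NeuralNetwork class.
- Each layer below is freshly (randomly) initialized, so its outputs differ from those of `model` above.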

In [77]:
# Take the first image and its label from the dataset to use as a reference here.
image, label = data[0]

image.shape

torch.Size([1, 28, 28])

In [78]:
# First flatten the image. Flatten keeps dimension 0 and merges the remaining
# ones, so the 28x28 pixels become a single dimension of 784 values.
x = nn.Flatten(start_dim=1, end_dim=-1)(image)
x, x.shape

(tensor([[0., 0., 0.,  ..., 0., 0., 0.]]), torch.Size([1, 784]))

In [79]:
# Next, feed the flattened values through the first Linear layer (784 inputs -> 512 outputs).
x = nn.Linear(28 * 28, 512)(x)
x, x.shape

(tensor([[ 0.01,  0.36, -0.43,  ...,  0.03, -0.05,  0.30]],
        grad_fn=<AddmmBackward0>),
 torch.Size([1, 512]))

In [80]:
# Pass the output of the first layer through the ReLU activation function, max(0, x),
# so every value < 0 is replaced with 0.
x = nn.ReLU()(x)
x, x.shape

(tensor([[0.01, 0.36, 0.00,  ..., 0.03, 0.00, 0.30]], grad_fn=<ReluBackward0>),
 torch.Size([1, 512]))

In [81]:
# Apply the rest of the stack in order:
#   Linear(512, 512) - takes a 512-size input and outputs the same size
#   ReLU             - applies the activation to the 512-size tensor
#   Linear(512, 10)  - produces a tensor of size 10, i.e. the 10 classes in the dataset
for layer in (nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10)):
    x = layer(x)

x, x.shape

(tensor([[-0.08, -0.01,  0.02,  0.03,  0.02,  0.00,  0.06,  0.03,  0.06, -0.04]],
        grad_fn=<AddmmBackward0>),
 torch.Size([1, 10]))

In [83]:
# The output of the final layer is not a probability distribution but raw logits.
# So we pass it through the softmax activation function to get a probability
# distribution, and then pick the index with the maximum probability.
x = nn.Softmax(dim=1)(x)
x, x.shape

(tensor([[0.09, 0.10, 0.10, 0.10, 0.10, 0.10, 0.11, 0.10, 0.10, 0.10]],
        grad_fn=<SoftmaxBackward0>),
 torch.Size([1, 10]))

In [84]:
# Prediction: the index of the highest probability.
x.argmax(dim=1)

tensor([6])

### Analyzing model parameters
- Each layer of the neural network has parameters called weights and biases.
- These weights and biases are optimized during training using backpropagation.
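- In a fully connected (dense) layer, the number of weights and biases is calculated as follows:
  - Weights = Input features * Output features
  - Biases = Output features
  - For example, the first layer (784 inputs, 512 outputs) has 784 * 512 = 401,408 weights and 512 biases.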

In [88]:
# Heading for the listing (plain string: there is nothing to interpolate).
print("Model parameters\n\n")

# List every learnable tensor of the model with its size, labelling it as
# weights or bias based on the parameter name.
for name, param in model.named_parameters():
    kind = 'Weights' if 'weight' in name else 'Bias'
    print(f"Layer: {name}, \t\t{kind} Size: {param.size()}")

Model parameters


Layer: linear_relu_stack.0.weight, 		Weights Size: torch.Size([512, 784])
Layer: linear_relu_stack.0.bias, 		Bias Size: torch.Size([512])
Layer: linear_relu_stack.2.weight, 		Weights Size: torch.Size([512, 512])
Layer: linear_relu_stack.2.bias, 		Bias Size: torch.Size([512])
Layer: linear_relu_stack.4.weight, 		Weights Size: torch.Size([10, 512])
Layer: linear_relu_stack.4.bias, 		Bias Size: torch.Size([10])
