In [5]:
# Importing necessary modules
import torch
import torch.nn as nn  # Importing neural network module
import torch.optim as optim  # Importing optimization algorithms
from torch.utils.data import DataLoader  # Importing DataLoader for batch processing
import torchvision.datasets as dt  # Importing torchvision dataset utilities
import torchvision.transforms as transforms  # Importing transforms for image processing

# Preprocessing pipeline applied to each MNIST image when it is fetched.
transform = transforms.Compose(
    [
        # PIL image -> float tensor scaled to [0, 1].
        transforms.ToTensor(),
        # (x - 0.5) / 0.5 on the single grayscale channel, i.e. [0, 1] -> [-1, 1].
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# Training split of MNIST; files are downloaded into ./data when missing.
train_dataset = dt.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)

# Mini-batches of 64 samples, drawn in a fresh random order every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

# Defining the MLP (Multi-Layer Perceptron) model for digit classification
class MLP(nn.Module):
    """Fully connected classifier with a single hidden layer.

    The network is ``Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, output_size)``.

    Args:
        input_size: Number of features per sample after flattening
            (default ``28*28``).
        hidden_size: Width of the hidden layer (default 512).
        output_size: Number of output scores per sample (default 10).
    """

    def __init__(self, input_size=28*28, hidden_size=512, output_size=10):
        super().__init__()  # Register this object as an nn.Module
        self.fc1 = nn.Linear(input_size, hidden_size)  # input -> hidden
        self.relu = nn.ReLU()  # Non-linearity between the two linear layers
        self.fc2 = nn.Linear(hidden_size, output_size)  # hidden -> output scores

    def forward(self, x):
        """Return the raw output scores (logits) for a batch.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must multiply to ``input_size``.

        Returns:
            Tensor of shape ``(batch, output_size)``. No softmax is applied.
        """
        # torch.flatten is used instead of x.view(x.size(0), -1) because view
        # raises on non-contiguous tensors (e.g. after transpose/permute) and
        # on zero-size batches, while flatten copies only when it has to.
        x = torch.flatten(x, start_dim=1)
        x = self.relu(self.fc1(x))  # Hidden representation
        return self.fc2(x)  # Logits

# Network, objective and optimiser used for training.
model = MLP()
# CrossEntropyLoss takes the raw logits returned by the model together with
# integer class labels.
criterion = nn.CrossEntropyLoss()
# Adam over every parameter of the model, learning rate 0.0009.
optimizer = optim.Adam(model.parameters(), lr=0.0009)

# Each epoch is one full pass over train_loader.
num_epochs = 10
for epoch in range(num_epochs):
    # Switch the module to training mode before iterating the batches.
    model.train()
    # Running sum of the per-batch mean losses for this epoch.
    total_loss = 0

    for images, labels in train_loader:
        # Forward pass and loss for the current mini-batch.
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Drop gradients left over from the previous step, backpropagate the
        # new loss, then apply the parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float from the scalar loss tensor.
        total_loss += loss.item()

    # Mean of the per-batch losses, reported with two decimal places.
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.2f}')


Epoch [1/10], Loss: 0.32
Epoch [2/10], Loss: 0.14
Epoch [3/10], Loss: 0.11
Epoch [4/10], Loss: 0.09
Epoch [5/10], Loss: 0.07
Epoch [6/10], Loss: 0.06
Epoch [7/10], Loss: 0.05
Epoch [8/10], Loss: 0.05
Epoch [9/10], Loss: 0.04
Epoch [10/10], Loss: 0.04
