# Working with data

In [1]:
from pathlib import Path

import torch

In [2]:
# At the heart of PyTorch's data loading utilities is the torch.utils.data.DataLoader class, which represents a Python iterable over a dataset.
from torch.utils.data import DataLoader

In [3]:
# PyTorch offers domain-specific libraries such as TorchText, TorchVision, and TorchAudio, all of which include datasets.
from torchvision import datasets

In [4]:
# In PyTorch and torchvision, the purpose of transform is to preprocess and augment data,
# especially images, before feeding them into a neural network
# v2 is the newer API; it recommends using ToImage and ToDtype in place of ToTensor.
import torchvision.transforms.v2 as transforms

In [5]:
# Step 1: turn the input (e.g. a PIL image or NumPy array) into a PyTorch image laid out as C x H x W.
to_image = transforms.ToImage()
# Step 2: cast to float32; scale=True also rescales pixel values from [0, 255] to [0.0, 1.0].
to_unit_float = transforms.ToDtype(torch.float32, scale=True)

# The steps run in list order on every sample.
transform = transforms.Compose([to_image, to_unit_float])

In [6]:
# Download training data from open datasets.
# FashionMNIST training split:
#   root      - directory (./data/) where the files are looked up or downloaded to
#   train     - True selects the training split
#   download  - fetch the files automatically when they are not found under root
#   transform - preprocessing pipeline applied to every image
training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=transform)

In [7]:
# Download test data from open datasets.
# FashionMNIST test split: same root directory and preprocessing as the training
# split; train=False is what selects the held-out test images.
test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=transform)

DataLoader
    Wraps the datasets in a DataLoader, which:

        Automatically batches the data (64 images per batch).

        Can reshuffle the data at every epoch when created with shuffle=True (shuffling is off by default).

        Allows iterating through the dataset efficiently.

        Supports multiprocessing for faster loading.


In [8]:
# We pass the Dataset as an argument to DataLoader. This wraps an iterable over our dataset,
# and supports automatic batching, sampling, shuffling and multiprocess data loading

batch_size = 64

# Create data loaders.
# DataLoader keeps the dataset order unless shuffle=True is passed. The training
# loader reshuffles every epoch so SGD does not see the batches in the same order
# each time; the test loader keeps its order because evaluation does not depend on it.
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Sanity-check the first test batch only: tensor shapes, label dtype and pixel value range.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    print(f"Max = {X.max().item()} and Min is {X.min().item()}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64
Max = 1.0 and Min is 0.0


# Creating Models

To define a neural network in PyTorch, we create a class that inherits from nn.Module. We define the layers of the network in the __init__ function and specify how data will pass through the network in the forward function. 

To accelerate operations in the neural network, we move it to an accelerator such as CUDA, MPS, MTIA, or XPU. If an accelerator is available, we use it; otherwise, we fall back to the CPU.

In [9]:
from torch import nn

In [10]:
# Pick the device type (a string such as "cuda", "mps" or "cpu") used for the model and the batches.
if hasattr(torch, "accelerator"):
    # Unified accelerator API (available in recent PyTorch releases).
    if torch.accelerator.is_available():
        device = torch.accelerator.current_accelerator().type
    else:
        device = "cpu"
elif torch.cuda.is_available():
    # Fallback for PyTorch builds that predate torch.accelerator.
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [11]:
# creating a new custom model by subclassing nn.Module—this is the standard way to define neural networks in PyTorch.

num_classes = 10  # default number of output classes (one logit per class)


class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        in_features: Number of values per sample after flattening
            (defaults to 28 * 28, i.e. one 1 x 28 x 28 image).
        hidden_features: Width of the two hidden layers (defaults to 512).
        out_features: Number of output logits. When ``None`` (the default) the
            module-level ``num_classes`` is read at construction time, which is
            what ``NeuralNetwork()`` has always done.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, out_features=None):
        super().__init__()  # registers the nn.Module machinery (parameters, submodules)
        if out_features is None:
            out_features = num_classes
        # Flatten keeps dim 0 (the batch) and merges the rest: [N, 1, 28, 28] -> [N, 784].
        self.flatten = nn.Flatten()
        # The attribute name and layer order define the state_dict keys, so they stay fixed.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),      # fully connected layer
            nn.ReLU(),                                    # non-linearity
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),     # raw scores (logits), one per class
        )

    def forward(self, X):
        """Return the logits for a batch ``X``; every dim after the first is flattened."""
        X = self.flatten(X)
        logits = self.linear_relu_stack(X)
        return logits

# Build the network first, then place its parameters on the device selected earlier
# (nn.Module.to returns the module itself, so the name keeps pointing at the same object).
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


# Optimizing the Model Parameters

To train a model, we need a loss function and an optimizer.

In [12]:
# Criterion: cross-entropy between the model's logits and the integer class labels.
loss_fn = nn.CrossEntropyLoss()
# Optimizer: plain stochastic gradient descent over all of the model's parameters.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In a single training loop, the model makes predictions on the training dataset (fed to it in batches), and backpropagates the prediction error to adjust the model’s parameters.

In [13]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one epoch of mini-batch training, printing the loss every 100 batches.

    Args:
        dataloader: Iterable yielding ``(X, y)`` mini-batches; ``dataloader.dataset``
            must support ``len()`` (used only for the progress display).
        model: The ``nn.Module`` being trained; its parameters are updated in place.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer built over ``model``'s parameters.
    """
    size = len(dataloader.dataset)  # total number of samples, shown in the progress line
    # Batches follow the model: use the device its parameters live on instead of
    # relying on a notebook-level global being in sync with the model.
    first_param = next(model.parameters(), None)
    model.train()  # training mode (matters for layers such as dropout / batch norm)
    seen = 0       # samples processed so far; stays exact even when the last batch is short
    for batch, (X, y) in enumerate(dataloader):
        if first_param is not None:
            X, y = X.to(first_param.device), y.to(first_param.device)

        # Compute prediction error
        pred = model(X)          # logits for this batch
        loss = loss_fn(pred, y)  # compare the logits with the ground-truth labels

        # Backpropagation
        loss.backward()          # compute gradients
        optimizer.step()         # update the weights from those gradients
        optimizer.zero_grad()    # clear gradients so they do not accumulate into the next batch

        seen += len(X)
        if batch % 100 == 0:
            # .item() converts the scalar loss tensor to a Python float for printing.
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

We also check the model’s performance against the test dataset to ensure it is learning.

In [14]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()                            # Set model to evaluation mode
    test_loss, correct = 0, 0               # Initialize loss and accuracy counters
    with torch.no_grad():                   # Turn Off Gradient Tracking, Gradient only required while model is learning,
                                            # saves memory & speeds up computation.
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()                                # Accumulate Loss, .item() converts a single-value tensor to a standard Python float.
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()     # Count Correct Predictions,
                                                                                # pred.argmax(1): Gets the index of the max logit (predicted class).
        test_loss /= num_batches            # Calculate Average Loss and Accuracy
        correct /= size
        print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

The training process is conducted over several iterations (epochs). During each epoch, the model learns parameters to make better predictions. We print the model’s accuracy and loss at each epoch; we’d like to see the accuracy increase and the loss decrease with every epoch.

In [15]:
epochs = 5  # number of full passes over the training set

# Each epoch: one training pass, then an evaluation on the test set to track progress.
for epoch_number in range(1, epochs + 1):
    print(f"Epoch {epoch_number}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.310946  [   64/60000]


loss: 2.298762  [ 6464/60000]
loss: 2.279612  [12864/60000]
loss: 2.273447  [19264/60000]
loss: 2.260120  [25664/60000]
loss: 2.231806  [32064/60000]
loss: 2.246035  [38464/60000]
loss: 2.211257  [44864/60000]
loss: 2.211536  [51264/60000]
loss: 2.181599  [57664/60000]
Test Error: 
 Accuracy: 33.9%, Avg loss: 2.178273 

Epoch 2
-------------------------------
loss: 2.191308  [   64/60000]
loss: 2.180477  [ 6464/60000]
loss: 2.134054  [12864/60000]
loss: 2.149364  [19264/60000]
loss: 2.102354  [25664/60000]
loss: 2.046243  [32064/60000]
loss: 2.085636  [38464/60000]
loss: 2.010016  [44864/60000]
loss: 2.015189  [51264/60000]
loss: 1.952966  [57664/60000]
Test Error: 
 Accuracy: 54.9%, Avg loss: 1.951010 

Epoch 3
-------------------------------
loss: 1.980870  [   64/60000]
loss: 1.949572  [ 6464/60000]
loss: 1.852822  [12864/60000]
loss: 1.892917  [19264/60000]
loss: 1.778928  [25664/60000]
loss: 1.726599  [32064/60000]
loss: 1.759982  [38464/60000]
loss: 1.655093  [44864/60000]
loss: 

# Saving Models

A common way to save a model is to serialize the internal state dictionary (containing the model parameters).

In [16]:
# Save only the learned parameters (the state dict), not the whole module object.
model_path = Path("saved_models") / "model.pth"
# torch.save does not create missing directories, so make sure the target folder exists
# (otherwise a fresh checkout fails here on Restart & Run All).
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)
# Report the path that was actually written.
print(f"Saved PyTorch Model State to {model_path.as_posix()}")

Saved PyTorch Model State to saved_model/model.pth


# Loading Models

The process for loading a model includes re-creating the model structure and loading the state dictionary into it.

In [17]:
# The state dict stores tensors only, so the architecture has to be re-created first.
model = NeuralNetwork().to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint written
# on an accelerator still loads on a machine without one. weights_only=True restricts
# unpickling to tensors and plain containers.
model.load_state_dict(torch.load("saved_models/model.pth", map_location=device, weights_only=True))

<All keys matched successfully>

This model can now be used to make predictions.

In [18]:
# Human-readable label names; the list index is the integer class id.
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
# Index the dataset once: every access runs the transform pipeline again.
x, y = test_data[0]
with torch.no_grad():
    # The dataset yields a single image [C, H, W]; the model expects a batch, so add
    # the batch dimension explicitly instead of relying on C == 1 standing in for it.
    x = x.unsqueeze(0).to(device)
    pred = model(x)  # logits with one row per sample in the batch
    predicted, actual = classes[pred[0].argmax(0).item()], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"
