<a href="https://colab.research.google.com/github/pkro/pytorch_for_deep_learning/blob/main/00_pytorch_fundamentals_extra_curriculum_pythorch_quickstart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# also available: TorchText, TorchAudio including datasets

Fashion-MNIST is a dataset of Zalando's article images—consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes.

https://www.kaggle.com/datasets/zalando-research/fashionmnist

In [2]:
# download training data from open datasets

training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

# download test data
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data/FashionMNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:00<00:00, 116318614.70it/s]


Extracting data/FashionMNIST/raw/train-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 10061352.61it/s]

Extracting data/FashionMNIST/raw/train-labels-idx1-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz



100%|██████████| 4422102/4422102 [00:00<00:00, 62468265.48it/s]


Extracting data/FashionMNIST/raw/t10k-images-idx3-ubyte.gz to data/FashionMNIST/raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<00:00, 18423444.53it/s]


Extracting data/FashionMNIST/raw/t10k-labels-idx1-ubyte.gz to data/FashionMNIST/raw



In [6]:
batch_size = 64

train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

#for X, y in test_dataloader:
  #print(f"Shape of X [N, C, H, W]: {X.shape}")
  #print(f"Shape of y: {y.shape} {y.dtype}")


In [5]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [8]:
# Define a class for our neural network model
class NeuralNetwork(nn.Module):  # Subclass PyTorch's base class for all neural network modules
    def __init__(self):  # This method initializes the neural network
        super().__init__()  # Call the initialization method of the parent class
        self.flatten = nn.Flatten()  # An operation that flattens the input, i.e., it makes it one-dimensional

        # A sequential container where we stack our layers
        # (Linear layers with ReLU activation function)
        self.linear_relu_stack = nn.Sequential(
            # 28 * 28 coresponds to the pixel size of the mnist data items; 784 pixels = 784 "features"
            nn.Linear(28*28, 512),  # A linear layer (also known as a fully connected layer), it takes a 1D tensor of size 28*28 and outputs a 1D tensor of size 512
            nn.ReLU(),  # An activation function that introduces non-linearity in the model. It replaces negative pixel values in the previous layer with zero

            # a "hidden layer", optional; we can have more of these too, but not necessarily for better results
            # "hidden" means just that it's not directly connected to the input or output
            nn.Linear(512, 512),  # Another linear layer that takes a 1D tensor of size 512 and outputs a 1D tensor of size 512
            nn.ReLU(),  # Another ReLU activation layer

            nn.Linear(512, 10),  # A final linear layer that takes a 1D tensor of size 512 and outputs a 1D tensor of size 10 (this size usually corresponds to the number of classes in a classification problem)
            nn.ReLU(),  # A final ReLU activation layer
        )

    def forward(self, x):  # The method that defines the forward pass of the neural network
        x = self.flatten(x)  # First, we flatten the input
        logits = self.linear_relu_stack(x)  # Then, we pass the flattened input through the stack of layers
        return logits  # We return the output of the network, also known as the logits

# After defining the class for the network, we create an instance of the network
model = NeuralNetwork()

# 'device' is not defined in the code you provided, but it usually refers to where you want to run your code,
# either on a CPU or a GPU. '.to(device)' moves the model parameters to the specified device.
model = model.to(device)

# Finally, we print the structure of the neural network.
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
    (5): ReLU()
  )
)


In [9]:
# For training a model, we need a loss function and an optimizer.

# The loss function calculates how far the model's predictions are from the true values.
# Here we're using CrossEntropyLoss, which is commonly used for multi-class classification problems.
# It combines softmax and negative log likelihood loss in one single class.
loss_fn = nn.CrossEntropyLoss()

# The optimizer is an algorithm or method used to adjust the attributes of your neural network
# such as weights and learning rate in order to reduce the losses.
# Optimizers help to get results faster.
# Here we're using Stochastic Gradient Descent (SGD) as the optimizer.
# Learning rate (lr) is a hyper-parameter that controls how much we are adjusting the weights of our network with respect the loss gradient.
# The lower the value, the slower we travel along the downward slope.
# While this might be a good idea (using a low learning rate) in terms of making sure that we do not miss any local minima,
# it could also mean that we'll be taking a long time to converge — especially if we get stuck on a plateau region.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)


In the following function, the training process for a neural network model is defined, which includes the forward pass (making predictions), calculating the loss, performing backpropagation to compute gradients, and updating the model's parameters (weights and biases) using an optimization algorithm. The function also prints the training loss every 100 batches, which gives you a way to monitor the training process.

In [10]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)  # Get the total size of the dataset
    model.train()  # Set the model to training mode. This has any effect only on certain modules like Dropout or BatchNorm.

    # Enumerate through the dataloader. In each loop, the data (X) and target labels (y) for a batch is returned.

    # In each iteration of training:

    # The network makes predictions on the input data (forward pass).
    # The error between the network's predictions and the known correct outcomes (the "labels") is calculated using a loss function.
    # The backpropagation algorithm then works out the contribution of each weight and bias to this error by calculating the gradients of the loss function with respect to each weight and bias.
    # These gradients are then used to adjust the weights and biases to minimize the error (this is an optimization step, typically done using an algorithm like stochastic gradient descent).

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)  # Move the data and labels to the device where the model is. This is often a GPU, but could also be a CPU.

        # Compute the prediction error. First, we run the forward pass to get the model's predictions.
        pred = model(X)
        # Then, we pass the model's predictions along with the true labels to the loss function, which computes the error.
        loss = loss_fn(pred, y)

        # Backpropagation. This involves three steps:
        # 1. First, we calculate the gradients of the loss with respect to the model's parameters. This is done using the backward() function.
        loss.backward()
        # 2. Then, we perform a single optimization step (parameter update) using the optimizer. This adjusts the model's parameters based on the gradients computed in the backward step.
        optimizer.step()
        # 3. Finally, we zero out the gradients. This is because by default, gradients are accumulated in buffers, so we need to manually set them to zero before the next iteration.
        optimizer.zero_grad()

        # Print the current loss and training progress information every 100 batches.
        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)  # Compute the current loss and the number of examples seen so far.
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")  # Print the current loss and training progress.


Nice chatgpt analogy for gradient descent and backpropagation:

>Backpropagation and the concept of gradients can be complex topics, especially if you're just getting started with neural networks and deep learning. It's great that you're asking questions and seeking to understand these concepts.
>
>Here's a simpler way to think about it: imagine you're on a hill and your goal is to get to the bottom. You're blindfolded, so you can't see where to go, but you can feel the slope of the hill under your feet.
>
>If you feel the ground sloping downwards to your right, you know that if you step to the right, you'll go downhill. If you feel the ground sloping downwards to your left, you know that if you step to the left, you'll go downhill. By repeatedly feeling the slope under your feet and taking a step downhill, you'll eventually reach the bottom of the hill.
>
>In this analogy, the hill is the loss function, your position on the hill is the weights of the neural network, and the slope that you feel under your feet is the gradient. Just like how you can use the slope to figure out which way to step to go downhill, a neural network can use the gradient to figure out how to adjust its weights to reduce the loss.
>
>The process of calculating the slope (gradient) is what we call backpropagation, and the process of taking a step downhill (updating the weights) is what we call an optimization step.

In [12]:
# check the models performance against the test dataset

def test(dataloader, model, loss_fn):
    total_samples = len(dataloader.dataset)  # Total number of samples in the dataset
    num_batches = len(dataloader)  # Total number of batches in the dataloader
    model.eval()  # Set the model to evaluation mode. This has any effect only on certain modules like Dropout or BatchNorm.

    # Initialize counters for total test loss and number of correct predictions
    test_loss, correct_predictions = 0, 0

    # torch.no_grad() informs PyTorch that we do not want to perform back-propagation, which reduces memory usage and speeds up computation.
    with torch.no_grad():
        # Loop over each batch from the testing data
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)  # Move the data and labels to the device where the model is.

            # Make predictions using the model
            predictions = model(X)

            # Compute the loss between the predictions and the true labels, and add it to the total test loss
            test_loss += loss_fn(predictions, y).item()

            # For each prediction, get the index of the label with the highest predicted probability using argmax,
            # then compare with the true label. This gives us a binary vector where 1 represents a correct prediction and 0 represents a wrong prediction.
            correct_predictions_vector = (predictions.argmax(1) == y)

            # Sum up the correct predictions, convert to float (for division later), and add to the total number of correct predictions
            correct_predictions += correct_predictions_vector.type(torch.float).sum().item()

    # Calculate average loss over all batches
    average_test_loss = test_loss / num_batches

    # Calculate the accuracy as the proportion of correct predictions over total samples
    accuracy = correct_predictions / total_samples

    # Print the test error details
    print(f"Test Error: \n Accuracy: {(100*accuracy):>0.1f}%, Avg loss: {average_test_loss:>8f} \n")



In [14]:
# train and test the model

epochs = 5

for t in range(epochs):
  print(f"Epoch {t+1}\n--------------------------------")
  train(train_dataloader, model, loss_fn, optimizer)
  test(test_dataloader, model, loss_fn)

print("Done!")

Epoch 1
--------------------------------
loss: 1.441873  [   64/60000]
loss: 1.419557  [ 6464/60000]
loss: 1.329365  [12864/60000]
loss: 1.445377  [19264/60000]
loss: 1.239895  [25664/60000]
loss: 1.445290  [32064/60000]
loss: 1.226425  [38464/60000]
loss: 1.262740  [44864/60000]
loss: 1.358987  [51264/60000]
loss: 1.348363  [57664/60000]
Test Error: 
 Accuracy: 61.5%, Avg loss: 1.287357 

Epoch 2
--------------------------------
loss: 1.289905  [   64/60000]
loss: 1.286245  [ 6464/60000]
loss: 1.188234  [12864/60000]
loss: 1.333966  [19264/60000]
loss: 1.121187  [25664/60000]
loss: 1.345068  [32064/60000]
loss: 1.115078  [38464/60000]
loss: 1.170060  [44864/60000]
loss: 1.263788  [51264/60000]
loss: 1.266954  [57664/60000]
Test Error: 
 Accuracy: 62.7%, Avg loss: 1.197084 

Epoch 3
--------------------------------
loss: 1.189699  [   64/60000]
loss: 1.198598  [ 6464/60000]
loss: 1.091549  [12864/60000]
loss: 1.257800  [19264/60000]
loss: 1.044701  [25664/60000]
loss: 1.271755  [32064/

In [15]:
# save model
torch.save(model.state_dict(), "model.pth")
print("Saved PyTorch Model State to model.pth")

Saved PyTorch Model State to model.pth


In [16]:
# load a model
model = NeuralNetwork().to(device)
model.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

In [17]:
# make a prediction on the first item of the fashionmnist test set (a grayscale image of 28*28 pixel of an ankle boot)

# Define names of the classes (labels) corresponding to the numerical predictions 0-9 so we don't just see "5" but "Sandal" in the prediction
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

model.eval()
x, y = test_data[0][0], test_data[0][1]
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    predicted, actual = classes[pred[0].argmax(0)], classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"
