In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor



# Lab for Week 5: Fitting a Feed Forward Neural Network

In Milestone 1 you chose a project topic you are interested in and found some datasets you could use. You also already built a [data loader](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

Today, we will fit a baseline feed forward neural network to one of the datasets you chose by following the pytorch tutorials.

## Step 1: Define a Data Loader
You are encouraged to use one of the data loaders you built for milestone 1. Otherwise use the one described [here](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

Tip: If your targets are labels, you might want to use the lambda transformation described [here](https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html) to turn your target into one-hot vectors.

In [3]:
# Both splits are stored in the same folder, downloaded on demand, and have
# their images converted to tensors; only the `train` flag differs.
fashion_mnist_options = dict(root="data", download=True, transform=ToTensor())

training_data = datasets.FashionMNIST(train=True, **fashion_mnist_options)
test_data = datasets.FashionMNIST(train=False, **fashion_mnist_options)

# Wrap each split in a DataLoader that yields mini-batches of 64 samples.
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=64) for split in (training_data, test_data)
)


## Step 2: Define a Model
Follow [this tutorial](https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html) to define a model. Make sure that the input dimension fits the dimensionality of your data, and the output dimension fits the dimensionality of your targets.

In [4]:
class NeuralNetwork(nn.Module):
    """
    Feed forward classifier: flatten the input, then apply two hidden
    Linear+ReLU layers followed by a final Linear layer that emits logits.

    Parameters:
        in_features (int): Number of values per sample after flattening.
            Defaults to 28*28.
        hidden_features (int): Width of both hidden layers. Defaults to 512.
        num_classes (int): Size of the output (one logit per class). Defaults to 10.

    Calling NeuralNetwork() with no arguments builds the same
    784 -> 512 -> 512 -> 10 network as before; pass other sizes so that the
    input/output dimensions fit a different dataset.
    """

    def __init__(self, in_features=28*28, hidden_features=512, num_classes=10):
        super().__init__()
        # Collapses every dimension except the batch dimension into one vector per sample.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            # No activation after the last layer: the raw logits are returned.
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """
        Compute the logits for a batch of inputs.

        Parameters:
            x (torch.Tensor): Batch of inputs; everything after the first
                dimension is flattened and must contain in_features values.

        Returns:
            torch.Tensor: Output of the final linear layer, with num_classes
            values per sample.
        """
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

model = NeuralNetwork()

## Step 3: Train Model

Follow the tutorial [here](https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html) to train your model.

In [5]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """
    Runs one full training epoch on the given model using the provided dataloader.

    Parameters:
        dataloader (torch.utils.data.DataLoader): DataLoader providing batches of training data (inputs and labels).
        model (torch.nn.Module): The PyTorch model to be trained.
        loss_fn (function): The loss function used to compute the error between predictions and true labels.
        optimizer (torch.optim.Optimizer): The optimizer used to update the model parameters.

    Behavior:
        - Sets the model to training mode (important for layers like dropout and batch normalization).
        - Iterates over the data batches:
            - Computes the model's predictions.
            - Calculates the loss.
            - Performs backpropagation and updates model weights.
        - Prints progress every 100 batches, showing the current loss and the number of samples processed.

    Returns:
        None
    """
    size = len(dataloader.dataset)
    # Running count of samples processed in this epoch. Counting the actual batch
    # lengths keeps the function self-contained: it does not rely on a global
    # `batch_size` variable being defined, and stays correct if a batch is
    # smaller than the others.
    samples_seen = 0
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()

        samples_seen += len(X)
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f}  [{samples_seen:>5d}/{size:>5d}]")


def test_loop(dataloader, model, loss_fn):
    """
    Evaluates the model's performance on a test dataset.

    Parameters:
        dataloader (torch.utils.data.DataLoader): DataLoader providing batches of test data (inputs and labels).
        model (torch.nn.Module): The PyTorch model to be evaluated.
        loss_fn (function): The loss function used to compute the error between predictions and true labels.

    Behavior:
        - Sets the model to evaluation mode (which disables behaviors like dropout).
        - Iterates over the test data without computing gradients (using torch.no_grad() for efficiency).
        - Accumulates the total loss and counts the number of correct predictions.
        - Computes the average loss and overall accuracy.
        - Prints the test accuracy and average loss.

    Returns:
        None
    """ 
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    # Evaluating the model with torch.no_grad() ensures that no gradients are computed during test mode
    # also serves to reduce unnecessary gradient computations and memory usage for tensors with requires_grad=True
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [6]:
# Train the model for 3 different hyper parameter settings (e.g. different learning rates, different loss functions that make sense for your data, etc.)

# Hyperparameters for this run.
learning_rate = 1e-3
# Read the batch size from the DataLoader that actually batches the data rather
# than repeating the literal here: the loaders were already built above, so a
# separate number in this cell would not change how the data is batched.
batch_size = train_dataloader.batch_size
epochs = 5

# Cross-entropy loss on the model's logits, optimized with plain SGD.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Each epoch is one pass over the training data followed by an evaluation on the test data.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.307856  [   64/60000]
loss: 2.284600  [ 6464/60000]
loss: 2.273957  [12864/60000]
loss: 2.258824  [19264/60000]
loss: 2.248000  [25664/60000]
loss: 2.228622  [32064/60000]
loss: 2.229075  [38464/60000]
loss: 2.207806  [44864/60000]
loss: 2.193921  [51264/60000]
loss: 2.158929  [57664/60000]
Test Error: 
 Accuracy: 50.5%, Avg loss: 2.157776 

Epoch 2
-------------------------------
loss: 2.174810  [   64/60000]
loss: 2.160250  [ 6464/60000]
loss: 2.105361  [12864/60000]
loss: 2.106467  [19264/60000]
loss: 2.081221  [25664/60000]
loss: 2.017720  [32064/60000]
loss: 2.044077  [38464/60000]
loss: 1.974459  [44864/60000]
loss: 1.964731  [51264/60000]
loss: 1.892145  [57664/60000]
Test Error: 
 Accuracy: 60.5%, Avg loss: 1.893542 

Epoch 3
-------------------------------
loss: 1.931175  [   64/60000]
loss: 1.905419  [ 6464/60000]
loss: 1.784084  [12864/60000]
loss: 1.810077  [19264/60000]
loss: 1.733045  [25664/60000]
loss: 1.670145  [32064/600

## Step 4: Record results

Results are typically recorded in a table. For inspiration, check out the famous [Attention is All You Need paper](https://arxiv.org/abs/1706.03762). This paper first introduced the transformer architecture we will learn about later this semester and has been highly influential in deep learning. Check out how results are reported in Tables 2, 3, and 4.
Note that because the transformers in that paper are used for text generation, results are reported using the BLEU score (the higher the better).

You should create your own result tables to record how different hyperparameter settings affect performance. Make sure to record test accuracy, not training accuracy!