# 07. PyTorch Experiment Tracking Exercises

Resource: https://www.learnpytorch.io/07_pytorch_experiment_tracking/

## 0. Get imports and helper function

In [1]:
import torch
import torchvision

from torch import nn
from torchvision import transforms

from torch.utils.tensorboard import SummaryWriter

In [2]:
import matplotlib.pyplot as plt

In [3]:
from src import get_data, setup_data, engine

In [4]:
# Set device agnostic code
device = "cuda" if torch.cuda.is_available() else (
    "mps" if torch.mps.is_available() else "cpu"
)
device

'mps'

In [5]:
# Set seeds
def set_seeds(seed: int=42):
    """Sets random sets for torch operations.

    Args:
        seed (int, optional): Random seed to set. Defaults to 42.
    """
    # Set the seed for general torch operations
    torch.manual_seed(seed)
    # Set the seed for CUDA torch operations (ones that happen on the GPU)
    torch.cuda.manual_seed(seed)


In [34]:
def create_writer(
    experiment_name: str,
    model_name: str,
    extra: str=None,
):
    """Creates a torch.utils.tensorboard.SummaryWriter() instance saving to a specific log_dir"""

    from datetime import datetime
    import os
    
    timestampe = datetime.now().strftime("%y-%m-%d")
    
    if extra:
        log_dir = os.path.join("runs", timestampe, experiment_name, model_name, extra)
    else:
        log_dir = os.path.join("runs", timestampe, experiment_name, model_name)
    
    print(f"[INFO] Created SummaryWriter, saving to {log_dir}...")
    return SummaryWriter(log_dir=log_dir)

In [35]:
from typing import Dict, List
import torch.utils.tensorboard
from tqdm.auto import tqdm

def train(
    model: torch.nn.Module,
    train_dataloader: torch.utils.data.DataLoader,
    test_dataloader: torch.utils.data.DataLoader,
    optimizer: torch.optim.Optimizer,
    loss_fn: torch.nn.Module,
    epochs: int,
    device: torch.device,
    writer: torch.utils.tensorboard.writer.SummaryWriter
) -> Dict[str, List]:
    """Trains and test PyTorch model"""
    # Create empty results dictionary
    results = {
        "train_loss": [],
        "train_acc": [],
        "test_loss": [],
        "test_acc": []
    }
    
    # Loop through training and testing steps for a number of epochs
    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = engine.train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            device=device
        )
        
        test_loss, test_acc = engine.test_step(
            model=model,
            dataloader=test_dataloader,
            loss_fn=loss_fn,
            device=device
        )

        # Print out what's happening
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        # Update results dictionary
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)


        ### New: Use the writer parameter to track experiments ###
        # See if there's a writer, if so, log to it
        if writer:
            # Add results to SummaryWriter
            writer.add_scalars(
                main_tag="Loss", 
                tag_scalar_dict={
                    "train_loss": train_loss,
                    "test_loss": test_loss
                },
                global_step=epoch
            )
            
            writer.add_scalars(
                main_tag="Accuracy", 
                tag_scalar_dict={
                    "train_acc": train_acc,
                    "test_acc": test_acc
                }, 
                global_step=epoch
            )

            # Close the writer
            writer.close()
        else:
            pass
    ### End new ###

    # Return the filled results at the end of the epochs
    return results

## 1. Download data

In [36]:
get_data.get_data(
    data_dir_str="data/",
    image_path_str="pizza_steak_sushi_10_percent",
    data_url_str="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip",
    file_name_str="pizza_steak_sushi.zip"
)

get_data.get_data(
    data_dir_str="data/",
    image_path_str="pizza_steak_sushi_20_percent",
    data_url_str="https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi_20_percent.zip",
    file_name_str="pizza_steak_sushi_20_percent.zip"
)

data/pizza_steak_sushi_10_percent exists...
Data in data/pizza_steak_sushi_10_percent already exits, skipping downloading and unzipping...
Finished getting data...
data/pizza_steak_sushi_20_percent exists...
Data in data/pizza_steak_sushi_20_percent already exits, skipping downloading and unzipping...
Finished getting data...


In [37]:
from pathlib import Path

data_10_percent_path = Path("data/pizza_steak_sushi_10_percent")
data_20_percent_path = Path("data/pizza_steak_sushi_20_percent")

# Setup training directory paths
train_dir_10_percent = data_10_percent_path / "train"
train_dir_20_percent = data_20_percent_path / "train"

# Setup testing directory paths (note: use the same test dataset for both to compare the results)
test_dir = data_10_percent_path / "test"

# Check the directories
print(f"Training directory 10%: {train_dir_10_percent}")
print(f"Training directory 20%: {train_dir_20_percent}")
print(f"Testing directory: {test_dir}")

Training directory 10%: data/pizza_steak_sushi_10_percent/train
Training directory 20%: data/pizza_steak_sushi_20_percent/train
Testing directory: data/pizza_steak_sushi_10_percent/test


In [38]:

# Create a transform to normalize data distribution to be inline with ImageNet
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406], # values per colour channel [red, green, blue]
    std=[0.229, 0.224, 0.225]
)

# Create a transform pipeline
simple_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(), # get image values between 0 & 1
    normalize
])

## 2. Turn data into DataLoaders

In [39]:
BATCH_SIZE = 32

# Create 10% training and test DataLoaders
train_dataloader_10_percent, test_dataloader, class_names = setup_data.create_dataloaders(
    train_dir=train_dir_10_percent,
    test_dir=test_dir,
    transform=simple_transform,
    batch_size=BATCH_SIZE
)

# Create 20% training and test DataLoaders
train_dataloader_20_percent, test_dataloader, class_names = setup_data.create_dataloaders(
    train_dir=train_dir_20_percent,
    test_dir=test_dir,
    transform=simple_transform,
    batch_size=BATCH_SIZE
)

# Find the number of samples/batches per dataloader (using the same test_dataloader for both experiments)
print(f"Number of batches of size {BATCH_SIZE} in 10 percent training data: {len(train_dataloader_10_percent)}")
print(f"Number of batches of size {BATCH_SIZE} in 20 percent training data: {len(train_dataloader_20_percent)}")
print(f"Number of batches of size {BATCH_SIZE} in testing data: {len(train_dataloader_10_percent)} (all experiments will use the same test set)")
print(f"Number of classes: {len(class_names)}, class names: {class_names}")

Number of batches of size 32 in 10 percent training data: 8
Number of batches of size 32 in 20 percent training data: 15
Number of batches of size 32 in testing data: 8 (all experiments will use the same test set)
Number of classes: 3, class names: ['pizza', 'steak', 'sushi']


## 3. Exercise 1: Pick a larger model from torchvision.models to add to the list of experiments (for example, EffNetB3 or higher)

In [40]:
def create_effnetb0() -> nn.Module:
    weights = torchvision.models.EfficientNet_B0_Weights.DEFAULT
    model = torchvision.models.efficientnet_b0(weights=weights).to(device)
    
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.2, inplace=True),
        nn.Linear(in_features=1280, out_features=len(class_names), bias=True)
    ).to(device)
    
    print(f"[INFO]Created EfficientNetB0...")
    return model
    
def create_effnetb3() -> nn.Module:
    weights = torchvision.models.EfficientNet_B3_Weights.DEFAULT
    model = torchvision.models.efficientnet_b3(weights=weights).to(device)
    
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.3, inplace=True),
        nn.Linear(in_features=1536, out_features=len(class_names), bias=True)
    ).to(device)
    
    print(f"[INFO]Created EfficientNetB3...")
    return model

def create_effnetb5() -> nn.Module:
    weights = torchvision.models.EfficientNet_B5_Weights.DEFAULT
    model = torchvision.models.efficientnet_b5(weights=weights).to(device)
    
    model.classifier = nn.Sequential(
        nn.Dropout(p=0.4, inplace=True),
        nn.Linear(in_features=2048, out_features=len(class_names), bias=True)
    ).to(device)
    
    print(f"[INFO]Created EfficientNetB5...")
    return model

In [41]:
experiments = {
    "models": {
        "effnetb0": create_effnetb0,
        "effnetb3": create_effnetb3,
        "effnetb5": create_effnetb5,
    },
    "epochs": {
        "5": 5,
        "10": 10
    },
    "data": {
        "10_percent": train_dataloader_10_percent,
        "20_percent": train_dataloader_20_percent
    }
}

In [42]:
for k, v in experiments["models"].items():
    print(k)

effnetb0
effnetb3
effnetb5


In [None]:
from src import utils

for model_name, model_ in experiments["models"].items():
    
    for epochs_num, epochs in experiments["epochs"].items():
        
        for data_name, data in experiments["data"].items():
            
            model = model_()
                    
            optimizer = torch.optim.Adam(
                params=model.parameters(),
                lr=0.001
            )
            loss_fn = torch.nn.CrossEntropyLoss()
            
            train(
                model=model,
                train_dataloader=data,
                test_dataloader=test_dataloader,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=epochs,
                device=device,
                writer=create_writer(
                    experiment_name=data_name,
                    model_name=model_name,
                    extra=f"{epochs}_epochs"
                )
            )
            
            # Save model to file so we can import it later if need be
            save_filepath = f"07_{model_name}_{data_name}_{epochs}_epochs.pth"
            utils.save_model(
                model=model,
                target_dir="models",
                model_name=save_filepath
            )
            
            print("\n")

[INFO]Created EfficientNetB0...
[INFO] Created SummaryWriter, saving to runs/25-08-19/10_percent/effnetb0/5_epochs...


  0%|          | 0/5 [00:00<?, ?it/s]



Epoch: 1 | train_loss: 0.7201 | train_acc: 0.7617 | test_loss: 0.9357 | test_acc: 0.6525


In [None]:
# Let's view oru experiments from within notebook
%load_ext tensorboard
%tensorboard --logdir runs

## 4. Exercise 2. Introduce data augmentation to the list of experiments using the 20% pizza, steak, sushi training and test datasets, does this change anything?

## Exercise 3. Scale up the dataset to turn FoodVision Mini into FoodVision Big using the entire Food101 dataset from torchvision.models