In [1]:
import torch
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.optim as optim
import random
%matplotlib inline

In [32]:
%run ImageDataLoader.ipynb
%run ImageProcessor.ipynb
%run DataExplorer.ipynb
%run DatasetStatistics.ipynb
%run DuplicateDetector.ipynb
%run OversampledDataset.ipynb
%run BatchVisualizer.ipynb
%run VGG16Model.ipynb
%run ResNet34Model.ipynb
%run EfficientNetModel.ipynb
%run Trainer.ipynb

In [3]:
# --- Dataset locations -------------------------------------------------------
# Both splits live under one root directory; the sub-directories are derived
# from it so the root only has to be changed in one place.
DATA_ROOT = "./data"
TRAIN_DIR = f"{DATA_ROOT}/Training"
TEST_DIR = f"{DATA_ROOT}/Testing"

# --- Loading / preprocessing knobs -------------------------------------------
BATCH_SIZE = 64  # samples per DataLoader batch
IMAGE_SIZE = (224,) * 2  # target size handed to transforms.Resize
VAL_SPLIT = 0.15  # fraction of the training set held out for validation

In [None]:
device = (
    torch.accelerator.current_accelerator().type
    if torch.accelerator.is_available()
    else "cpu"
)
print(f"Using {device} device")
print(f"Accelerator name: {torch.cuda.get_device_name(device)}")

In [None]:
# Build the project's image loader over the training and testing directories
# and collect the result of load_all_images(). `all_files` is reused by the
# duplicate detector, the image processor and the data explorer below.
# NOTE(review): presumably a list of file paths covering BOTH splits — confirm
# against ImageDataLoader.
loader = ImageDataLoader(TRAIN_DIR, TEST_DIR)
all_files = loader.load_all_images()

print(f"Successfully loaded {len(all_files)} images")

In [None]:
# Scan the loaded files for duplicates. When any are found they are deleted
# from disk and `all_files` is replaced by the de-duplicated list, so every
# later cell works on unique files only.
# WARNING: remove_duplicates_from_disk() is destructive — running this cell
# modifies the dataset directories permanently.
duplicate_detector = DuplicateDetector(all_files)
duplicate_detector.detect_duplicates()

if len(duplicate_detector.duplicates) > 0:  # If duplicate files are present
    duplicate_detector.remove_duplicates_from_disk()  # Removing duplicates entirely from disk
    all_files = duplicate_detector.get_unique_files()  # Cleaning list with file paths

In [None]:
# Print the per-class image counts reported by the loader.
# NOTE(review): `loader` was created before duplicates were removed from disk;
# confirm this method re-scans the directories rather than using cached counts.
loader.print_dataset_class_count()

In [None]:
# Load the (de-duplicated) files as grayscale images and show a preview grid.
# The loaded images are reused later as `processor.gray_images` to compute the
# dataset statistics.
processor = ImageProcessor(all_files)

processor.load_grayscale_images()
processor.display_image_grid(batch_size=32, figsize=(18, 9), images_per_row=8)

In [None]:
# Pick three sample images by index (first, middle, last) and plot a histogram
# for them.
# NOTE(review): presumably a pixel-intensity histogram of the retrieved
# samples — confirm against DataExplorer.
explorator = DataExplorer(all_files)
explorator.retrieve_sample_of_images(
    [0, len(all_files) // 2, -1]
)  # First, middle and last image

explorator.plot_histogram()

In [None]:
# Compute summary statistics over the grayscale images and keep the two values
# returned by get_normalized_values(); MEAN and STD are fed to
# transforms.Normalize in the transform definitions below.
# NOTE(review): `processor.gray_images` was built from `all_files`, which the
# loader produced from both TRAIN_DIR and TEST_DIR — if test images are
# included, the normalization constants leak test-set information. Confirm.
stats = DatasetStatistics(processor.gray_images)

stats.compute_stats()
stats.print_stats()

MEAN, STD = stats.get_normalized_values()

In [11]:
# Training-time preprocessing: resize, light random augmentation, tensor
# conversion and normalization. The single MEAN/STD pair is replicated three
# times because Normalize is given one value per channel of a 3-channel input.
train_transform = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),  # bring every image to IMAGE_SIZE
        transforms.RandomHorizontalFlip(),  # random mirror (default probability)
        transforms.RandomRotation(12),  # random rotation within +/-12 degrees
        transforms.ColorJitter(brightness=0.15, contrast=0.15),  # mild photometric jitter
        transforms.ToTensor(),  # PIL image -> float tensor
        transforms.Normalize(mean=[MEAN] * 3, std=[STD] * 3),
    ]
)

In [12]:
# Evaluation-time preprocessing: the same resize and normalization as the
# training pipeline but without any random augmentation, so results are
# deterministic.
test_transform = transforms.Compose(
    [
        transforms.Resize(IMAGE_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=[MEAN] * 3, std=[STD] * 3),
    ]
)

In [None]:
# Training data comes from the project's OversampledDataset (with the
# augmenting transform); test data is a plain torchvision ImageFolder with the
# deterministic transform.
# NOTE(review): presumably OversampledDataset balances classes by repeating
# minority-class samples — confirm against OversampledDataset.ipynb.
trainset = OversampledDataset(TRAIN_DIR, transform=train_transform)
testset = datasets.ImageFolder(TEST_DIR, transform=test_transform)

print()
trainset.print_class_distribution()

# Summary of dataset sizes and the class names exposed by the training set.
print()
print(f"Training samples: {len(trainset)}")
print(f"Test samples: {len(testset)}")
print(f"Classes: {trainset.classes}")

In [None]:
# Hold out VAL_SPLIT of the training samples for validation. The training size
# is truncated to an integer and the validation set takes the remainder, so the
# two sizes always add up to len(trainset) exactly (required by random_split).
train_size = int((1 - VAL_SPLIT) * len(trainset))
val_size = len(trainset) - train_size

# Two adjacent f-strings instead of a backslash continuation inside one
# literal: the continuation leaked the source indentation into the output.
print(
    f"Validation set size: {val_size} images\n"
    f"Train set size: {train_size} images"
)

In [None]:
# Randomly partition the training dataset into train/validation subsets. The
# generator is seeded with 42 so the partition is reproducible across runs.
#
# NOTE(review): `trainset` is rebound from the full dataset to the returned
# Subset, so this cell is not idempotent — re-running it without re-creating
# the dataset splits the Subset (or fails on the size check). Later cells rely
# on the wrapper via `trainset.dataset`.
# NOTE(review): both subsets wrap the same dataset object built with
# `train_transform`, so validation samples go through the random
# augmentations too — confirm this is intended.
# NOTE(review): if OversampledDataset repeats samples, copies of one image can
# land in both subsets and inflate validation accuracy — verify.
trainset, valset = random_split(
    trainset, [train_size, val_size], generator=torch.Generator().manual_seed(42)
)

print(f"Train: {len(trainset)}, Validation: {len(valset)}")

In [16]:
# One loader per split. Only the training loader reshuffles every epoch;
# validation and test keep a fixed order. Batches are loaded in the main
# process (num_workers=0).
train_dl = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_dl = DataLoader(valset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
test_dl = DataLoader(testset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Visual sanity check of the loaders. `trainset` is the Subset returned by
# random_split at this point, so the class names are read from the wrapped
# dataset (`trainset.dataset`). The same per-channel mean/std used for
# normalization are passed to the visualizer.
# NOTE(review): presumably mean/std are used to undo the normalization before
# plotting — confirm against BatchVisualizer.
visualizer = BatchVisualizer(trainset.dataset.classes, mean=[MEAN] * 3, std=[STD] * 3)

visualizer.visualize_batch(train_dl)
visualizer.visualize_classes(val_dl, 6)

In [37]:
# Hyperparameter-search settings.
EPOCHS = 5  # epochs per configuration during the grid search
LEARNING_RATES = [1e-2, 1e-3, 1e-4, 1e-5]  # learning rates tried per optimizer
NUMBER_OF_CLASSES = 4  # passed to every model constructor

In [38]:
# Display name -> model class for every architecture in the grid search.
# Insertion order determines the order in which the models are trained.
models_config = dict(
    [
        ("VGG-16", VGG16Model),
        ("ResNet34", ResNet34Model),
        ("EfficientNet_B0", EfficientNetModel),
    ]
)

In [39]:
# Optimizer name -> factory. Each factory takes the model parameters and a
# learning rate and returns a freshly constructed optimizer, so every
# experiment starts from a clean optimizer state.
optimizers_config = dict(
    sgd=lambda params, lr: optim.SGD(params, lr=lr),
    sgd_momentum=lambda params, lr: optim.SGD(params, lr=lr, momentum=0.9),
    adam=lambda params, lr: optim.Adam(params, lr=lr),
)

In [21]:
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and PyTorch (CPU, the
    current CUDA device and all CUDA devices), then switches cuDNN to
    deterministic mode with autotuning disabled.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    seeders = (
        random.seed,  # Python standard library
        np.random.seed,  # NumPy global RNG
        torch.manual_seed,  # PyTorch CPU generator
        torch.cuda.manual_seed,  # PyTorch current GPU
        torch.cuda.manual_seed_all,  # PyTorch all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True  # only deterministic cuDNN kernels
    cudnn.benchmark = False  # no autotuning, for reproducibility

In [22]:
def run_experiment(
    model_class, optimizer_fn, lr, train_dl, val_dl, device, epochs=EPOCHS
):
    """Train one model/optimizer/learning-rate combination and summarize it.

    The global seed is reset to 42 first so every configuration starts from the
    same random state.

    Args:
        model_class: Callable that builds a model from the number of classes.
        optimizer_fn: Callable ``(params, lr) -> optimizer``.
        lr: Learning rate handed to ``optimizer_fn``.
        train_dl: DataLoader with the training data.
        val_dl: DataLoader with the validation data.
        device: Device passed to ``Trainer``.
        epochs: Number of epochs to train for.

    Returns:
        dict with ``last_epoch_val_acc``, ``best_val_acc``,
        ``last_epoch_val_loss`` and the full ``history`` returned by
        ``Trainer.fit``.
    """
    set_seed(42)

    net = model_class(NUMBER_OF_CLASSES)
    runner = Trainer(net, device)
    # Optimizer is created after the Trainer, mirroring the original order.
    opt = optimizer_fn(net.parameters(), lr)

    history = runner.fit(train_dl, val_dl, opt, epochs)

    val_accuracies = history["val_acc"]
    summary = {
        "last_epoch_val_acc": val_accuracies[-1],
        "best_val_acc": max(val_accuracies),
    }
    summary["last_epoch_val_loss"] = history["val_loss"][-1]
    summary["history"] = history
    return summary

In [None]:
def run_all_experiments(train_dl, val_dl, device):
    """Run the full grid search over models, optimizers and learning rates.

    Every combination of ``models_config`` x ``optimizers_config`` x
    ``LEARNING_RATES`` is trained via ``run_experiment``; progress and the
    final validation accuracy are printed for each run.

    Args:
        train_dl: DataLoader with the training data.
        val_dl: DataLoader with the validation data.
        device: Device forwarded to ``run_experiment``.

    Returns:
        list of dicts, one per configuration, containing the keys ``model``,
        ``optimizer`` and ``learning rate`` plus everything returned by
        ``run_experiment``.
    """
    # Flatten the three nested loops into one ordered list of configurations
    # (models outermost, learning rates innermost).
    grid = [
        (model_name, model_class, optim_name, optim_fn, lr)
        for model_name, model_class in models_config.items()
        for optim_name, optim_fn in optimizers_config.items()
        for lr in LEARNING_RATES
    ]
    total = len(grid)

    res = []
    for current, config in enumerate(grid, start=1):
        model_name, model_class, optim_name, optim_fn, lr = config
        print(f"\n[{current}/{total}] {model_name} + {optim_name} + lr={lr}")

        result = run_experiment(model_class, optim_fn, lr, train_dl, val_dl, device)

        # Identification fields first, then the metrics from the run.
        record = {"model": model_name, "optimizer": optim_name, "learning rate": lr}
        record.update(result)
        res.append(record)

        print(
            f"Last epoch's validation accuracy: {result['last_epoch_val_acc']:.4f}"
        )

    return res

In [None]:
# Run the whole grid search (every model x optimizer x learning rate) and keep
# the list of per-configuration result dicts. This trains
# len(models_config) * len(optimizers_config) * len(LEARNING_RATES) models, so
# expect this cell to take a long time.
results = run_all_experiments(train_dl, val_dl, device)

In [None]:
# Settings for the final training runs of the selected configurations.
# NOTE(review): PATIENCE is passed to Trainer.fit as its last positional
# argument below — presumably the number of epochs without improvement before
# early stopping; confirm against Trainer.
FINAL_EPOCHS = 50  # epoch budget for the final runs
PATIENCE = 10

In [None]:
# Final VGG-16 run: Adam with lr=1e-4.
set_seed(42)

# Build the model with the class count used in the grid search, so this run
# uses the same head size as run_experiment().
vgg16_model = VGG16Model(NUMBER_OF_CLASSES)
optimizer_vgg16 = optim.Adam(vgg16_model.parameters(), lr=0.0001)

trainer_vgg16 = Trainer(vgg16_model, device)
# Train with the final-run epoch budget. The search budget EPOCHS (5) is
# smaller than PATIENCE (10), so using it here would stop after five epochs.
vgg16_history = trainer_vgg16.fit(
    train_dl, val_dl, optimizer_vgg16, FINAL_EPOCHS, PATIENCE
)

In [None]:
# Final ResNet34 run: SGD with momentum 0.9 and lr=1e-3.
set_seed(42)

# Build the model with the class count used in the grid search, so this run
# uses the same head size as run_experiment().
resnet34_model = ResNet34Model(NUMBER_OF_CLASSES)
optimizer_resnet = optim.SGD(resnet34_model.parameters(), lr=0.001, momentum=0.9)

trainer_resnet34 = Trainer(resnet34_model, device)
# Train with the final-run epoch budget. The search budget EPOCHS (5) is
# smaller than PATIENCE (10), so using it here would stop after five epochs.
resnet34_history = trainer_resnet34.fit(
    train_dl, val_dl, optimizer_resnet, FINAL_EPOCHS, PATIENCE
)

In [None]:
# Final EfficientNet run: Adam with lr=1e-3.
set_seed(42)

# Build the model with the class count used in the grid search, so this run
# uses the same head size as run_experiment().
efficientnet_model = EfficientNetModel(NUMBER_OF_CLASSES)
optimizer_efficientnet = optim.Adam(efficientnet_model.parameters(), lr=0.001)

trainer_efficientnet = Trainer(efficientnet_model, device)
# Train with the final-run epoch budget. The search budget EPOCHS (5) is
# smaller than PATIENCE (10), so using it here would stop after five epochs.
efficientnet_history = trainer_efficientnet.fit(
    train_dl, val_dl, optimizer_efficientnet, FINAL_EPOCHS, PATIENCE
)