## Libraries

In [1]:
# System libraries
import os
from pathlib import Path

# Third-party libraries
import torch
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

## Constants

In [2]:
# Seed passed to the train/validation/test splits
random_state = 42

# Locations of the metadata table and the image folder, relative to the
# parent of the current working directory
root_path = Path.cwd().parent
metadata_path = root_path.joinpath('data', 'metadata_updated.csv')
image_path = root_path.joinpath('data', 'raw_dataset', 'images')

## Load Metadata

In [3]:
# Read the per-image metadata table (file name, skin tone, malignancy, disease)
# NOTE(review): the 'Unnamed: 0*' columns look like leftover indices from
# earlier CSV exports - confirm before relying on them.
metadata = pd.read_csv(filepath_or_buffer=metadata_path)

# Preview the first rows as a quick sanity check
metadata.head()

Unnamed: 0.1,Unnamed: 0,DDI_ID,DDI_file,skin_tone,malignant,disease
0,0,1,000001.png,56,1,melanoma-in-situ
1,1,2,000002.png,56,1,melanoma-in-situ
2,2,3,000003.png,56,1,mycosis-fungoides
3,3,4,000004.png,56,1,squamous-cell-carcinoma-in-situ
4,4,5,000005.png,12,1,basal-cell-carcinoma


## Split Training, Validation and Testing Set

We split the dataset into training (60%), validation (20%) and testing (20%) sets using stratified sampling to ensure balanced distribution of malignant and benign cases across all splits. This approach maintains the same proportion of classes in each subset, which is important for model training and evaluation, especially with imbalanced datasets.

We first split the data into train (60%) and a temporary set (40%), then further divide the temporary set into validation and test sets of equal size.

In [4]:
# Stage 1: keep 60% for training and set 40% aside as a temporary hold-out pool.
# The pool gets its own name so that `val_df` only ever refers to the final
# validation split.
train_df, holdout_df = train_test_split(
    metadata,
    test_size=0.4,
    stratify=metadata['malignant'],
    random_state=random_state,
)

# Stage 2: halve the hold-out pool into validation (20%) and test (20%),
# stratifying on the label again so the class proportions are preserved.
val_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    stratify=holdout_df['malignant'],
    random_state=random_state,
)

## Prepare Dataset

We create a custom PyTorch dataset class (`SkinDataset`) to efficiently load and preprocess skin lesion images for our deep learning model. The dataset class handles:

1. Loading images from file paths stored in our metadata DataFrame
2. Applying provided transformations to the images
3. Pairing each image with its corresponding label (malignant or benign)

In [5]:
class SkinDataset(Dataset):
    """
    Map-style dataset pairing each skin image with its malignancy label.

    Images are read from disk on every access, converted to RGB and passed
    through the supplied transform.
    """
    def __init__(self, dataframe: pd.DataFrame, img_dir: str, transform: callable):
        """
        Stores the metadata table, the image directory and the transform.

        Parameters
        ----------
        dataframe : pandas.DataFrame
            Table with one row per image. Must contain the columns
            'DDI_file' (image file name) and 'malignant' (label).
        img_dir : str
            Directory where images are stored (a str or a pathlib.Path).
        transform : callable
            Callable applied to the RGB PIL image; its return value is what
            `__getitem__` yields as the image.
        """
        # Re-index to 0..n-1 so positional indices from a sampler can be used
        # as labels with `.loc`, whatever index the split left behind.
        self.dataframe = dataframe.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self) -> int:
        """ Returns the number of samples in the dataset. """
        return len(self.dataframe)

    def __getitem__(self, idx: int) -> tuple:
        """
        Retrieves an image and its label by index.

        Parameters
        ----------
        idx : int
            Index of the sample to retrieve, in the range 0..len(self)-1.

        Returns
        -------
        tuple
            A tuple containing the transformed image and its label.
        """
        # Create image path with image directory and filename
        img_path = Path(self.img_dir) / self.dataframe.loc[idx, 'DDI_file']

        # Retrieve the label for the image
        label = self.dataframe.loc[idx, 'malignant']

        # Open inside a context manager so the file handle is released even if
        # decoding fails; `convert` returns an independent in-memory RGB image.
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        image = self.transform(image)

        # Return the transformed image and its label
        return image, label

    def get_labels(self) -> np.ndarray:
        """
        Returns the labels of the dataset.

        Returns
        -------
        numpy.ndarray
            Values of the 'malignant' column, in dataset order.
        """
        return self.dataframe['malignant'].values

## Data Augmentation

Data augmentation is a crucial technique for improving model generalisation and performance, especially when working with limited datasets. Our augmentation function applies various transformations to the training data to artificially increase the diversity of the training set.

The `augmentation()` function creates two separate transformation pipelines:

1. **Training Transformations:** Apply various random modifications to training images to help the model learn more robust features:
  - Resize images to the target size passed as `resize` (299×299 for Xception, 224×224 for the Vision Transformer)
  - Random horizontal and vertical flips to simulate different orientations
  - Random affine transformations (rotation, translation, scaling) to provide positional variance
  - Color jitter to simulate lighting variations
  - Normalisation with ImageNet mean and standard deviation values
  - Random erasing to help the model learn to identify lesions even with partial occlusions
  - Gaussian blur to simulate focus variations in dermatoscopic images

2. **Test/Validation Transformations:** Apply only essential preprocessing:
  - Resize images to the required dimensions
  - Normalisation to match training data distribution

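The augmentation settings are currently fixed inside the function; only the target image size is configurable, through the `resize` parameter. To experiment with different augmentation strategies, edit the transformation list in `augmentation()` directly.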

In [6]:
def augmentation(resize: tuple) -> tuple:
    """
    Builds the torchvision pipelines used for training and for evaluation.

    The training pipeline resizes the image, applies random flips, a random
    affine transform and colour jitter, converts to a tensor, normalises it,
    and finally applies random erasing and Gaussian blur. The evaluation
    pipeline only resizes, converts to a tensor and normalises.

    Parameters
    ----------
    resize : tuple
        Target image size as (height, width).

    Returns
    -------
    tuple of torchvision.transforms.Compose
        The pair (train_transform, test_transform).
    """
    # Per-channel normalisation statistics shared by both pipelines
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    # Training: random augmentations around the tensor conversion
    train_steps = [
        transforms.Resize(resize),
        transforms.RandomHorizontalFlip(p=0.2),
        transforms.RandomVerticalFlip(p=0.2),
        transforms.RandomAffine(degrees=20, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
        transforms.RandomErasing(p=0.2, scale=(0.02, 0.1)),
        transforms.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),
    ]

    # Evaluation: deterministic preprocessing only
    eval_steps = [
        transforms.Resize(resize),
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ]

    return transforms.Compose(train_steps), transforms.Compose(eval_steps)

## Create Data Loaders

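The `create_data_loaders()` function creates PyTorch DataLoader objects for the training, validation, and testing datasets, applying the augmentation pipeline to the training set and the plain preprocessing pipeline to the validation and test sets. The training loader shuffles its data (or, with `oversample=True`, draws samples through `ImbalancedDatasetSampler`), while the validation and test loaders iterate in a fixed order.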


In [7]:
def create_data_loaders(resize: tuple, batch_size: int = 32, oversample: bool = False) -> tuple:
    """
    Create data loaders for training, validation, and testing datasets.

    The datasets are built from the module-level `train_df`, `val_df` and
    `test_df` frames and the module-level `image_path` directory. The training
    dataset uses the augmenting transform; validation and test use the plain
    preprocessing transform.

    Parameters
    ----------
    resize : tuple
        Tuple specifying the size to which images should be resized (height, width).
    batch_size : int, optional
        Batch size for the data loaders. Defaults to 32.
    oversample : bool, optional
        Whether to draw training samples through `ImbalancedDatasetSampler`
        instead of plain shuffling. Defaults to False.

    Returns
    -------
    tuple of torch.utils.data.DataLoader
        A tuple containing the training, validation, and testing data loaders.

    Raises
    ------
    ImportError
        If `oversample` is True and the `torchsampler` package is not installed.
    """
    # Initialise transformations
    train_transform, test_transform = augmentation(resize=resize)

    # Create datasets
    train_dataset = SkinDataset(train_df, image_path, transform=train_transform)
    val_dataset = SkinDataset(val_df, image_path, transform=test_transform)
    test_dataset = SkinDataset(test_df, image_path, transform=test_transform)

    # Create the training loader
    if oversample:
        # Imported lazily: torchsampler is only needed for oversampling, so the
        # default path keeps working when the package is not installed.
        from torchsampler import ImbalancedDatasetSampler

        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            sampler=ImbalancedDatasetSampler(train_dataset),
        )
    else:
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Evaluation loaders keep a fixed order
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Return the data loaders
    return train_loader, val_loader, test_loader

In [8]:
# Target architecture; also names the output sub-folder
model_type = 'ViT'

# 224x224 is the Vision Transformer input size (use resize=(299, 299) for Xception)
train_loader, val_loader, test_loader = create_data_loaders(
    resize=(224, 224),
    batch_size=32,
)

# Make sure the output folder for the serialised datasets exists
save_dir = root_path / 'data' / 'processed' / model_type
save_dir.mkdir(parents=True, exist_ok=True)

# Pull the underlying datasets back out of the loaders
train_dataset, val_dataset, test_dataset = (
    loader.dataset for loader in (train_loader, val_loader, test_loader)
)

def extract_and_save_dataset(dataset, name, output_dir=None):
    """
    Materialises a whole dataset in memory and saves it as a single `.pt` file.

    Every sample is fetched once through `dataset[i]`, so whatever the dataset
    does on access (including any random transform) is captured once in the
    saved tensors. All samples are held in memory at the same time, which may
    be slow or infeasible for large datasets.

    Parameters
    ----------
    dataset : indexable
        Object supporting `len()` and integer indexing, where each item is an
        (image tensor, label) pair. All image tensors must share one shape.
    name : str
        Split name; the file is written as '<name>_dataset.pt'.
    output_dir : str or pathlib.Path, optional
        Directory to write into. Defaults to the module-level `save_dir`.
    """
    # Fall back to the notebook-level output folder when none is given
    target_dir = save_dir if output_dir is None else Path(output_dir)

    images = []
    labels = []
    for i in range(len(dataset)):
        image, label = dataset[i]
        images.append(image)
        labels.append(label)

    # Convert to tensors: images gain a leading sample dimension
    images = torch.stack(images)
    labels = torch.tensor(labels)

    # Save tensors
    torch.save({
        'images': images,
        'labels': labels
    }, target_dir / f'{name}_dataset.pt')

    print(f"Saved {name} dataset with {len(dataset)} samples")

# The training dataset applies random augmentations on access, and those get
# saved into the tensors. Seed torch's global RNG (which torchvision's random
# transforms draw from) so the saved training set is the same on every
# Restart & Run All.
torch.manual_seed(random_state)

# Save all datasets
extract_and_save_dataset(train_dataset, 'train')
extract_and_save_dataset(val_dataset, 'val')
extract_and_save_dataset(test_dataset, 'test')

Saved train dataset with 393 samples
Saved val dataset with 131 samples
Saved test dataset with 132 samples
