# Setup


In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path

# The data folder is a sibling of the directory the notebook is run from:
# <working directory>/../data
DATA_DIR = Path.cwd().parent.joinpath("data")

# Cell output: show the resolved location.
DATA_DIR

In [None]:
import torch

# Prefer the CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Cell output: show which device was selected.
device

In [None]:
import torch

# Dedicated, explicitly seeded random generator; it is handed to the
# dataset split and to the shuffling train loader further down.
generator = torch.Generator()
generator = generator.manual_seed(42)

In [None]:
import torch

# Set PyTorch's global float32 matrix-multiplication precision mode to "high".
torch.set_float32_matmul_precision(precision="high")

# Dataset


In [None]:
# Input files, both located directly inside DATA_DIR.
TRAIN_IMAGES_FILE = DATA_DIR.joinpath("train-image.hdf5")
TRAIN_METADATA_FILE = DATA_DIR.joinpath("train-metadata.csv")

# Cell output: show both paths as a tuple.
(TRAIN_IMAGES_FILE, TRAIN_METADATA_FILE)

In [None]:
from isic.dataset import ISICDataset

# Build the dataset from the image file and the metadata CSV defined above.
ds = ISICDataset(
    TRAIN_IMAGES_FILE,
    TRAIN_METADATA_FILE,
)

# Cell output: number of samples in the dataset.
len(ds)

In [None]:
# Preview the first five samples: print each sample's ISIC id and target,
# then render a 128x128 thumbnail of its image inline.
#
# `display` (injected into the namespace by IPython) is used instead of
# `Image.show()`: `show()` hands the picture to an external viewer program,
# which renders nothing in the notebook and does not work on a remote or
# headless kernel.
# NOTE(review): assumes `image` is a PIL image (it exposes `.resize`) - confirm.
with ds:
    for i in range(5):
        metadata, image, target = ds[i]
        print(metadata["isic_id"], target)
        display(image.resize((128, 128)))

# Experiment Setup


In [None]:
# Hyperparameters for this experiment; the same dict is later passed to the
# experiment tracker as the run configuration.
params = dict(
    epochs=1,
    batch_size=128,
    learning_rate=0.001,
    image_size=128,
    threshold=0.5,
)

# Convenience aliases used by the cells below.
epochs, batch_size = params["epochs"], params["batch_size"]
lr = params["learning_rate"]
img_size = (params["image_size"],) * 2  # square images: (side, side)
threshold = params["threshold"]

# Model Definition


In [None]:
from isic.models import MLP

# Instantiate the MLP for the configured image size, then move it to the
# selected device.
model = MLP(img_size)
model = model.to(device)

# Cell output: show the model.
model

In [None]:
# Count only the parameters that will receive gradients.
trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
total_params = sum(trainable_sizes)

# Cell output: number of trainable parameters.
total_params

# Training


In [None]:
# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Gather the figures first, then report them.
first_param = next(model.parameters())
n_all_params = sum(p.numel() for p in model.parameters())
n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Model device: {first_param.device}")
print(f"Total parameters: {n_all_params:,}")
print(f"Trainable parameters: {n_trainable_params:,}")

In [None]:
class_counts = [400666, 393]  # [benign, malignant] from EDA

# Report the class balance; one print call, one line per item.
print(
    "Class Distribution:",
    f"Benign: {class_counts[0]:,} samples",
    f"Malignant: {class_counts[1]:,} samples",
    f"Imbalance ratio: {class_counts[0] / class_counts[1]:.1f}:1",
    sep="\n",
)

Use a weighted BCE loss (`BCEWithLogitsLoss` with `pos_weight`) to handle the class imbalance.


In [None]:
import torch.nn as nn

# Report the positive-class weight exposed by the dataset.
print(f"Positive class weight: {ds.pos_weight:.1f}")

# One-element weight tensor on the training device, passed to the loss
# as its `pos_weight`.
pos_weight = torch.tensor(
    [ds.pos_weight],
    device=device,
)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

Open the dataset to acquire its underlying resources; it is closed again after the training loop.


In [None]:
# Open the dataset outside a `with` block so it stays open across the
# following cells; the matching `ds.close()` is in the training cell.
# The trailing `;` suppresses the cell's output.
ds.open();

In [None]:
from torch.utils.data import DataLoader, random_split
from isic.dataset import ImageEncoder, MetadataEncoder, BatchEncoder

# 80/20 train/validation split, made reproducible through the seeded generator.
train_size = int(0.8 * len(ds))
val_size = len(ds) - train_size
train_dataset, val_dataset = random_split(
    ds,
    [train_size, val_size],
    generator=generator,
)

print("Dataset sizes:")
print(f"Total: {len(ds):,}")
print(f"Train: {len(train_dataset):,}")
print(f"Validation: {len(val_dataset):,}")

# The batch encoder is used as the loaders' collate function and combines
# an image encoder with a metadata encoder.
# NOTE(review): the metadata encoder is fitted on the full `ds.metadata`,
# which includes rows that end up in the validation split - confirm this
# is intended.
image_encoder = ImageEncoder(image_size=img_size)
metadata_encoder = MetadataEncoder().fit(ds.metadata)
batch_encoder = BatchEncoder(
    image_encoder=image_encoder,
    metadata_encoder=metadata_encoder,
)

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=batch_size, collate_fn=batch_encoder)

# Training batches are reshuffled each epoch using the seeded generator;
# validation batches keep their order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    generator=generator,
    **loader_kwargs,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    **loader_kwargs,
)

print(f"Batches per epoch - Train: {len(train_loader)}, Val: {len(val_loader)}")

In [None]:
import trackio
from isic.training import train, validate, training_summary

# Run the train/validate loop for the configured number of epochs.
#
# Cleanup is done in `finally` blocks so that an exception or a kernel
# interrupt during training still finishes the tracking run and closes the
# dataset, instead of leaving both open in the live kernel.
# NOTE(review): `train_metrics` / `val_metrics` are not logged in this cell;
# presumably `train` / `validate` report to trackio themselves - confirm.
try:
    trackio.init(project="mlp", config=params, embed=False)
    try:
        for epoch in range(epochs):
            print(f"\nEpoch {epoch + 1}/{epochs}")
            print("-" * 60)

            # train
            train_metrics = train(
                model, train_loader, criterion, optimizer, device, threshold
            )

            # validate
            val_metrics, val_targets, val_predictions = validate(
                model, val_loader, criterion, device, threshold
            )
    finally:
        # Only reached once init succeeded, so there is a run to finish.
        trackio.finish()
finally:
    # Close the dataset opened by the earlier `ds.open()` cell.
    ds.close()

In [None]:
# Summarise the last epoch's validation targets and predictions, then print it.
summary_text = training_summary(val_targets, val_predictions)
print(summary_text)