# Custom Mask R-CNN Training

Training notebook for custom Mask R-CNN with EfficientNet backbone and CBAM attention.


In [None]:
# Clone the project repository and switch the notebook's working directory to
# its root (presumably so the project packages imported in later cells resolve
# -- confirm against the repository layout).
!git clone https://github.com/michaelo-ponteski/isaid-instance-segmentation.git
%cd isaid-instance-segmentation/

In [None]:
# Fetch the `ponteski` branch from `origin` and integrate it into the branch
# that is currently checked out.
!git pull origin ponteski

In [None]:
# Download the iSAID patches dataset via kagglehub.
# NOTE: this only succeeds once the dataset's compression is complete.
import kagglehub

# No version is pinned, so the latest version is downloaded.
dataset_handle = "michaeloponteski/isaid-patches"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
import torch
import matplotlib.pyplot as plt
import importlib

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print(f"Device: {device}")
if has_cuda:
    # Report the name and total memory (in GB) of GPU 0.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

In [None]:
# Reload project modules for development, so edits made on disk are picked up
# without restarting the kernel.
import datasets.isaid_dataset
import models.maskrcnn_model
import models.backbone
import utils.overfit_test
import training
import training.trainer
import training.transforms

# importlib.reload() re-executes a single module; any other module that already
# copied a name with `from x import y` keeps the old object. Modules are
# therefore reloaded from likely dependencies to their dependants.
# NOTE(review): assumes maskrcnn_model imports from backbone and trainer imports
# from transforms -- confirm against the package sources (the order is harmless
# if they do not).
for _module in (
    datasets.isaid_dataset,
    models.backbone,
    models.maskrcnn_model,
    utils.overfit_test,
    training.transforms,
    training.trainer,
    # The package itself goes last: `from training import ...` below reads the
    # names off the package, which (presumably re-exporting them from its
    # submodules) would otherwise still expose the pre-reload objects.
    training,
):
    importlib.reload(_module)

from datasets.isaid_dataset import iSAIDDataset, visualize_sample
from models.maskrcnn_model import CustomMaskRCNN, get_custom_maskrcnn
from models.backbone import BackboneWithFPN, build_custom_backbone_with_fpn
from utils.overfit_test import overfit_single_image_test
from training import Trainer, create_datasets

## Load Dataset


In [None]:
import os

# Directory holding the patches inside the kagglehub download location.
# os.path.join accepts both str and os.PathLike values and supplies the
# platform separator, unlike plain string concatenation.
root_dir = os.path.join(path, "iSAID_patches")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Class count handed to the model.
# NOTE(review): presumably the 15 iSAID categories plus background -- confirm.
num_classes = 16

In [None]:
# Build the train/val datasets a single time; later training runs can reuse them.
dataset_options = {
    "data_root": root_dir,
    "image_size": 800,
    "subset_fraction": 1.0,  # Use full dataset (set < 1.0 for quick testing)
}
train_dataset, val_dataset = create_datasets(**dataset_options)

n_train, n_val = len(train_dataset), len(val_dataset)
print(f"Train: {n_train}, Val: {n_val}")

In [None]:
# Display one example from the training dataset.
sample_index = 19
visualize_sample(train_dataset, sample_index)

## Create Model


In [None]:
# Assemble the feature extractor: backbone plus FPN
# (default: EfficientNet + CBAM attention).
backbone, fpn, fpn_out_channels = build_custom_backbone_with_fpn(pretrained=True)
backbone_with_fpn = BackboneWithFPN(backbone, fpn)

# Wrap the backbone in the Mask R-CNN model.
model_kwargs = dict(
    num_classes=num_classes,
    backbone_with_fpn=backbone_with_fpn,
    pretrained_backbone=True,
)
model = CustomMaskRCNN(**model_kwargs)

# Print a short size summary of the model.
info = model.get_model_info()
summary_lines = [
    f"Total parameters: {info['total_parameters']:,}",
    f"Trainable parameters: {info['trainable_parameters']:,}",
    f"Model size: {info['model_size_mb']:.1f} MB",
]
print("\n".join(summary_lines))

## Overfit Single Image Test

Sanity check: can the model learn to overfit a handful of images (the cell below uses 3)?


In [None]:
# Sanity check: run the overfit test with num_images=3 and idx=19.
overfit_options = dict(idx=19, num_epochs=500, device=device, num_images=3)
losses, preds = overfit_single_image_test(model, train_dataset, **overfit_options)

## Training


In [None]:
# Build a brand-new model for the real training run; the datasets created
# earlier are reused as-is and do not need to be reloaded.
# NOTE(review): unlike the first model construction in this notebook, no
# `pretrained_backbone` argument is passed here -- confirm this is intended.
backbone, fpn, _ = build_custom_backbone_with_fpn(pretrained=True)
backbone_with_fpn = BackboneWithFPN(backbone, fpn)
model = CustomMaskRCNN(backbone_with_fpn=backbone_with_fpn, num_classes=num_classes)

In [None]:
# Hand the already-built datasets and model to the trainer, so the same
# datasets can be reused across training runs with different backbones.
trainer_config = dict(batch_size=8, lr=1e-4, device=device, use_amp=True)
trainer = Trainer(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model=model,
    **trainer_config,
)

In [None]:
# Run training with all metrics tracked.
# The returned history is a dictionary with TensorBoard-compatible keys.
fit_options = {
    "epochs": 20,
    "save_dir": "checkpoints",
    "find_lr_first": True,  # Run LR finder before training
    "compute_metrics_every": 1,  # Compute mAP every epoch
}
history = trainer.fit(**fit_options)

## Training History Visualization

Plot all training metrics including losses, mAP, gradient norms, and training dynamics.


In [None]:
# Plot training history on a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

num_epochs = len(history["train/loss"])
epochs = range(1, num_epochs + 1)


def _draw_metric_panel(panel_ax, curves, ylabel, title):
    """Plot one or more history series against `epochs` on a single axes.

    `curves` is a sequence of (history key, matplotlib format string, legend
    label) tuples. A label of None plots the curve without a legend entry, and
    the legend is only drawn when at least one curve is labelled.
    """
    labelled = False
    for key, fmt, label in curves:
        line_kwargs = {"linewidth": 2}
        if label is not None:
            line_kwargs["label"] = label
            labelled = True
        panel_ax.plot(epochs, history[key], fmt, **line_kwargs)
    panel_ax.set_xlabel("Epoch")
    panel_ax.set_ylabel(ylabel)
    panel_ax.set_title(title)
    if labelled:
        panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)


# Panels 1-5 are plain line plots: (grid position, curves, y-label, title)
simple_panels = [
    # 1. Loss curves
    (
        (0, 0),
        [("train/loss", "b-", "Train"), ("val/loss", "r-", "Val")],
        "Loss",
        "Training & Validation Loss",
    ),
    # 2. mAP@0.5 curves
    (
        (0, 1),
        [("train/mAP@0.5", "b-", "Train"), ("val/mAP@0.5", "r-", "Val")],
        "mAP@0.5",
        "mAP@0.5 (Primary Metric)",
    ),
    # 3. Mean IoU (validation)
    ((0, 2), [("val/mean_iou", "g-", None)], "Mean IoU", "Validation Mean IoU"),
    # 4. Gradient Norm
    (
        (1, 0),
        [("train/grad_norm", "purple", None)],
        "Gradient Norm (L2)",
        "Epoch-Averaged Gradient Norm",
    ),
    # 5. Loss Variance (stability indicator)
    (
        (1, 1),
        [("train/loss_variance", "orange", None)],
        "Loss Variance",
        "Loss Variance (Training Stability)",
    ),
]
for position, curves, ylabel, title in simple_panels:
    _draw_metric_panel(axes[position], curves, ylabel, title)

# 6. mAP Gap (overfitting indicator): line plus shaded regions above/below zero
ax = axes[1, 2]
gap = history["train_val/mAP_gap"]
ax.plot(epochs, gap, "brown", linewidth=2)
ax.axhline(y=0, color="k", linestyle="--", alpha=0.5)
shaded_regions = (
    ([g > 0 for g in gap], "red", "Overfitting"),
    ([g <= 0 for g in gap], "blue", "Underfitting"),
)
for mask, color, label in shaded_regions:
    ax.fill_between(epochs, 0, gap, where=mask, alpha=0.3, color=color, label=label)
ax.set_xlabel("Epoch")
ax.set_ylabel("mAP Gap (train - val)")
ax.set_title("mAP Gap (Overfitting Indicator)")
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig("training_history.png", dpi=150, bbox_inches="tight")
plt.show()

# Report the last recorded value of the main validation metrics
final_val_loss = history["val/loss"][-1]
final_val_map = history["val/mAP@0.5"][-1]
final_val_iou = history["val/mean_iou"][-1]
print("\nFinal Metrics:")
print(f"  Val Loss: {final_val_loss:.4f}")
print(f"  Val mAP@0.5: {final_val_map:.4f}")
print(f"  Val Mean IoU: {final_val_iou:.4f}")

In [None]:
# Learning rate schedule over training
lr_values = history["train/lr"]
# Derive the x-axis from the plotted series itself, so this cell does not rely
# on an `epochs` variable left behind by another cell and always matches the
# length of the learning-rate history.
lr_epochs = range(1, len(lr_values) + 1)

# Explicit figure/axes interface; distinct names avoid clobbering `fig`/`ax`
# from the training-history figure.
lr_fig, lr_ax = plt.subplots(figsize=(10, 4))
lr_ax.plot(lr_epochs, lr_values, "g-", linewidth=2)
lr_ax.set_xlabel("Epoch")
lr_ax.set_ylabel("Learning Rate")
lr_ax.set_title("Learning Rate Schedule (ReduceLROnPlateau)")
lr_ax.set_yscale("log")  # log scale on the learning-rate axis
lr_ax.grid(True, alpha=0.3)
lr_fig.tight_layout()
plt.show()

In [None]:
# Persist the metric history to disk for later analysis / TensorBoard
import json

# Convert to serializable format: every recorded value becomes a built-in float
history_serializable = {}
for metric_name, metric_values in history.items():
    history_serializable[metric_name] = [float(value) for value in metric_values]

with open("training_history.json", "w") as f:
    json.dump(history_serializable, f, indent=2)

print("Saved training history to training_history.json")

## Load Best Model and Evaluate

Load the best checkpoint and evaluate it on the validation set.


In [None]:
# Restore the best checkpoint and switch the model to evaluation mode.
# Available files: "checkpoints/best.pth" (best loss) or
# "checkpoints/best_map.pth" (best mAP); the best-mAP one is used here.
best_checkpoint_path = "checkpoints/best_map.pth"
checkpoint = trainer.load_checkpoint(best_checkpoint_path)
trainer.model.eval()
print("Loaded best mAP model")

In [None]:
# Final evaluation on validation set: losses first, then mAP / mean IoU
val_losses = trainer.validate()
val_map, val_mean_iou = trainer.compute_map(trainer.val_loader, iou_threshold=0.5)

# Assemble the report and emit it in one go
separator = "=" * 50
report_lines = [
    "\n" + separator,
    "Final Validation Results:",
    separator,
    f"  Loss: {val_losses['total']:.4f}",
    f"  mAP@0.5: {val_map:.4f}",
    f"  Mean IoU: {val_mean_iou:.4f}",
    separator,
]
print("\n".join(report_lines))

## Visualize


In [None]:
# Visualize predictions on validation set (5 samples, score threshold 0.5)
prediction_view = dict(num_samples=5, score_threshold=0.5)
trainer.visualize_predictions(**prediction_view)

In [None]:
# Quick inference on a single validation sample
idx = 50
image, target = val_dataset[idx]

# Ensure image is a tensor before handing it to the model
if not isinstance(image, torch.Tensor):
    from torchvision.transforms.functional import to_tensor

    image = to_tensor(image)

# Evaluation mode, no gradient tracking; the model takes a list of images and
# the first (only) prediction is kept
trainer.model.eval()
with torch.no_grad():
    pred = trainer.model([image.to(device)])[0]

# Filter by score threshold
score_threshold = 0.5
keep = pred["scores"] > score_threshold

num_ground_truth = len(target["boxes"])
num_detections = keep.sum().item()
kept_labels = pred["labels"][keep].cpu().tolist()
kept_scores = [f"{s:.2f}" for s in pred["scores"][keep].cpu().tolist()]

print(f"Sample {idx}:")
print(f"  Ground truth: {num_ground_truth} objects")
print(f"  Predictions: {num_detections} detections (score > {score_threshold})")
print(f"  Predicted classes: {kept_labels}")
print(f"  Confidence scores: {kept_scores}")