# Single-Shot Detection
In this notebook, we will implement object detection from scratch using the Single-Shot Detection (SSD) approach on the Pascal VOC dataset, and then compare it with the standard torchvision implementation.

In [1]:
from typing import Optional

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.transforms import v2
from torchvision import tv_tensors

from ignite.engine import Engine, Events
from ignite.handlers import global_step_from_engine
from ignite.handlers.checkpoint import Checkpoint, DiskSaver
from ignite.handlers.tqdm_logger import ProgressBar
from ignite.handlers.wandb_logger import WandBLogger, OutputHandler
from ignite.metrics import Average, ObjectDetectionAvgPrecisionRecall, RunningAverage

from detection_tools.data.pascal_voc import SSDVOCDataset
from detection_tools.ssd.utils import AnchorGenerator, Matcher, OffsetHandler, SSDPredictor
from detection_tools.ssd.modules import SSDLoss, SSDHead, VGG16FeatureExtractor

## Defining the Architecture
We will first create the architecture for the SSD model by organizing the components imported above.

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class SSDDetector(nn.Module):
    """SSD object detector assembled from the ``detection_tools`` components.

    Wires together a VGG16 feature extractor, an anchor generator, an
    anchor/ground-truth matcher, the SSD prediction head, the SSD loss and a
    post-processing predictor.

    Args:
        num_classes_actual: Number of foreground classes; one extra class is
            added internally for the background.
        device: Device handed to the anchor generator and the predictor, and
            the device the module is moved to at the end of construction.
    """

    def __init__(self, num_classes_actual: int, device: torch.device = torch.device("cpu")):
        super().__init__()
        self.feature_extractor = VGG16FeatureExtractor()
        self.device = device
        # One aspect-ratio list per feature map (six maps in total).
        self.anchor_generator = AnchorGenerator(
            aspect_ratios=[
                [1.0, 2.0],
                [1.0, 2.0, 3.0],
                [1.0, 2.0, 3.0],
                [1.0, 2.0, 3.0],
                [1.0, 2.0],
                [1.0, 2.0]
            ],
            device=device
        )
        self.matcher = Matcher(iou_threshold=0.5)
        self.offset_handler = OffsetHandler()
        self.head = SSDHead(
            num_classes=num_classes_actual+1, # +1 for background class
            num_anchors=self.anchor_generator.num_anchors,
            channels=self.feature_extractor.MAP_CHANNELS
        )
        self.loss = SSDLoss(o_handler=self.offset_handler)
        self.predictor = SSDPredictor(
            score_thresh=0.01,
            num_top_k=200,
            nms_thresh=0.45,
            max_detections=100,
            device=device
        )
        self.to(device)

    def forward(self, images: torch.Tensor, targets: Optional[list[dict[str, torch.Tensor]]] = None):
        """Compute the training loss or run detection, depending on the mode.

        Args:
            images: Batched image tensor. The last two dimensions are read as
                (height, width) and the first dimension as the batch size.
            targets: One dict per image holding at least a ``"bbox"`` entry.
                Required in training mode, optional in eval mode.

        Returns:
            * Training mode: whatever ``self.loss`` returns.
            * Eval mode without targets: the output of ``self.predictor``.
            * Eval mode with targets: ``(predictions, targets, val_loss)``,
              where ``val_loss`` is ``self.loss(..., raw=True)``.

        Raises:
            ValueError: If the module is in training mode and ``targets`` is
                ``None``.
        """
        # Fail fast: without targets there is nothing to match anchors against,
        # and the loss below would otherwise hit an unbound ``matches``.
        if self.training and targets is None:
            raise ValueError("targets must be provided when the model is in training mode")

        img_h, img_w = images.shape[-2:]

        # NOTE(review): the feature-map sizes are always the extractor's
        # MAP_SHAPES_300 preset, so inputs are presumably expected to be
        # 300x300 - confirm before feeding other resolutions.
        base_anchors = self.anchor_generator.generate_anchors(
            image_size=(img_h, img_w),
            feature_map_sizes=self.feature_extractor.MAP_SHAPES_300
        )
        # One independent copy of the anchor set per image in the batch.
        anchors = [base_anchors.clone() for _ in range(images.shape[0])]
        features = self.feature_extractor(images)
        head_outputs = self.head(features)

        matches = None
        if targets is not None:
            matches = [
                self.matcher(target["bbox"], anchor_set)
                for target, anchor_set in zip(targets, anchors)
            ]

        if self.training:
            return self.loss(targets, head_outputs, anchors, matches)

        predictions = self.predictor(head_outputs, anchors)
        if targets is None:
            return predictions
        val_loss = self.loss(targets, head_outputs, anchors, matches, raw=True)
        return predictions, targets, val_loss
        

In [4]:
# Pascal VOC has 20 object categories; SSDDetector adds the background class itself.
NUM_VOC_CLASSES = 20
model = SSDDetector(device=device, num_classes_actual=NUM_VOC_CLASSES)
model

SSDDetector(
  (feature_extractor): VGG16FeatureExtractor(
    (features_conv4_3): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
      (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (3): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   

## Loading the Dataset
Now, we will load the datasets with appropriate transformations and create dataloaders for training and validation.

In [5]:
# Settings shared by the training and validation pipelines.
INPUT_SIZE = (300, 300)
# The commonly used ImageNet channel statistics.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: photometric and geometric augmentation, followed by the
# same resize / float conversion / normalisation used for validation.
# NOTE(review): torchvision's documentation says SanitizeBoundingBoxes should
# follow RandomIoUCrop to drop degenerate boxes. It is not added here because
# it depends on how SSDVOCDataset structures its targets - confirm and add.
train_transforms = v2.Compose(
    [
        v2.ToImage(),
        v2.RandomPhotometricDistort(p=0.5),
        v2.RandomZoomOut(fill={tv_tensors.Image: 127}),
        v2.RandomIoUCrop(),
        v2.RandomHorizontalFlip(p=0.5),
        v2.Resize(size=INPUT_SIZE, antialias=True),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Validation pipeline: deterministic preprocessing only.
val_transform = v2.Compose(
    [
        v2.ToImage(),
        v2.Resize(size=INPUT_SIZE, antialias=True),
        v2.ToDtype(torch.float32, scale=True),
        v2.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

In [6]:
# Both splits are read from the same dataset root; only the split name and the
# transform pipeline differ.
VOC_ROOT = "../data/Pascal_VOC_Detection"
train_set = SSDVOCDataset(root=VOC_ROOT, image_set="train", transforms=train_transforms)
val_set = SSDVOCDataset(root=VOC_ROOT, image_set="val", transforms=val_transform)
print(f"Train set size: {len(train_set)}, Validation set size: {len(val_set)}")

Train set size: 5717, Validation set size: 5823


In [7]:
def collate_fn(batch):
    """Collate ``(image, target)`` samples into a stacked image tensor and a target list.

    Images are stacked along a new leading dimension. Targets are returned as
    a plain list, one entry per sample, in the original order.
    """
    image_list, target_list = [], []
    for sample in batch:
        image_list.append(sample[0])
        target_list.append(sample[1])
    return torch.stack(image_list), target_list

BATCH_SIZE = 8
# Settings common to both loaders; only the training loader shuffles.
_loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_set, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_set, **_loader_kwargs)

## Training and Validation Loops
Next, we will create the training and validation loops using Ignite's `Engine` and run the training for a few epochs.

In [8]:
# Two parameter groups: the feature extractor trains with a much smaller
# learning rate than the detection head.
backbone_group = {"params": model.feature_extractor.parameters(), "lr": 1e-5}
head_group = {"params": model.head.parameters(), "lr": 1e-3}
optimizer = torch.optim.SGD(
    [backbone_group, head_group],
    momentum=0.9,
    weight_decay=5e-4,
)

In [9]:
def train_batch(engine, batch):
    """Ignite process function: run one optimisation step on ``batch``.

    Puts the model in training mode, moves the images and every target tensor
    to ``device``, back-propagates the loss returned by the model and steps
    the optimizer.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    images, targets = batch
    device_images = images.to(device)
    device_targets = []
    for target in targets:
        device_targets.append({name: tensor.to(device) for name, tensor in target.items()})

    loss = model(device_images, device_targets)
    loss.backward()
    optimizer.step()
    return loss.item()

def validate_batch(engine, batch):
    """Ignite process function: evaluate the model on ``batch`` without gradients.

    Puts the model in eval mode, moves the images and every target tensor to
    ``device`` and returns the model output unchanged.
    """
    model.eval()

    images, targets = batch
    device_images = images.to(device)
    device_targets = []
    for target in targets:
        device_targets.append({name: tensor.to(device) for name, tensor in target.items()})

    with torch.no_grad():
        return model(device_images, device_targets)

In [10]:
# One Ignite engine per loop, each driven by its own process function.
trainer, validator = Engine(train_batch), Engine(validate_batch)

## Defining Metrics
Now, we are going to define some metrics for the training and validation sets.

In [11]:
# The trainer's output is already the scalar loss, so it is passed through as is.
train_metrics = dict(
    loss=RunningAverage(output_transform=lambda step_loss: step_loss),
)
def output_transform_mAP(output):
    """Pick ``(predictions, targets)`` out of the validator output.

    Only the first two elements of ``output`` are used; anything after them
    is ignored.
    """
    return output[0], output[1]
    
# Average precision / recall evaluated at a single IoU threshold of 0.5.
# NOTE(review): num_classes=21 presumably counts the 20 VOC classes plus
# background - confirm it matches the class ids emitted by the predictor and
# the dataset.
mAP_metric = ObjectDetectionAvgPrecisionRecall(
    iou_thresholds=[0.5],
    num_classes=21,
    output_transform=output_transform_mAP,
    device=device,
)

# The third element of the validator output is the raw loss dict.
val_metrics = dict(
    reg_loss=Average(output_transform=lambda out: out[2]["reg_loss"]),
    cls_loss=Average(output_transform=lambda out: out[2]["cls_loss"]),
    full_loss=Average(output_transform=lambda out: out[2]["reg_loss"] + out[2]["cls_loss"]),
    mAP=mAP_metric,
)

In [12]:
# Register every metric on its engine under the name it has in the dict.
for target_engine, metric_map in ((trainer, train_metrics), (validator, val_metrics)):
    for metric_name, metric_obj in metric_map.items():
        metric_obj.attach(target_engine, name=metric_name)

In [None]:
# Progress bar for the training engine, showing every attached metric.
pg_bar1 = ProgressBar(desc="Training", persist=True)
pg_bar1.attach(trainer, metric_names="all")


@trainer.on(Events.EPOCH_COMPLETED)
def log_final_loss(engine):
    """Print the running training loss for the finished epoch, then run validation."""
    state = engine.state
    current_epoch, max_epochs = state.epoch, state.max_epochs
    loss = state.metrics["loss"]
    print(f"Epoch: {current_epoch}/{max_epochs}, Final Loss: {loss:.4f}")
    # A full pass over the validation loader after each training epoch.
    validator.run(val_loader)

  from tqdm.autonotebook import tqdm


In [None]:
# Progress bar for the validation engine; it displays the third element of the
# validator output (the raw loss returned by the model).
pg_bar2 = ProgressBar(desc="Validating", persist=True)
pg_bar2.attach(validator, output_transform=lambda step_output: step_output[2])


@validator.on(Events.EPOCH_COMPLETED)
def log_validation_results(engine):
    """Print the validation losses and mAP@50 once a validation pass finishes.

    Epoch counters are read from ``trainer`` so the report lines up with the
    training epoch that triggered this validation run.
    """
    metrics = engine.state.metrics
    current_epoch, max_epochs = trainer.state.epoch, trainer.state.max_epochs
    reg_loss, cls_loss, full_loss = (
        metrics[name] for name in ("reg_loss", "cls_loss", "full_loss")
    )
    # The first element of the "mAP" metric result is the value reported as mAP@50.
    mAP50 = metrics["mAP"][0]
    print(f"Validation Epoch: {current_epoch}/{max_epochs}: Reg Loss: {reg_loss:.4f}, Cls Loss: {cls_loss:.4f}")
    print(f"Validation Full Loss: {full_loss:.4f} mAP@50: {mAP50:.4f}")

In [16]:
# Checkpoints and Loggers
file_name_prefix = "ssd_detector_voc"
score_name = "mAP@50"
n_saved = 2
# Objects whose state is written into each checkpoint.
to_save = dict(model=model, optimizer=optimizer)


def score_function(engine):
    """Return the first element of the engine's "mAP" metric as the checkpoint score."""
    mean_ap = engine.state.metrics["mAP"]
    return mean_ap[0]


# Take the checkpoint's global step from the trainer rather than from the
# validator that the handler is attached to.
step_source = global_step_from_engine(trainer, Events.EPOCH_COMPLETED)
ckp_handler = Checkpoint(
    to_save=to_save,
    save_handler=DiskSaver("./checkpoints", create_dir=True),
    filename_prefix=file_name_prefix,
    n_saved=n_saved,
    score_name=score_name,
    score_function=score_function,
    global_step_transform=step_source,
)
validator.add_event_handler(Events.EPOCH_COMPLETED, ckp_handler)

<ignite.engine.events.RemovableEventHandle at 0x253f0b82390>

In [19]:
# Log the hyperparameters that are actually in use. The learning rates are
# read back from the optimizer's parameter groups (group 0 = feature
# extractor, group 1 = head) so they cannot drift from the optimizer cell, and
# max_epochs matches the value passed to trainer.run (the config previously
# claimed 10 while the run trains for 2 epochs).
logger = WandBLogger(
    project="SSD_Object_Detection_VOC",
    config={
        "backbone_lr": optimizer.param_groups[0]["lr"],
        "head_lr": optimizer.param_groups[1]["lr"],
        "max_epochs": 2
    }
)

# Training metrics: logged once per epoch, indexed by the trainer's epoch.
logger.attach(
    trainer,
    log_handler=OutputHandler(
        tag="training",
        metric_names="all",
        output_transform=lambda _: None,
        global_step_transform=lambda engine, _: engine.state.epoch
    ),
    event_name=Events.EPOCH_COMPLETED
)


# Validation metrics: logged when a validation pass completes, using the
# trainer-derived step source so both series share one x-axis.
logger.attach(
    validator,
    log_handler=OutputHandler(
        tag="validation",
        metric_names="all",
        output_transform=lambda _: None,
        global_step_transform=step_source
    ),
    event_name=Events.EPOCH_COMPLETED
)

wandb: Currently logged in as: ak_chp1 (ak_chp1-panjab-university) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


<ignite.engine.events.RemovableEventHandle at 0x298a0df9100>

In [17]:
# Short training run; validation is triggered after every epoch by the
# EPOCH_COMPLETED handler registered on the trainer.
MAX_EPOCHS = 2
trainer.run(train_loader, max_epochs=MAX_EPOCHS)

Training[1/715]   0%|           [00:00<?]



Epoch: 1/2, Final Loss: 40.4987


Validating[1/728]   0%|           [00:00<?]

  all_padded_tensors[


Validation Epoch: 1/2: Reg Loss: 0.1001, Cls Loss: 9.9193
Validation Full Loss: 10.0194 mAP@50: 0.0000


Training[1/715]   0%|           [00:00<?]

Epoch: 2/2, Final Loss: 40.1854


Validating[1/728]   0%|           [00:00<?]

Validation Epoch: 2/2: Reg Loss: 0.1002, Cls Loss: 8.0777
Validation Full Loss: 8.1779 mAP@50: 0.0059


State:
	iteration: 1430
	epoch: 2
	epoch_length: 715
	max_epochs: 2
	max_iters: <class 'NoneType'>
	output: 22.874704360961914
	batch: <class 'tuple'>
	metrics: <class 'dict'>
	dataloader: <class 'torch.utils.data.dataloader.DataLoader'>
	seed: <class 'NoneType'>
	times: <class 'dict'>

In [18]:
# Pull a single validation batch and move it to the model's device.
images, targets = next(iter(val_loader))
images = images.to(device)
targets = [{name: tensor.to(device) for name, tensor in target.items()} for target in targets]

# In eval mode with targets the model returns (predictions, targets, val_loss);
# only the per-image predictions are kept in `preds`.
model.eval()
with torch.no_grad():
    preds = model(images, targets)[0]

print("--- DIAGNOSTIC CHECK ---")
print(f"Ground Truth Box 0: {targets[0]['bbox'][0].tolist()}")
if len(preds[0]['bbox']) == 0:
    print("Prediction Box 0:   No boxes predicted (confidence too low)")
else:
    print(f"Prediction Box 0:   {preds[0]['bbox'][0].tolist()}")

--- DIAGNOSTIC CHECK ---
Ground Truth Box 0: [20.400001525878906, 8.800000190734863, 268.8000183105469, 234.40000915527344]
Prediction Box 0:   [175.96824645996094, 3.266291856765747, 175.96824645996094, 3.2662928104400635]


In [24]:
# Display the first element of `preds` left behind by the diagnostic cell.
# NOTE(review): the saved output below is a list of dicts, which suggests this
# cell was executed while `preds` still held the full model output tuple (the
# execution counts are also out of order) - re-check after Restart & Run All.
preds[0]

[{'bbox': tensor([[ 1.4701e+02,  9.8712e-01,  1.4701e+02,  9.8712e-01],
          [-2.4693e+03, -9.3293e+03,  2.8956e+03,  9.6986e+03],
          [-7.0313e+02, -3.2649e+03,  1.2343e+03,  3.5473e+03],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-3.6844e+04, -3.6310e+04,  3.7142e+04,  3.6619e+04],
          [-2.4182e+04, -5.2905e+04,  2.4481e+04,  5.3210e+04],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-3.4697e+04, -3.4267e+04,  3.4995e+04,  3.4574e+04],
          [-2.4182e+04, -5.2905e+04,  2.4481e+04,  5.3210e+04],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-2.4182e+04, -5.2905e+04,  2.4481e+04,  5.3210e+04],
          [-2.4182e+04, -5.2905e+04,  2.4481e+04,  5.3210e+04],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-5.2998e+04, -2.4123e+04,  5.3298e+04,  2.4429e+04],
          [-2.4182e+04, -5.2905e