# SSD300 Object Detection Training and Evaluation Pipeline

This notebook demonstrates a complete pipeline for training and evaluating an SSD300 model on the Pascal VOC 2012 dataset.

## Pipeline Overview:
1. **Data Preparation**: Convert VOC XML annotations to COCO format
2. **Dataset Loading**: Create optimized dataset loaders
3. **Model Training**: Fine-tune a COCO-pretrained SSD300 (VGG16 backbone) with a 21-class head (20 VOC classes + background)
4. **Model Evaluation**: Evaluate using COCO metrics
5. **Performance Analysis**: Speed, memory, and accuracy benchmarking
6. **Visualization**: View inference results

## Requirements:
- Pascal VOC 2012 dataset
- PyTorch with torchvision
- pycocotools
- Standard ML libraries (numpy, matplotlib, etc.)

## 1. Environment Setup and Imports

In [None]:
import os
import sys
import json
import time
import argparse
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Adam
import torchvision.transforms as T
from torchvision.models.detection import ssd300_vgg16, SSD300_VGG16_Weights
from torchvision.models.detection.ssd import SSDClassificationHead

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

# Configuration
# Use the GPU when CUDA is available; otherwise everything runs on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Pascal VOC classes: the 20 foreground categories, in alphabetical order.
# The list index of a name is used as its 0-based class index later on.
VOC_CLASSES = [
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

## 2. Data Preparation: VOC to COCO Conversion

In [None]:
# Create labels file for VOC classes
# The file holds one class name per line, in VOC_CLASSES order; it is passed
# to the conversion script through its --labels argument.
os.makedirs("data", exist_ok=True)

labels_text = "".join(f"{name}\n" for name in VOC_CLASSES)
with open("data/labels.txt", "w") as f:
    f.write(labels_text)

print("Created labels.txt with Pascal VOC classes")
print(f"Classes: {', '.join(VOC_CLASSES)}")

In [None]:
# Convert VOC XML annotations to COCO JSON format
# Note: Update these paths according to your dataset location
import subprocess

VOC_DATA_DIR = "data/VOCdevkit/VOC2012"
ANNOTATIONS_DIR = f"{VOC_DATA_DIR}/Annotations"
IMAGES_DIR = f"{VOC_DATA_DIR}/JPEGImages"
TRAINVAL_IDS = f"{VOC_DATA_DIR}/ImageSets/Main/trainval.txt"
COCO_JSON_PATH = "data/voc2012_coco.json"

# Run VOC to COCO conversion using the existing script.
# The command is an argument list executed without a shell, so paths containing
# spaces or shell metacharacters are passed through verbatim. sys.executable
# runs the script with the same interpreter (and packages) as this kernel.
# NOTE(review): --extract_num_from_imgid is handled by scripts/voc2coco.py,
# which is not shown here; VOC2012 ids presumably look like "2008_000008" --
# confirm the script derives a unique integer image id from each of them
# (the "Loaded N images" message of the dataset cell is a quick sanity check).
conversion_cmd = [
    sys.executable or "python", "scripts/voc2coco.py",
    "--ann_dir", ANNOTATIONS_DIR,
    "--ann_ids", TRAINVAL_IDS,
    "--labels", "data/labels.txt",
    "--output", COCO_JSON_PATH,
    "--ext", "xml",
    "--extract_num_from_imgid",
]

print("Converting VOC annotations to COCO format...")
print(f"Command: {' '.join(conversion_cmd)}")
# check=True raises CalledProcessError on a non-zero exit status, so the
# success message below is only printed when the conversion really succeeded.
subprocess.run(conversion_cmd, check=True)
print(f"Conversion complete! COCO JSON saved to: {COCO_JSON_PATH}")

## 3. Dataset Loading with Optimized COCO Loader

In [None]:
# Create optimized COCO dataset loader
from PIL import Image
from torch.utils.data import Dataset

class COCODetectionDataset(Dataset):
    """COCO-format detection dataset yielding ``(image, target)`` pairs.

    Args:
        ann_file: Path to a COCO-style JSON annotation file.
        img_folder: Directory that contains the image files named by the
            ``file_name`` entries of the annotation file.
        transform: Optional callable applied to the RGB PIL image.

    Each target is a dict with ``boxes`` (float32, ``[N, 4]`` as
    ``x1, y1, x2, y2``), ``labels`` (int64), ``image_id`` (int64, ``[1]``),
    ``area`` (float32) and ``iscrowd`` (uint8).

    Boxes are expressed in the coordinate frame of the *returned* image: when
    the transform changes the image size, boxes and areas are rescaled by the
    same factors. This assumes the transform only rescales the image (no
    cropping, flipping or padding).
    """

    def __init__(self, ann_file, img_folder, transform=None):
        print(f"Loading COCO annotations from: {ann_file}")
        self.coco = COCO(ann_file)
        self.img_folder = img_folder
        self.transform = transform
        self.ids = list(self.coco.imgs.keys())
        print(f"Loaded {len(self.ids)} images")

    @staticmethod
    def _size_of(img):
        """Return ``(width, height)`` of a tensor or PIL-like image, else None."""
        if torch.is_tensor(img):
            # Tensors are laid out (..., H, W).
            return int(img.shape[-1]), int(img.shape[-2])
        size = getattr(img, "size", None)
        if isinstance(size, (tuple, list)) and len(size) == 2:
            return int(size[0]), int(size[1])
        return None

    @staticmethod
    def _build_target(anns, img_id, scale_x=1.0, scale_y=1.0):
        """Convert COCO annotation dicts into a detection target dict.

        Args:
            anns: Iterable of COCO annotations with ``bbox`` (x, y, w, h),
                ``category_id``, ``area`` and optionally ``iscrowd``.
            img_id: Integer image id stored under ``image_id``.
            scale_x, scale_y: Factors applied to horizontal / vertical box
                coordinates (and, multiplied, to the area).

        Returns:
            The target dict described in the class docstring.
        """
        boxes, labels, areas, iscrowd = [], [], [], []
        for ann in anns:
            x, y, w, h = ann['bbox']
            if w <= 0 or h <= 0:
                # torchvision's SSD training forward rejects boxes with
                # non-positive width or height, so drop degenerate ones.
                continue
            boxes.append([x * scale_x, y * scale_y,
                          (x + w) * scale_x, (y + h) * scale_y])
            # NOTE(review): torchvision detection models treat label 0 as
            # background, so category_id 1 is mapped onto background here.
            # The evaluation and visualization cells invert this same offset,
            # so the three places have to be changed together -- confirm.
            labels.append(ann['category_id'] - 1)  # Convert to 0-based indexing
            areas.append(ann['area'] * scale_x * scale_y)
            iscrowd.append(ann.get('iscrowd', 0))

        return {
            # reshape keeps the [N, 4] layout even when there are no boxes.
            "boxes": torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            "labels": torch.tensor(labels, dtype=torch.int64),
            "image_id": torch.tensor([img_id], dtype=torch.int64),
            "area": torch.tensor(areas, dtype=torch.float32),
            "iscrowd": torch.tensor(iscrowd, dtype=torch.uint8),
        }

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]

        # Load image; the context manager closes the file handle right away.
        img_path = os.path.join(self.img_folder, img_info['file_name'])
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        orig_w, orig_h = img.size

        # Load annotations
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        anns = self.coco.loadAnns(ann_ids)

        if self.transform:
            img = self.transform(img)

        # Keep the boxes aligned with the (possibly resized) image.
        scale_x = scale_y = 1.0
        new_size = self._size_of(img)
        if new_size is not None and orig_w > 0 and orig_h > 0:
            scale_x = new_size[0] / orig_w
            scale_y = new_size[1] / orig_h

        return img, self._build_target(anns, img_id, scale_x, scale_y)

    def __len__(self):
        return len(self.ids)


def get_transform():
    """Build the image preprocessing pipeline used for SSD300 inputs.

    The returned transform resizes a PIL image to 300x300, converts it to a
    tensor and normalizes each channel with the mean/std given below.

    NOTE(review): torchvision's SSD models also resize and normalize their
    inputs internally (``model.transform``); applying both here as well likely
    normalizes the image twice -- confirm. The visualization cell inverts
    exactly this normalization, so the two must be changed together.
    """
    resize = T.Resize((300, 300))
    to_tensor = T.ToTensor()
    normalize = T.Normalize(
        mean=[0.48235, 0.45882, 0.40784],
        std=[0.229, 0.224, 0.225],
    )
    return T.Compose([resize, to_tensor, normalize])


def collate_fn(batch):
    """Transpose a batch of samples into per-field tuples.

    A list of ``(image, target)`` pairs becomes ``(images, targets)``, each a
    tuple, without stacking -- targets hold a variable number of boxes.
    """
    return tuple(tuple(column) for column in zip(*batch))


# Create dataset and dataloader
# The training loader shuffles and uses the tuple-based collate function
# because every image carries a different number of boxes.
print("Creating dataset and dataloader...")
dataset = COCODetectionDataset(
    ann_file=COCO_JSON_PATH,
    img_folder=IMAGES_DIR,
    transform=get_transform(),
)
dataloader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collate_fn,
)
print(f"Dataset ready with {len(dataset)} samples")

## 4. Model Setup and Training

In [None]:
# Initialize SSD300 model
print("Initializing SSD300 model...")
model = ssd300_vgg16(weights=SSD300_VGG16_Weights.COCO_V1)

# Replace classification head for Pascal VOC (21 classes including background).
# Only the classification head is rebuilt; the channel and anchor counts are
# given per feature map, one entry for each of the six maps.
feature_channels = [512, 1024, 512, 256, 256, 256]
anchors_per_location = [4, 6, 6, 6, 4, 4]
model.head.classification_head = SSDClassificationHead(
    in_channels=feature_channels,
    num_anchors=anchors_per_location,
    num_classes=21,
)

model.to(DEVICE)
print(f"Model loaded on {DEVICE}")

# Count parameters (single pass over the parameter list)
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(count for count, _ in param_sizes)
trainable_params = sum(count for count, needs_grad in param_sizes if needs_grad)
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")

In [None]:
# Training configuration
EPOCHS = 5
LEARNING_RATE = 1e-4
MODEL_SAVE_PATH = "outputs/ssd300_voc.pth"

# The checkpoint, predictions and reports are all written under outputs/.
os.makedirs("outputs", exist_ok=True)

# Setup optimizer (every model parameter is optimized)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)

config_lines = [
    "Training configuration:",
    f"  Epochs: {EPOCHS}",
    f"  Learning rate: {LEARNING_RATE}",
    f"  Batch size: {dataloader.batch_size}",
    f"  Dataset size: {len(dataset)}",
    f"  Batches per epoch: {len(dataloader)}",
]
print("\n".join(config_lines))

In [None]:
# Training loop
# In train mode the model returns a dict of losses; their sum is optimized.
print("Starting training...")
model.train()

training_losses = []  # mean loss of each completed epoch

for epoch in range(EPOCHS):
    running_loss = 0.0
    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}")

    for step, (images, targets) in enumerate(progress_bar, start=1):
        # Move the images and every target tensor to the training device
        images = [img.to(DEVICE) for img in images]
        targets = [
            {key: value.to(DEVICE) for key, value in tgt.items()}
            for tgt in targets
        ]

        # Forward pass
        loss_dict = model(images, targets)
        total_loss = sum(loss_dict.values())

        # Backward pass
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        # Update metrics
        step_loss = total_loss.item()
        running_loss += step_loss

        # Show the current and the running-average loss on the progress bar
        progress_bar.set_postfix({
            'batch_loss': f'{step_loss:.4f}',
            'avg_loss': f'{running_loss/step:.4f}'
        })

    # Log epoch results
    mean_loss = running_loss / len(dataloader)
    training_losses.append(mean_loss)
    print(f"Epoch {epoch+1}/{EPOCHS} - Average Loss: {mean_loss:.4f}")

print(f"Training completed! Saving model to: {MODEL_SAVE_PATH}")
torch.save(model.state_dict(), MODEL_SAVE_PATH)

In [None]:
# Plot training loss
# The x-axis follows the losses that were actually recorded rather than the
# EPOCHS constant, so the plot still works after an interrupted run or after
# EPOCHS has been edited without retraining.
epochs_run = range(1, len(training_losses) + 1)

plt.figure(figsize=(10, 6))
plt.plot(epochs_run, training_losses, 'b-', marker='o', linewidth=2, markersize=8)
plt.title('SSD300 Training Loss', fontsize=16, fontweight='bold')
plt.xlabel('Epoch', fontsize=12)
plt.ylabel('Average Loss', fontsize=12)
plt.grid(True, alpha=0.3)
plt.xticks(epochs_run)
for epoch_no, loss in zip(epochs_run, training_losses):
    plt.annotate(f'{loss:.3f}', (epoch_no, loss), textcoords="offset points",
                xytext=(0,10), ha='center', fontsize=10)
plt.tight_layout()
plt.show()

if training_losses:
    print(f"Final training loss: {training_losses[-1]:.4f}")
    # The relative reduction is undefined when the first epoch loss is zero.
    if training_losses[0] != 0:
        print(f"Loss reduction: {((training_losses[0] - training_losses[-1]) / training_losses[0] * 100):.1f}%")
else:
    print("No training losses recorded; run the training cell first.")

## 5. Model Evaluation

In [None]:
# Setup evaluation
print("Setting up model for evaluation...")
model.eval()

# Create evaluation dataloader (no shuffling).
# Note: this iterates the same `dataset` object that was used for training.
eval_dataloader = DataLoader(dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

# Load ground truth COCO
coco_gt = COCO(COCO_JSON_PATH)
if 'info' not in coco_gt.dataset:
    coco_gt.dataset['info'] = {"description": "VOC to COCO converted dataset"}

print("Running inference on validation set...")
all_predictions = []

with torch.no_grad():
    for images, targets in tqdm(eval_dataloader, desc="Evaluating"):
        images = [img.to(DEVICE) for img in images]
        outputs = model(images)

        for image, output, target in zip(images, outputs, targets):
            img_id = target['image_id'].item()

            # The model reports boxes in the frame of the tensor it was given
            # (resized by the dataset transform), whereas the ground truth is
            # in original image pixels. Rescale so both share one frame.
            in_h, in_w = image.shape[-2:]
            gt_info = coco_gt.imgs.get(img_id, {})
            scale_x = (gt_info.get('width') or in_w) / in_w
            scale_y = (gt_info.get('height') or in_h) / in_h

            # One device-to-host transfer per image instead of one per box.
            boxes = output['boxes'].cpu().tolist()
            scores = output['scores'].cpu().tolist()
            labels = output['labels'].cpu().tolist()

            for (x1, y1, x2, y2), score, label in zip(boxes, scores, labels):
                all_predictions.append({
                    "image_id": img_id,
                    # NOTE(review): this inverts the `category_id - 1` mapping
                    # applied by the dataset; torchvision reserves label 0 for
                    # background, so review both offsets together -- confirm.
                    "category_id": int(label) + 1,  # Convert back to 1-based for COCO
                    "bbox": [x1 * scale_x, y1 * scale_y,
                             (x2 - x1) * scale_x, (y2 - y1) * scale_y],  # COCO format: [x, y, width, height]
                    "score": float(score)
                })

print(f"Generated {len(all_predictions)} predictions")

In [None]:
# Save predictions and run COCO evaluation
RESULTS_PATH = "outputs/ssd_results.json"

print(f"Saving predictions to: {RESULTS_PATH}")
with open(RESULTS_PATH, 'w') as f:
    json.dump(all_predictions, f)

# An empty result list cannot be loaded by COCO.loadRes; fail with a clear
# message instead of an obscure error from inside pycocotools.
if not all_predictions:
    raise RuntimeError(
        "The model produced no predictions; COCO evaluation cannot be run."
    )

print("Running COCO evaluation...")
coco_dt = coco_gt.loadRes(RESULTS_PATH)
coco_eval = COCOeval(coco_gt, coco_dt, iouType='bbox')

# Filter to only evaluate on images that have predictions
# NOTE(review): images for which the model predicted nothing are excluded, so
# their missed ground-truth objects do not count against the metrics --
# confirm this is intended.
img_ids_with_preds = sorted({pred['image_id'] for pred in all_predictions})
coco_eval.params.imgIds = img_ids_with_preds

coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()

In [None]:
# Extract and save detailed metrics
# The names follow the order of the 12 values stored in coco_eval.stats.
metrics_keys = [
    "AP_0.50:0.95", "AP_0.50", "AP_0.75",
    "AP_small", "AP_medium", "AP_large",
    "AR_1", "AR_10", "AR_100",
    "AR_small", "AR_medium", "AR_large"
]

metrics_dict = dict(zip(metrics_keys, coco_eval.stats.tolist()))

# Add per-class Average Precision
precision = coco_eval.eval['precision']  # [IoU, Recall, Category, Area, MaxDets]
cat_ids = coco_eval.params.catIds
cat_names = {cat['id']: cat['name'] for cat in coco_gt.loadCats(cat_ids)}

per_class_ap = {}
print("\nPer-class Average Precision (AP @ IoU=0.50:0.95):")
print("=" * 50)
for i, cat_id in enumerate(cat_ids):
    # Extract precision for this category (all IoUs, all recalls, area=all, maxDets=100)
    cat_precision = precision[:, :, i, 0, 2]
    # Entries equal to -1 mark combinations without data. When nothing valid
    # remains (e.g. a class with no ground truth) report -1, the same sentinel
    # COCOeval uses, instead of a NaN that would also end up in the JSON file.
    valid_precision = cat_precision[cat_precision > -1]
    ap = float(valid_precision.mean()) if valid_precision.size else -1.0
    per_class_ap[cat_names[cat_id]] = ap
    print(f"{cat_names[cat_id]:15s}: {ap:.3f}")

metrics_dict['per_class_AP'] = per_class_ap

# Save comprehensive metrics
METRICS_PATH = "outputs/evaluation_metrics.json"
with open(METRICS_PATH, 'w') as f:
    json.dump(metrics_dict, f, indent=2)

print(f"\nDetailed metrics saved to: {METRICS_PATH}")
print(f"\nKey Results:")
print(f"  mAP @ IoU=0.50:0.95: {metrics_dict['AP_0.50:0.95']:.3f}")
print(f"  mAP @ IoU=0.50:     {metrics_dict['AP_0.50']:.3f}")
print(f"  Average Recall:     {metrics_dict['AR_100']:.3f}")

## 6. Performance Benchmarking

In [None]:
# Model size analysis
model_size_mb = os.path.getsize(MODEL_SAVE_PATH) / (1024 * 1024)
total_params = sum(p.numel() for p in model.parameters())

print("Model Analysis:")
print(f"  Model file size: {model_size_mb:.1f} MB")
print(f"  Total parameters: {total_params:,}")
print(f"  Parameters (millions): {total_params/1e6:.2f}M")

# Memory usage analysis
# Defined on every device so later cells can read it unconditionally.
peak_memory_mb = 0.0
if DEVICE.type == 'cuda':
    # The forward pass below passes no targets, which needs inference mode;
    # do not rely on an earlier cell having switched the model already.
    model.eval()
    torch.cuda.reset_peak_memory_stats()

    # Run a forward pass to measure peak memory
    dummy_input = torch.randn(1, 3, 300, 300).to(DEVICE)
    with torch.no_grad():
        _ = model(dummy_input)

    peak_memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
    print(f"  Peak GPU memory: {peak_memory_mb:.0f} MB")
else:
    print("  GPU memory analysis not available (CPU mode)")

In [None]:
# Inference speed benchmarking
print("Running inference speed benchmark...")

# Create benchmark dataloader with larger batch size if possible
benchmark_batch_size = 8 if DEVICE.type == 'cuda' else 4
benchmark_dataloader = DataLoader(dataset, batch_size=benchmark_batch_size,
                                shuffle=False, collate_fn=collate_fn)

model.eval()

# Warmup runs
print("Warming up GPU...")
with torch.no_grad():
    for i, (images, _) in enumerate(benchmark_dataloader):
        if i >= 5:  # 5 warmup batches
            break
        images = [img.to(DEVICE) for img in images]
        _ = model(images)
        if DEVICE.type == 'cuda':
            torch.cuda.synchronize()

# Actual timing
# Only the forward pass is timed. Image decoding, resizing and the host-to-
# device copy happen outside the timed region, so the result reflects model
# inference rather than data-loading throughput.
print("Running benchmark...")
total_images = 0
total_time = 0.0

with torch.no_grad():
    for images, _ in tqdm(benchmark_dataloader, desc="Benchmarking"):
        images = [img.to(DEVICE) for img in images]

        # Make sure pending GPU work (e.g. the copies above) has finished
        # before the clock starts.
        if DEVICE.type == 'cuda':
            torch.cuda.synchronize()
        batch_start = time.perf_counter()

        _ = model(images)

        # CUDA kernels run asynchronously; wait for them before stopping the clock.
        if DEVICE.type == 'cuda':
            torch.cuda.synchronize()
        total_time += time.perf_counter() - batch_start

        total_images += len(images)

# Guard against an empty dataset / zero elapsed time.
fps = total_images / total_time if total_time > 0 else 0.0

print(f"\nInference Performance:")
print(f"  Total images processed: {total_images}")
print(f"  Total time: {total_time:.2f} seconds")
print(f"  Average FPS: {fps:.2f}")
if fps > 0:
    print(f"  Average time per image: {1000/fps:.1f} ms")
else:
    print("  Average time per image: n/a")

## 7. Results Visualization

In [None]:
# Visualize some predictions
import matplotlib.patches as patches
from torchvision.transforms.functional import to_pil_image

def visualize_predictions(model, dataset, num_samples=4, conf_threshold=0.3,
                          sample_stride=100):
    """Visualize model predictions on sample images.

    Args:
        model: Detection model; it is switched to eval mode.
        dataset: Indexable dataset returning ``(image_tensor, target)`` where
            the image was normalized with the mean/std used below.
        num_samples: Number of images to show (two per row).
        conf_threshold: Only detections scoring above this value are drawn.
        sample_stride: Step between the dataset indices that are shown; the
            index wraps around when the dataset is shorter than required.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    if num_samples <= 0 or len(dataset) == 0:
        return

    model.eval()

    # Size the grid from num_samples (2x2 for the default of four samples).
    n_cols = min(2, num_samples)
    n_rows = (num_samples + n_cols - 1) // n_cols
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(7.5 * n_cols, 6 * n_rows), squeeze=False)
    axes = axes.flatten()

    # Inverse of the normalization applied by the dataset transform.
    norm_std = np.array([0.229, 0.224, 0.225])
    norm_mean = np.array([0.48235, 0.45882, 0.40784])

    with torch.no_grad():
        for i in range(num_samples):
            # Skip some samples for variety; wrap around on small datasets.
            img, target = dataset[(i * sample_stride) % len(dataset)]

            # Run inference
            img_tensor = img.unsqueeze(0).to(DEVICE)
            prediction = model(img_tensor)[0]

            # Convert image for visualization (CHW -> HWC, undo normalization)
            img_np = img.permute(1, 2, 0).cpu().numpy()
            img_np = np.clip(img_np * norm_std + norm_mean, 0, 1)

            ax = axes[i]
            ax.imshow(img_np)

            # Filter predictions by confidence
            high_conf_mask = prediction['scores'] > conf_threshold

            boxes = prediction['boxes'][high_conf_mask].cpu().tolist()
            labels = prediction['labels'][high_conf_mask].cpu().tolist()
            scores = prediction['scores'][high_conf_mask].cpu().tolist()

            # Draw bounding boxes
            for (x1, y1, x2, y2), label, score in zip(boxes, labels, scores):
                rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                       linewidth=2, edgecolor='red',
                                       facecolor='none')
                ax.add_patch(rect)

                # The model can emit a label outside VOC_CLASSES (e.g. 20);
                # show a placeholder instead of raising IndexError.
                # NOTE(review): indexing VOC_CLASSES with the raw label follows
                # the dataset's `category_id - 1` mapping -- confirm.
                if 0 <= label < len(VOC_CLASSES):
                    class_name = VOC_CLASSES[label]
                else:
                    class_name = f'class {label}'
                ax.text(x1, y1-5, f'{class_name}: {score:.2f}',
                       color='red', fontsize=10, fontweight='bold',
                       bbox=dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7))

            ax.set_title(f'Sample {i+1} - {len(boxes)} detections (conf > {conf_threshold})')
            ax.axis('off')

    # Hide grid cells left over when num_samples does not fill the last row.
    for unused_ax in axes[num_samples:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.suptitle('SSD300 Predictions on Pascal VOC 2012', fontsize=16, y=1.02)
    plt.show()

# Visualize predictions
# Draws detections for four dataset samples with the trained model.
print("Generating prediction visualizations...")
visualize_predictions(model=model, dataset=dataset, num_samples=4)

In [None]:
# Create performance summary plot
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 12))

# 1. Training loss curve
ax1.plot(range(1, len(training_losses)+1), training_losses, 'b-o', linewidth=2, markersize=6)
ax1.set_title('Training Loss', fontweight='bold')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.grid(True, alpha=0.3)

# 2. mAP metrics
map_scores = [metrics_dict['AP_0.50:0.95'], metrics_dict['AP_0.50'], metrics_dict['AP_0.75']]
map_labels = ['mAP@0.5:0.95', 'mAP@0.5', 'mAP@0.75']
bars = ax2.bar(map_labels, map_scores, color=['skyblue', 'lightgreen', 'lightcoral'])
ax2.set_title('COCO mAP Scores', fontweight='bold')
ax2.set_ylabel('Average Precision')
# Keep a small positive upper limit so the axis is not degenerate (or
# inverted) when every score is 0 or the -1 "no data" sentinel.
ax2.set_ylim(0, max(max(map_scores) * 1.2, 0.05))
for bar, score in zip(bars, map_scores):
    ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
             f'{score:.3f}', ha='center', va='bottom', fontweight='bold')

# 3. Top performing classes
sorted_classes = sorted(per_class_ap.items(), key=lambda x: x[1], reverse=True)[:10]
if sorted_classes:
    class_names, class_aps = zip(*sorted_classes)
else:
    # zip(*[]) cannot be unpacked into two names; draw an empty panel instead.
    class_names, class_aps = (), ()
y_pos = np.arange(len(class_names))
ax3.barh(y_pos, class_aps, color='lightsteelblue')
ax3.set_yticks(y_pos)
ax3.set_yticklabels(class_names)
ax3.set_xlabel('Average Precision')
ax3.set_title('Top 10 Classes by mAP', fontweight='bold')
ax3.grid(True, alpha=0.3, axis='x')

# 4. Model statistics
# Note: the four bars use different units but share one y-axis.
stats_data = {
    'Model Size\n(MB)': model_size_mb,
    'Parameters\n(Millions)': total_params/1e6,
    'Inference Speed\n(FPS)': fps,
    'Peak Memory\n(MB)': peak_memory_mb if DEVICE.type == 'cuda' else 0
}

stats_names = list(stats_data.keys())
stats_values = list(stats_data.values())
bars = ax4.bar(stats_names, stats_values, color=['gold', 'orange', 'lightgreen', 'pink'])
ax4.set_title('Model Performance Statistics', fontweight='bold')
ax4.set_ylabel('Value')
for bar, value in zip(bars, stats_values):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(stats_values)*0.01,
             f'{value:.1f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.suptitle('SSD300 Training and Evaluation Summary', fontsize=16, y=1.02)
plt.show()

## 8. Summary and Conclusions

In [None]:
# Generate comprehensive summary
# Derived figures are computed up front so a zero first-epoch loss or a zero
# FPS value cannot abort the report with a ZeroDivisionError.
if training_losses[0]:
    loss_reduction_pct = (training_losses[0] - training_losses[-1]) / training_losses[0] * 100
else:
    loss_reduction_pct = 0.0
ms_per_image = 1000 / fps if fps else float("inf")

summary_report = f"""
==================================================
           SSD300 TRAINING & EVALUATION SUMMARY
==================================================

DATASET:
  • Pascal VOC 2012 ({len(dataset)} images)
  • 20 object classes + background
  • Converted to COCO format for evaluation

MODEL ARCHITECTURE:
  • SSD300 with VGG16 backbone
  • Pre-trained on COCO, fine-tuned on VOC
  • Total parameters: {total_params:,} ({total_params/1e6:.2f}M)
  • Model file size: {model_size_mb:.1f} MB

TRAINING:
  • Epochs: {EPOCHS}
  • Learning rate: {LEARNING_RATE}
  • Optimizer: Adam
  • Initial loss: {training_losses[0]:.4f}
  • Final loss: {training_losses[-1]:.4f}
  • Loss reduction: {loss_reduction_pct:.1f}%

EVALUATION RESULTS:
  • mAP @ IoU=0.50:0.95: {metrics_dict['AP_0.50:0.95']:.3f}
  • mAP @ IoU=0.50:     {metrics_dict['AP_0.50']:.3f}
  • mAP @ IoU=0.75:     {metrics_dict['AP_0.75']:.3f}
  • Average Recall:     {metrics_dict['AR_100']:.3f}

PERFORMANCE:
  • Inference speed: {fps:.1f} FPS
  • Time per image: {ms_per_image:.1f} ms"""

# Peak memory is only measured when running on a CUDA device.
if DEVICE.type == 'cuda':
    summary_report += f"""
  • Peak GPU memory: {peak_memory_mb:.0f} MB"""

summary_report += f"""

TOP PERFORMING CLASSES:
"""

for i, (class_name, ap) in enumerate(sorted_classes[:5]):
    summary_report += f"  {i+1}. {class_name}: {ap:.3f}\n"

summary_report += f"""
FILES GENERATED:
  • Model weights: {MODEL_SAVE_PATH}
  • Predictions: {RESULTS_PATH}
  • Metrics: {METRICS_PATH}
  • COCO annotations: {COCO_JSON_PATH}

==================================================
"""

print(summary_report)

# Save summary to file
# The report contains non-ASCII bullet characters, so the encoding is fixed to
# UTF-8 instead of depending on the platform's default locale encoding.
with open("outputs/training_summary.txt", "w", encoding="utf-8") as f:
    f.write(summary_report)

print("Training and evaluation completed successfully!")
print("Summary saved to: outputs/training_summary.txt")