# COCO Data Pipeline for Faster RCNN

*Notebook 1 of 6 in the Faster RCNN from-scratch series*

Dataset: [COCO 2017](https://huggingface.co/datasets/detection-datasets/coco) streamed from Hugging Face — no local download required.

In [None]:
# Install the Hugging Face `datasets` package, which provides the
# `load_dataset` function imported in the next cell.
# Only needs to run once per environment/container.
!pip install datasets --quiet

In [None]:
import random
from collections import Counter
from typing import List, Tuple

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches

import torch
from torch.utils.data import IterableDataset, DataLoader
from datasets import load_dataset

# Configuration
# Side length (pixels) of the fixed square every image is resized to by the
# dataset cells in this notebook (passed as `img_size`).
IMG_SIZE = 800          # Faster RCNN uses 800x800 (vs 640 for YOLO)
# Number of foreground classes; background is handled separately as label 0.
NUM_CLASSES = 80        # COCO categories
# Per-channel (R, G, B) mean / std applied to images after scaling to [0, 1].
# These are the ImageNet statistics referred to in the dataset docstring.
MEAN = [0.485, 0.456, 0.406]
STD  = [0.229, 0.224, 0.225]
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {DEVICE}")

# COCO category names (80 categories, 0-indexed as in HF dataset).
# List position i is the name for HF category id i; the pipeline shifts
# labels by +1, so position i corresponds to model label i + 1.
COCO_NAMES = [
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck',
    'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra',
    'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee',
    'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup',
    'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse',
    'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink',
    'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear',
    'hair drier', 'toothbrush'
]

## Dataset

We stream COCO 2017 directly from [detection-datasets/coco](https://huggingface.co/datasets/detection-datasets/coco) on the Hugging Face Hub — no local download or annotation files required.

The HF dataset provides bounding boxes in COCO format `[x, y, w, h]` (pixels, top-left corner) with **0-indexed** category labels. Faster RCNN requires:
- Boxes in `[x1, y1, x2, y2]` pixel coordinates, scaled to the resized image
- **1-indexed** labels (0 = background, 1–80 = COCO categories)
- ImageNet-normalized image tensors of shape `(3, 800, 800)`

In [None]:
class COCOStreamDataset(IterableDataset):
    """Stream COCO from Hugging Face and yield Faster RCNN-format samples.

    Each sample is resized to ``img_size x img_size`` (aspect ratio is NOT
    preserved; x and y are scaled independently) and normalized with the
    module-level ``MEAN`` / ``STD`` (ImageNet statistics). Bounding boxes are
    converted from COCO ``[x, y, w, h]`` pixel format to ``[x1, y1, x2, y2]``
    in the coordinate frame of the resized image. Labels are shifted by +1 so
    that 0 stays reserved for background.

    NOTE(review): the conversion assumes the Hub dataset stores ``bbox`` as
    ``[x, y, w, h]``. Confirm against the dataset card; if the boxes are
    already ``[x1, y1, x2, y2]`` the ``x + w`` / ``y + h`` step is wrong.

    Images are skipped (and do not count toward ``max_samples``) when they
    have no annotations or when every box is <= 1 px wide or tall after
    scaling.

    NOTE: ``__iter__`` opens its own stream and does no per-worker sharding,
    so this dataset is meant for ``DataLoader(num_workers=0)``. See the
    PyTorch ``IterableDataset`` docs before raising ``num_workers``.

    Args:
        split: Dataset split name passed to ``load_dataset``.
        max_samples: Stop after yielding this many samples. ``None`` (or 0)
            means no limit.
        img_size: Side length in pixels of the square output image.

    Yields:
        ``(image_tensor, target)`` where ``image_tensor`` is a float32 tensor
        of shape ``(3, img_size, img_size)`` and ``target`` is a dict with:
            boxes    — (N, 4) float32 [x1, y1, x2, y2] in pixel coords
            labels   — (N,) int64, 1-indexed category IDs
            image_id — taken verbatim from the example's ``image_id`` field
    """

    def __init__(self, split: str = 'train', max_samples: int = None,
                 img_size: int = 800):
        super().__init__()
        self.split = split
        self.max_samples = max_samples
        self.img_size = img_size

    def __iter__(self):
        ds = load_dataset('detection-datasets/coco',
                          split=self.split, streaming=True)

        # Keep the statistics in float32: subtracting a plain Python list
        # from a float32 array silently promotes the result to float64,
        # which would produce a double-precision image tensor.
        mean = np.asarray(MEAN, dtype=np.float32)
        std = np.asarray(STD, dtype=np.float32)

        count = 0
        for example in ds:
            if self.max_samples and count >= self.max_samples:
                break

            # Skip images with no annotations before doing any image work.
            bboxes = example['objects']['bbox']
            cats = example['objects']['category']
            if not bboxes:
                continue

            img = example['image'].convert('RGB')
            orig_w, orig_h = img.size
            scale_x = self.img_size / orig_w
            scale_y = self.img_size / orig_h

            # Convert boxes: [x, y, w, h] pixels -> [x1, y1, x2, y2] scaled
            boxes, labels = [], []
            for bbox, cat_id in zip(bboxes, cats):
                x, y, w, h = bbox
                x1 = x * scale_x
                y1 = y * scale_y
                x2 = (x + w) * scale_x
                y2 = (y + h) * scale_y
                # Drop degenerate boxes (<= 1 px on either side after scaling).
                if (x2 - x1) > 1 and (y2 - y1) > 1:
                    boxes.append([x1, y1, x2, y2])
                    labels.append(int(cat_id) + 1)  # 0-indexed -> 1-indexed

            # Nothing usable left: skip before paying for resize/normalize.
            if not boxes:
                continue

            # Resize to fixed square
            img = img.resize((self.img_size, self.img_size), Image.BILINEAR)

            # Normalize: [0, 255] -> [0, 1] -> standardized, all in float32
            img_arr = np.asarray(img, dtype=np.float32) / 255.0
            img_arr = (img_arr - mean) / std
            img_tensor = torch.from_numpy(img_arr).permute(2, 0, 1)  # HWC -> CHW

            target = {
                'boxes':    torch.tensor(boxes,  dtype=torch.float32),
                'labels':   torch.tensor(labels, dtype=torch.int64),
                'image_id': example['image_id'],
            }
            yield img_tensor, target
            count += 1

In [None]:
def frcnn_collate_fn(batch):
    """Collate ``(image, target)`` samples into one batch.

    Images are stacked along a new leading batch dimension. Targets are
    returned as a plain list because each image can carry a different
    number of boxes and therefore cannot be stacked.
    """
    image_list = []
    target_list = []
    for sample in batch:
        image_list.append(sample[0])
        target_list.append(sample[1])
    return torch.stack(image_list), target_list

In [None]:
class AnchorTargetGenerator:
    """Assign GT labels and regression targets to pre-computed anchors.

    For a single image:
    - Positive anchor: IoU with any GT >= ``pos_iou`` (or the highest-IoU
      anchor for a GT, provided that IoU is > 0)
    - Negative anchor: IoU with all GTs < ``neg_iou``
    - Neutral anchor: everything else (ignored during loss)

    After labelling, at most ``total_samples`` anchors are kept: up to
    ``total_samples * pos_fraction`` positives, with the remainder filled by
    negatives. All other anchors are reset to neutral (-1).

    Args:
        pos_iou: IoU threshold at or above which an anchor is positive.
        neg_iou: IoU threshold below which an anchor is negative.
        total_samples: Maximum number of anchors kept per image.
        pos_fraction: Maximum fraction of ``total_samples`` that may be
            positive.
    """

    def __init__(self, pos_iou: float = 0.7, neg_iou: float = 0.3,
                 total_samples: int = 256, pos_fraction: float = 0.5):
        self.pos_iou = pos_iou
        self.neg_iou = neg_iou
        self.total_samples = total_samples
        self.n_pos = int(total_samples * pos_fraction)

    def compute_iou(self, anchors: torch.Tensor, gt_boxes: torch.Tensor) -> torch.Tensor:
        """Compute the pairwise IoU matrix of shape (N_anchors, N_gt).

        Both inputs are (K, 4) tensors of ``[x1, y1, x2, y2]`` boxes.
        """
        ax1, ay1, ax2, ay2 = anchors.unbind(1)
        gx1, gy1, gx2, gy2 = gt_boxes.unbind(1)

        # Broadcast anchors along rows and GT boxes along columns.
        inter_x1 = torch.max(ax1[:, None], gx1[None, :])
        inter_y1 = torch.max(ay1[:, None], gy1[None, :])
        inter_x2 = torch.min(ax2[:, None], gx2[None, :])
        inter_y2 = torch.min(ay2[:, None], gy2[None, :])

        # Non-overlapping pairs give negative extents; clamp them to zero.
        inter_w = (inter_x2 - inter_x1).clamp(min=0)
        inter_h = (inter_y2 - inter_y1).clamp(min=0)
        inter = inter_w * inter_h

        area_a = (ax2 - ax1) * (ay2 - ay1)
        area_g = (gx2 - gx1) * (gy2 - gy1)
        union = area_a[:, None] + area_g[None, :] - inter

        # Guard against division by zero for degenerate boxes.
        return inter / union.clamp(min=1e-6)

    def __call__(self, anchors: torch.Tensor, gt_boxes: torch.Tensor
                 ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Label anchors against the GT boxes of one image and subsample.

        Args:
            anchors: (N, 4) anchor boxes ``[x1, y1, x2, y2]``.
            gt_boxes: (M, 4) ground-truth boxes ``[x1, y1, x2, y2]``.

        Returns:
            labels: (N_anchors,) — 1=positive, 0=negative, -1=neutral/ignore
            matched_gt: (N_anchors, 4) — GT box with the highest IoU for each
                anchor (all zeros when there are no GT boxes or no anchors)

        Both outputs live on the same device as ``anchors``.
        """
        device = anchors.device
        N = len(anchors)
        # Allocate on the anchors' device so the mask/index assignments below
        # also work when the anchors live on the GPU.
        labels = torch.full((N,), -1, dtype=torch.int64, device=device)

        # No GT boxes (or no anchors): everything sampled is background.
        # Handling N == 0 here also avoids reducing over an empty dimension.
        if N == 0 or len(gt_boxes) == 0:
            matched_gt = torch.zeros((N, 4), dtype=torch.float32, device=device)
            neg_idx = torch.randperm(N, device=device)[:self.total_samples]
            labels[neg_idx] = 0
            return labels, matched_gt

        gt_boxes = gt_boxes.to(device)
        iou = self.compute_iou(anchors, gt_boxes)  # (N, M)

        max_iou_per_anchor, best_gt_idx = iou.max(dim=1)
        best_iou_per_gt, best_anchor_per_gt = iou.max(dim=0)

        labels[max_iou_per_anchor >= self.pos_iou] = 1
        labels[max_iou_per_anchor < self.neg_iou] = 0
        # Force-positive the best anchor of each GT, but only when it really
        # overlaps: for a GT that touches no anchor the argmax of an all-zero
        # column is index 0, which would wrongly mark anchor 0 as positive.
        labels[best_anchor_per_gt[best_iou_per_gt > 0]] = 1

        pos_idx = torch.where(labels == 1)[0]
        neg_idx = torch.where(labels == 0)[0]

        # Cap positives, then let negatives fill the rest of the budget.
        n_pos = min(len(pos_idx), self.n_pos)
        n_neg = min(len(neg_idx), self.total_samples - n_pos)

        pos_idx = pos_idx[torch.randperm(len(pos_idx), device=device)[:n_pos]]
        neg_idx = neg_idx[torch.randperm(len(neg_idx), device=device)[:n_neg]]

        # Everything that was not sampled becomes neutral.
        keep = torch.zeros(N, dtype=torch.bool, device=device)
        keep[pos_idx] = True
        keep[neg_idx] = True
        labels[~keep] = -1

        matched_gt = gt_boxes[best_gt_idx]
        return labels, matched_gt

In [None]:
# Pull one tiny batch straight from the stream to sanity-check shapes
# (nothing is read from local disk).
dataset = COCOStreamDataset(img_size=IMG_SIZE, max_samples=64, split='train')
loader = DataLoader(
    dataset,
    batch_size=2,
    num_workers=0,
    collate_fn=frcnn_collate_fn,
)

# Only the first batch is needed; later cells reuse `imgs` and `targets`.
imgs, targets = next(iter(loader))

# Expected: images (2, 3, 800, 800), boxes (N, 4), labels (N,)
print(f"Image batch: {imgs.shape}")
print(f"Boxes[0]:   {targets[0]['boxes'].shape}")
print(f"Labels[0]:  {targets[0]['labels']}")

In [None]:
# Inspection: visualize 2 images with GT boxes
import os

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)

# Model labels are 1-indexed (0 = background), so shift the name lookup by one.
cat_names = {i + 1: name for i, name in enumerate(COCO_NAMES)}  # 1-indexed

fig, axes = plt.subplots(1, 2, figsize=(18, 9))
for i, ax in enumerate(axes):
    # CHW tensor -> HWC array, then undo the mean/std normalization so the
    # image is back in [0, 1] for display.
    img = imgs[i].permute(1, 2, 0).numpy()
    img = img * STD + MEAN      # denormalize
    img = img.clip(0, 1)
    ax.imshow(img)
    for box, lbl in zip(targets[i]['boxes'], targets[i]['labels']):
        x1, y1, x2, y2 = box.tolist()
        rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                   linewidth=2, edgecolor='lime', facecolor='none')
        ax.add_patch(rect)
        # Fall back to the raw id if a label has no known name.
        class_id = lbl.item()
        name = cat_names.get(class_id, str(class_id))
        ax.text(x1, y1 - 4, name, color='white', fontsize=7,
                bbox=dict(facecolor='green', alpha=0.6, pad=1))
    ax.axis('off')
    ax.set_title(f"Sample {i} — {len(targets[i]['boxes'])} objects")
plt.tight_layout()
plt.savefig('images/gt_visualization.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Inspection: anchor label distribution on first image in batch
import os

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)

atg = AnchorTargetGenerator()
# Random stand-in anchors: top-left corner anywhere in the image, then a
# bottom-right corner up to 200 px further along each axis so x2 >= x1 and
# y2 >= y1. These are NOT real RPN anchors, only a smoke test.
dummy_anchors = torch.rand(200, 4) * IMG_SIZE
dummy_anchors[:, 2:] = dummy_anchors[:, :2] + torch.rand(200, 2) * 200
anchor_labels, _ = atg(dummy_anchors, targets[0]['boxes'])

# Label convention: 1 = positive, 0 = negative, -1 = neutral/ignored.
pos = (anchor_labels == 1).sum().item()
neg = (anchor_labels == 0).sum().item()
neu = (anchor_labels == -1).sum().item()

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['positive', 'negative', 'neutral'], [pos, neg, neu],
       color=['green', 'red', 'gray'])
ax.set_title('Anchor sampling (200 dummy anchors, sample image)')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('images/anchor_stats.png', dpi=100, bbox_inches='tight')
plt.show()
print(f"Pos: {pos}, Neg: {neg}, Neutral: {neu}")

In [None]:
# Inspection: class distribution across 200 streamed samples
import os

# savefig does not create missing directories; make sure the target exists.
os.makedirs('images', exist_ok=True)

# Built locally so this cell does not depend on the visualization cell.
# Labels are 1-indexed (0 = background), hence the shift by one.
cat_names = {i + 1: name for i, name in enumerate(COCO_NAMES)}

sample_ds = COCOStreamDataset(split='train', max_samples=200, img_size=IMG_SIZE)
all_labels = []
for _, t in sample_ds:
    all_labels.extend(t['labels'].tolist())

# most_common(20) returns the 20 most frequent (label, count) pairs in
# descending order of count.
counter = Counter(all_labels)
top20 = counter.most_common(20)
names  = [cat_names.get(k, str(k)) for k, _ in top20]
counts = [v for _, v in top20]

fig, ax = plt.subplots(figsize=(14, 4))
ax.bar(names, counts)
ax.set_title('Top-20 categories by annotation count (200 COCO train samples)')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('images/class_distribution.png', dpi=100, bbox_inches='tight')
plt.show()