# Backbone: ResNet50 + Feature Pyramid Network

*Notebook 2 of 6 in the Faster R-CNN from-scratch series*

We build the feature extractor from scratch: a ResNet50 made of bottleneck blocks,
followed by an FPN that produces P2–P5 feature maps at strides 4, 8, 16, 32
(plus a P6 at stride 64, obtained by subsampling P5).

In [None]:
from pathlib import Path
from typing import List, Optional, Sequence

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 -> 3x3 -> 1x1 with optional downsample.

    The first 1x1 conv projects the input to ``mid_channels``, the 3x3 conv
    (which carries the stride) operates at that width, and the final 1x1 conv
    expands to ``mid_channels * expansion`` channels. The result is added to
    the shortcut branch and passed through a ReLU.

    Args:
        in_channels: Number of channels of the input tensor.
        mid_channels: Width of the bottleneck (channels of the 3x3 conv).
        stride: Stride of the 3x3 convolution.
        downsample: Optional module applied to the block input to build the
            shortcut branch. Its output must be addable to the main branch.
            ``None`` means the input itself is used as the shortcut.
    """
    expansion = 4

    def __init__(self, in_channels: int, mid_channels: int, stride: int = 1,
                 downsample: Optional[nn.Module] = None):
        super().__init__()
        out_channels = mid_channels * self.expansion

        # Convs are bias-free because each one is followed by a BatchNorm.
        self.conv1 = nn.Conv2d(in_channels, mid_channels, 1, bias=False)
        self.bn1   = nn.BatchNorm2d(mid_channels)
        self.conv2 = nn.Conv2d(mid_channels, mid_channels, 3, stride=stride,
                               padding=1, bias=False)
        self.bn2   = nn.BatchNorm2d(mid_channels)
        self.conv3 = nn.Conv2d(mid_channels, out_channels, 1, bias=False)
        self.bn3   = nn.BatchNorm2d(out_channels)
        self.relu  = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``relu(main_branch(x) + shortcut(x))``."""
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        # Compare against None explicitly: the truth value of a module falls
        # back to __len__ when it defines one (nn.Sequential does), so a plain
        # `if self.downsample:` could silently skip a configured shortcut.
        if self.downsample is not None:
            identity = self.downsample(x)
        return self.relu(out + identity)

In [None]:
class ResNet50(nn.Module):
    """ResNet50 feature extractor exposing its four residual stages.

    ``forward`` returns the tuple ``(C2, C3, C4, C5)``:

    C2: stride 4,  256 channels
    C3: stride 8,  512 channels
    C4: stride 16, 1024 channels
    C5: stride 32, 2048 channels
    """

    def __init__(self):
        super().__init__()
        # Stem: 7x7 stride-2 conv + 3x3 stride-2 max-pool (overall stride 4).
        stem_ops = [
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        ]
        self.stem = nn.Sequential(*stem_ops)

        # Residual stages; each one feeds the next, so the input width of a
        # stage is 4x the bottleneck width of the previous one.
        self.layer1 = self._make_layer(in_ch=64, mid_ch=64, blocks=3, stride=1)     # C2
        self.layer2 = self._make_layer(in_ch=256, mid_ch=128, blocks=4, stride=2)   # C3
        self.layer3 = self._make_layer(in_ch=512, mid_ch=256, blocks=6, stride=2)   # C4
        self.layer4 = self._make_layer(in_ch=1024, mid_ch=512, blocks=3, stride=2)  # C5

    def _make_layer(self, in_ch: int, mid_ch: int, blocks: int, stride: int):
        """Build one stage of ``blocks`` bottlenecks.

        Only the first block sees ``in_ch`` channels and ``stride``; it gets a
        1x1-conv + BatchNorm shortcut whenever the stage changes the spatial
        size or the channel count. The remaining blocks keep the shape.
        """
        out_ch = Bottleneck.expansion * mid_ch
        shape_preserved = stride == 1 and in_ch == out_ch
        if shape_preserved:
            shortcut = None
        else:
            shortcut = nn.Sequential(
                nn.Conv2d(in_ch, out_ch, 1, stride=stride, bias=False),
                nn.BatchNorm2d(out_ch),
            )
        first_block = Bottleneck(in_ch, mid_ch, stride, shortcut)
        remaining = [Bottleneck(out_ch, mid_ch) for _ in range(blocks - 1)]
        return nn.Sequential(first_block, *remaining)

    def forward(self, x: torch.Tensor):
        """Run the stem and all stages, returning ``(c2, c3, c4, c5)``."""
        features = []
        out = self.stem(x)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            out = stage(out)
            features.append(out)
        return tuple(features)

In [None]:
# Shape sanity check of the backbone on a dummy 800x800 image.
backbone = ResNet50()
x = torch.randn(1, 3, 800, 800)
# Inference only: without no_grad the autograd graph of the whole backbone
# would be built and kept alive for as long as c2..c5 are referenced.
with torch.no_grad():
    c2, c3, c4, c5 = backbone(x)
for name, feat in zip(['C2','C3','C4','C5'], [c2, c3, c4, c5]):
    print(f"{name}: {feat.shape}")
# Expected:
# C2: torch.Size([1, 256, 200, 200])
# C3: torch.Size([1, 512, 100, 100])
# C4: torch.Size([1, 1024, 50, 50])
# C5: torch.Size([1, 2048, 25, 25])

In [None]:
class FPN(nn.Module):
    """Feature Pyramid Network.

    Takes C2-C5 from ResNet50 and produces P2-P5 (+ P6 via pooling).
    All pyramid levels have `out_channels` channels (default 256).

    P2: stride 4  (200x200 for 800x800 input)
    P3: stride 8  (100x100)
    P4: stride 16 (50x50)
    P5: stride 32 (25x25)
    P6: stride 64 (13x13) — only used for large-anchor RPN level

    Args:
        in_channels: Channel counts of C2, C3, C4 and C5, in that order.
            Must contain exactly four entries.
        out_channels: Channel count of every pyramid level.

    Raises:
        ValueError: If `in_channels` does not have exactly four entries.
    """

    def __init__(self, in_channels: Sequence[int] = (256, 512, 1024, 2048),
                 out_channels: int = 256):
        super().__init__()
        # Fail early with a clear message instead of an IndexError (too few
        # entries) or silently ignoring surplus entries (too many).
        if len(in_channels) != 4:
            raise ValueError(
                "in_channels must list the channels of C2, C3, C4 and C5 "
                f"(4 values), got {len(in_channels)}"
            )
        c2_ch, c3_ch, c4_ch, c5_ch = in_channels

        # Lateral 1x1 convs: bring every backbone level to out_channels.
        self.lat2 = nn.Conv2d(c2_ch, out_channels, 1)
        self.lat3 = nn.Conv2d(c3_ch, out_channels, 1)
        self.lat4 = nn.Conv2d(c4_ch, out_channels, 1)
        self.lat5 = nn.Conv2d(c5_ch, out_channels, 1)
        # Output 3x3 convs applied to each merged map.
        self.out2 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
        self.out3 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
        self.out4 = nn.Conv2d(out_channels, out_channels, 3, padding=1)
        self.out5 = nn.Conv2d(out_channels, out_channels, 3, padding=1)

    def forward(self, c2, c3, c4, c5):
        """Return ``(p2, p3, p4, p5, p6)`` built from the backbone features."""
        # Top-down pathway: each level is its lateral projection plus the
        # coarser merged map, resized to this level's exact spatial size
        # (using `size=` rather than a scale factor keeps odd sizes such as
        # 25 -> 50 aligned).
        l5 = self.lat5(c5)
        l4 = self.lat4(c4) + F.interpolate(l5, size=c4.shape[-2:], mode='nearest')
        l3 = self.lat3(c3) + F.interpolate(l4, size=c3.shape[-2:], mode='nearest')
        l2 = self.lat2(c2) + F.interpolate(l3, size=c2.shape[-2:], mode='nearest')

        p2 = self.out2(l2)
        p3 = self.out3(l3)
        p4 = self.out4(l4)
        p5 = self.out5(l5)
        # A 1x1 max-pool with stride 2 simply keeps every second pixel of P5.
        p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0)

        return p2, p3, p4, p5, p6

In [None]:
# Shape sanity check of the FPN on the backbone features computed above.
fpn = FPN()
# Inference only: no_grad avoids building an autograd graph that would be
# retained for as long as p2..p6 are referenced.
with torch.no_grad():
    p2, p3, p4, p5, p6 = fpn(c2, c3, c4, c5)
for name, p in zip(['P2','P3','P4','P5','P6'], [p2, p3, p4, p5, p6]):
    print(f"{name}: {p.shape}")
# Expected:
# P2: torch.Size([1, 256, 200, 200])
# P3: torch.Size([1, 256, 100, 100])
# P4: torch.Size([1, 256, 50, 50])
# P5: torch.Size([1, 256, 25, 25])
# P6: torch.Size([1, 256, 13, 13])

In [None]:
# Load one real image and pass through backbone + FPN
# (reuse COCODataset from notebook 01 or load a single image)
img = torch.randn(1, 3, 800, 800)  # replace with real image tensor
# Visualisation only: skip autograd bookkeeping for the two forward passes.
with torch.no_grad():
    c2, c3, c4, c5 = backbone(img)
    p2, p3, p4, p5, p6 = fpn(c2, c3, c4, c5)

# One panel per pyramid level, showing the channel-averaged activation.
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
for ax, (name, feat) in zip(axes, [('P2',p2),('P3',p3),('P4',p4),('P5',p5),('P6',p6)]):
    fmap = feat[0].mean(dim=0).detach().cpu().numpy()  # mean over channels
    ax.imshow(fmap, cmap='viridis')
    ax.set_title(f"{name}\n{feat.shape[-2]}x{feat.shape[-1]}")
    ax.axis('off')
plt.suptitle("FPN Feature Maps (mean activation)", fontsize=13)
# savefig does not create missing directories, so make sure the target exists.
Path("images").mkdir(parents=True, exist_ok=True)
plt.savefig("images/fpn_features.png", dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Histogram of all activation values for each of P2-P5 (first image in batch).
fig, axes = plt.subplots(1, 4, figsize=(16, 3))
for ax, (name, feat) in zip(axes, [('P2',p2),('P3',p3),('P4',p4),('P5',p5)]):
    vals = feat[0].detach().cpu().numpy().flatten()
    ax.hist(vals, bins=50, color='steelblue', alpha=0.7)
    ax.set_title(f"{name} activations")
    ax.set_xlabel("Value")
plt.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
Path("images").mkdir(parents=True, exist_ok=True)
plt.savefig("images/activation_histograms.png", dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Count every parameter tensor element of both modules.
backbone_params = sum(p.numel() for p in backbone.parameters())
fpn_params = sum(p.numel() for p in fpn.parameters())
total = backbone_params + fpn_params
print(f"Total backbone+FPN parameters: {total:,}")
# Expected: ~23.5M backbone (ResNet50 without the FC classifier head)
#           + ~3.3M FPN (4 lateral 1x1 convs + 4 output 3x3 convs) ~= 26.9M