In [1]:
import torch,torchvision
from torchvision.models.detection.rpn import RegionProposalNetwork
from torchvision.models.detection.rpn import AnchorGenerator,RPNHead
from torchvision.models.detection.generalized_rcnn import GeneralizedRCNN
from torchvision.models.detection.transform import GeneralizedRCNNTransform
from collections import OrderedDict
from torch import nn

In [2]:
class MyFasterRCNN(nn.Module):
    def __init__(self,backbone,rpn_anchor_generator=None,rpn_head=None,
                rpn_pre_nms_top_n_train=2000,min_size=800,max_size=1333,
                image_mean=None,image_std=None):
        super().__init__()
        if not hasattr(backbone, "out_channels"):
            raise ValueError(
                "backbone should contain an attribute out_channels "
                "specifying the number of output channels (assumed to be the "
                "same for all the levels)")
        assert isinstance(rpn_anchor_generator, (AnchorGenerator, type(None)))
        self.backbone=backbone
        out_channels = self.backbone.out_channels
        
        if rpn_anchor_generator is None:
            anchor_sizes = ((32,), (64,), (128,), (256,), (512,))
            aspect_ratios = ((0.5, 1.0, 2.0),) * len(anchor_sizes)
            rpn_anchor_generator = AnchorGenerator(
                anchor_sizes, aspect_ratios
            )
        if rpn_head is None:
            rpn_head = RPNHead(
                out_channels, rpn_anchor_generator.num_anchors_per_location()[0]
            )
        rpn_pre_nms_top_n = dict(training=2000, testing=1000)
        rpn_post_nms_top_n = dict(training=2000, testing=1000)
        
        self.rpn = RegionProposalNetwork(
            rpn_anchor_generator, rpn_head,
            0.7, 0.3,
            256, 0.5,
            rpn_pre_nms_top_n, rpn_post_nms_top_n, 0.7)
        
        if image_mean is None:
            image_mean = [0.485, 0.456, 0.406]
        if image_std is None:
            image_std = [0.229, 0.224, 0.225]
        
        self.transform = GeneralizedRCNNTransform(min_size, max_size, image_mean, image_std)
            
    def forward(self,images,targets=None):

        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        
        original_image_sizes = [img.shape[-2:] for img in images]
        images, targets = self.transform(images, targets)
        features = self.backbone(images.tensors)
        if isinstance(features, torch.Tensor):
            features = OrderedDict([(0, features)])
        proposals, proposal_losses = self.rpn(images, features, targets)
        return proposals

In [3]:
backbone = nn.Sequential(*list(torchvision.models.resnet101(pretrained=True).children())[:-1])

In [4]:
backbone.out_channels = 2048

In [5]:
anchor_generator = AnchorGenerator(sizes=((32, 64, 128, 256, 512),),
                                 aspect_ratios=((0.5, 1.0, 2.0),))

In [6]:
model = MyFasterRCNN(backbone,
                     rpn_anchor_generator=anchor_generator)

In [7]:
model.eval()

MyFasterRCNN(
  (backbone): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
          (0): Conv2d(64, 256,

In [8]:
x = [torch.rand(3, 600, 800)]

In [9]:
predictions = model(x)

In [10]:
predictions

[tensor([[  0.0000,   0.0000, 179.5098,  80.9051],
         [  0.0000,   0.0000,  18.5347,  34.3807],
         [  0.0000,   0.0000,   9.8876,  25.1620],
         [  0.0000,   0.0000,  16.7928,   9.1592],
         [  0.0000,   0.0000, 221.8882, 300.1678],
         [  0.0000,   0.0000,  73.6171,  83.5896],
         [  0.0000,   0.0000,  54.5584,  70.1236],
         [  0.0000,   0.0000,  81.5694,  49.2861],
         [  0.0000,   0.0000,  37.8385,  21.7164],
         [  0.0000,   0.0000, 136.4569, 130.3694],
         [  0.0000,   0.0000,  30.2523,  36.9583],
         [  0.0000,   0.0000,  50.4484, 222.4019],
         [  0.0000,   0.0000, 300.0439, 145.0930],
         [  0.0000,   0.0000, 178.3131, 407.0588]])]

In [11]:
predictions[0].shape

torch.Size([14, 4])