In [1]:
import torchvision
from torchvision.models.detection.backbone_utils import BackboneWithFPN
from torchvision.ops import misc as misc_nn_ops
from torchvision.models import resnet
from utils.utils import load_state_dict_from_url
from torchvision.models.detection import FasterRCNN
# Checkpoint download locations, keyed by model name; looked up when
# pretrained detector weights are requested.
model_urls = dict(
    fasterrcnn_resnet50_fpn_coco='https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth',
)

In [2]:
def show_frozen_layers(model):
    """Print the name of every parameter of ``model`` whose ``requires_grad`` is off.

    The names are taken from ``model.named_parameters()`` and are printed one
    per line, in iteration order, between a header line and a footer line.

    Arguments:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs where each parameter has a
            ``requires_grad`` attribute.
    """
    print("---- Frozen layers are: ")
    # Lazy filter: names are still printed as the parameters are walked.
    frozen_names = (
        param_name
        for param_name, param in model.named_parameters()
        if not param.requires_grad
    )
    for param_name in frozen_names:
        print(param_name)
    print("---------------------------")

### Torchvision Behavior
- The first conv and the layer1 block are frozen in all cases, even when neither the pretrained backbone nor the pretrained Faster R-CNN weights are used, which is incorrect behavior.

In [3]:
# Stock torchvision constructor with pretrained detector weights requested and
# the separate backbone download disabled; list what it leaves frozen.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=True,
    pretrained_backbone=False,
)
show_frozen_layers(model)

---- Frozen layers are: 
backbone.body.conv1.weight
backbone.body.layer1.0.conv1.weight
backbone.body.layer1.0.conv2.weight
backbone.body.layer1.0.conv3.weight
backbone.body.layer1.0.downsample.0.weight
backbone.body.layer1.1.conv1.weight
backbone.body.layer1.1.conv2.weight
backbone.body.layer1.1.conv3.weight
backbone.body.layer1.2.conv1.weight
backbone.body.layer1.2.conv2.weight
backbone.body.layer1.2.conv3.weight
---------------------------


In [4]:
# Stock torchvision constructor with no pretrained weights of any kind; list
# what it leaves frozen.
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(
    pretrained=False,
    pretrained_backbone=False,
)
show_frozen_layers(model)

---- Frozen layers are: 
backbone.body.conv1.weight
backbone.body.layer1.0.conv1.weight
backbone.body.layer1.0.conv2.weight
backbone.body.layer1.0.conv3.weight
backbone.body.layer1.0.downsample.0.weight
backbone.body.layer1.1.conv1.weight
backbone.body.layer1.1.conv2.weight
backbone.body.layer1.1.conv3.weight
backbone.body.layer1.2.conv1.weight
backbone.body.layer1.2.conv2.weight
backbone.body.layer1.2.conv3.weight
---------------------------


### Modified functions

In [5]:
def resnet_fpn_backbone(backbone_name, pretrained, norm_layer=misc_nn_ops.FrozenBatchNorm2d, trainable_layers=3):
    """
    Constructs a specified ResNet backbone with FPN on top of it. Freezes the specified number of layers in the backbone.

    Freezing is driven only by ``trainable_layers``; it is applied whether or not ``pretrained`` is set.
    Callers that want an unfrozen backbone must pass ``trainable_layers=5``.

    Examples::
        >>> from torchvision.models.detection.backbone_utils import resnet_fpn_backbone
        >>> backbone = resnet_fpn_backbone('resnet50', pretrained=True, trainable_layers=3)
        >>> # get some dummy image
        >>> x = torch.rand(1,3,64,64)
        >>> # compute the output
        >>> output = backbone(x)
        >>> print([(k, v.shape) for k, v in output.items()])
        >>> # returns
        >>>   [('0', torch.Size([1, 256, 16, 16])),
        >>>    ('1', torch.Size([1, 256, 8, 8])),
        >>>    ('2', torch.Size([1, 256, 4, 4])),
        >>>    ('3', torch.Size([1, 256, 2, 2])),
        >>>    ('pool', torch.Size([1, 256, 1, 1]))]

    Arguments:
        backbone_name (string): resnet architecture. Possible values are 'resnet18', 'resnet34', 'resnet50',
             'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d', 'wide_resnet50_2', 'wide_resnet101_2'
        pretrained (bool): If True, returns a model with backbone pre-trained on Imagenet
        norm_layer (torchvision.ops): it is recommended to use the default value. For details visit:
            (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
        trainable_layers (int): number of trainable (not frozen) resnet layers starting from final block.
            Valid values are between 0 and 5, with 5 meaning all backbone layers are trainable.

    Returns:
        BackboneWithFPN: the backbone wrapped so that 'layer1'..'layer4' are returned under the keys
        '0'..'3', with 256 output channels.

    Raises:
        KeyError: if ``backbone_name`` is not a name defined in ``torchvision.models.resnet``.
        AssertionError: if ``trainable_layers`` is outside the range 0..5.
    """
    backbone = resnet.__dict__[backbone_name](
        pretrained=pretrained,
        norm_layer=norm_layer)

    # select layers that wont be frozen
    assert 0 <= trainable_layers <= 5, \
        "trainable_layers must be between 0 and 5, got {}".format(trainable_layers)
    layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]
    if trainable_layers == 5:
        # 'bn1' is the stem norm layer next to 'conv1'. Without this, a
        # norm_layer with learnable parameters would keep bn1 frozen even
        # though 5 is documented as "all backbone layers are trainable".
        # With the default FrozenBatchNorm2d this is expected to be a no-op.
        layers_to_train.append('bn1')

    # Freeze every parameter that does not belong to a selected layer. This is
    # unconditional: it does not depend on `pretrained`.
    trainable_prefixes = tuple(layers_to_train)
    for name, parameter in backbone.named_parameters():
        # str.startswith with an empty tuple is False, so 0 freezes everything.
        if not name.startswith(trainable_prefixes):
            parameter.requires_grad_(False)

    return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}

    # backbone.inplanes is read as the channel count of the last stage; the
    # earlier stages are derived from it as 1/8, 1/4 and 1/2.
    in_channels_stage2 = backbone.inplanes // 8
    in_channels_list = [
        in_channels_stage2,
        in_channels_stage2 * 2,
        in_channels_stage2 * 4,
        in_channels_stage2 * 8,
    ]
    out_channels = 256
    return BackboneWithFPN(backbone, return_layers, in_channels_list, out_channels)

In [6]:
def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
                            num_classes=91, pretrained_backbone=True, trainable_backbone_layers=3, **kwargs):
    """
    Builds a Faster R-CNN detector on top of a ResNet-50-FPN backbone.

    Input: a list of image tensors, each shaped ``[C, H, W]`` with values in the ``0-1`` range.
    Images in the list may differ in size.

    The model behaves differently in training and evaluation mode.

    Training mode takes the images plus ``targets``, a list of dictionaries (one per image) holding:
        - boxes (``FloatTensor[N, 4]``): ground-truth boxes in ``[x1, y1, x2, y2]`` format, with ``x``
          values between ``0`` and ``W`` and ``y`` values between ``0`` and ``H``
        - labels (``Int64Tensor[N]``): class label of each ground-truth box
    and returns a ``Dict[Tensor]`` with the classification and regression losses of both the RPN
    and the R-CNN.

    Inference mode takes only the images and returns the post-processed predictions as a
    ``List[Dict[Tensor]]``, one entry per input image, with the fields:
        - boxes (``FloatTensor[N, 4]``): predicted boxes in ``[x1, y1, x2, y2]`` format, with ``x``
          values between ``0`` and ``W`` and ``y`` values between ``0`` and ``H``
        - labels (``Int64Tensor[N]``): predicted labels for each image
        - scores (``Tensor[N]``): score of each prediction

    Faster R-CNN is exportable to ONNX for a fixed batch size with input images of fixed size.

    Example::
        >>> model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
        >>> # For training
        >>> images, boxes = torch.rand(4, 3, 600, 1200), torch.rand(4, 11, 4)
        >>> labels = torch.randint(1, 91, (4, 11))
        >>> images = list(image for image in images)
        >>> targets = []
        >>> for i in range(len(images)):
        >>>     d = {}
        >>>     d['boxes'] = boxes[i]
        >>>     d['labels'] = labels[i]
        >>>     targets.append(d)
        >>> output = model(images, targets)
        >>> # For inference
        >>> model.eval()
        >>> x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
        >>> predictions = model(x)
        >>>
        >>> # optionally, if you want to export the model to ONNX:
        >>> torch.onnx.export(model, x, "faster_rcnn.onnx", opset_version = 11)

    Arguments:
        pretrained (bool): If True, returns a model pre-trained on COCO train2017
        progress (bool): If True, displays a progress bar of the download to stderr
        num_classes (int): number of output classes of the model (including the background)
        pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet.
            Ignored (treated as False) when ``pretrained`` is set.
        trainable_backbone_layers (int): number of trainable (not frozen) resnet layers starting from
            final block. Valid values are between 0 and 5, with 5 meaning all backbone layers are
            trainable. Overridden to 5 when neither ``pretrained`` nor ``pretrained_backbone`` is set.
        **kwargs: forwarded unchanged to ``FasterRCNN``.
    """
    assert 0 <= trainable_backbone_layers <= 5

    # With no pretrained weights at all there is nothing worth preserving,
    # so the whole backbone is left trainable.
    any_pretrained_weights = pretrained or pretrained_backbone
    if not any_pretrained_weights:
        trainable_backbone_layers = 5

    # The full detector checkpoint is loaded below when `pretrained` is set, so
    # a separate backbone download would be redundant in that case.
    backbone_pretrained = False if pretrained else pretrained_backbone

    fpn_backbone = resnet_fpn_backbone('resnet50', backbone_pretrained,
                                       trainable_layers=trainable_backbone_layers)
    detector = FasterRCNN(fpn_backbone, num_classes, **kwargs)
    if not pretrained:
        return detector

    checkpoint = load_state_dict_from_url(model_urls['fasterrcnn_resnet50_fpn_coco'],
                                          progress=progress)
    detector.load_state_dict(checkpoint)
    return detector

### New Behavior

- When trainable_backbone_layers is not passed, the behavior matches the old behavior.

In [7]:
# Modified constructor with pretrained detector weights and the default
# trainable_backbone_layers; list what it leaves frozen.
model = fasterrcnn_resnet50_fpn(
    pretrained=True,
    progress=False,
    num_classes=91,
    pretrained_backbone=False,
)
show_frozen_layers(model)

---- Frozen layers are: 
backbone.body.conv1.weight
backbone.body.layer1.0.conv1.weight
backbone.body.layer1.0.conv2.weight
backbone.body.layer1.0.conv3.weight
backbone.body.layer1.0.downsample.0.weight
backbone.body.layer1.1.conv1.weight
backbone.body.layer1.1.conv2.weight
backbone.body.layer1.1.conv3.weight
backbone.body.layer1.2.conv1.weight
backbone.body.layer1.2.conv2.weight
backbone.body.layer1.2.conv3.weight
---------------------------


- Old behavior is preserved except when both pretrained and pretrained_backbone are set to False. In that case no layers are frozen.

In [8]:
# Modified constructor with no pretrained weights of any kind; list what it
# leaves frozen.
model = fasterrcnn_resnet50_fpn(
    pretrained=False,
    progress=False,
    num_classes=91,
    pretrained_backbone=False,
)
show_frozen_layers(model)

---- Frozen layers are: 
---------------------------


- using trainable_backbone_layers to control the layers to freeze

In [9]:
# Sweep every accepted trainable_backbone_layers value (0 through 5) and list
# the parameters that remain frozen for each one.
for num_trainable in range(6):
    print('===================================================')
    print('--- {} trainable layers  -----'.format(num_trainable))

    model = fasterrcnn_resnet50_fpn(
        pretrained=True,
        progress=False,
        num_classes=91,
        pretrained_backbone=False,
        trainable_backbone_layers=num_trainable,
    )
    show_frozen_layers(model)

--- 0 trainable layers  -----
---- Frozen layers are: 
backbone.body.conv1.weight
backbone.body.layer1.0.conv1.weight
backbone.body.layer1.0.conv2.weight
backbone.body.layer1.0.conv3.weight
backbone.body.layer1.0.downsample.0.weight
backbone.body.layer1.1.conv1.weight
backbone.body.layer1.1.conv2.weight
backbone.body.layer1.1.conv3.weight
backbone.body.layer1.2.conv1.weight
backbone.body.layer1.2.conv2.weight
backbone.body.layer1.2.conv3.weight
backbone.body.layer2.0.conv1.weight
backbone.body.layer2.0.conv2.weight
backbone.body.layer2.0.conv3.weight
backbone.body.layer2.0.downsample.0.weight
backbone.body.layer2.1.conv1.weight
backbone.body.layer2.1.conv2.weight
backbone.body.layer2.1.conv3.weight
backbone.body.layer2.2.conv1.weight
backbone.body.layer2.2.conv2.weight
backbone.body.layer2.2.conv3.weight
backbone.body.layer2.3.conv1.weight
backbone.body.layer2.3.conv2.weight
backbone.body.layer2.3.conv3.weight
backbone.body.layer3.0.conv1.weight
backbone.body.layer3.0.conv2.weight
back

- checking for invalid trainable_backbone_layers value

In [10]:
# trainable_backbone_layers=7 is outside the accepted 0..5 range, so the
# constructor's range assertion is expected to fire. The error is caught and
# printed so that the notebook still completes under "Restart & Run All".
try:
    model = fasterrcnn_resnet50_fpn(pretrained=False, progress=False,
                                    num_classes=91, pretrained_backbone=False,
                                    trainable_backbone_layers=7)
except AssertionError as err:
    # Same "ExceptionName: message" shape a traceback's last line would have.
    print("{}: {}".format(type(err).__name__, err))
else:
    # Only reached if the value was accepted after all.
    show_frozen_layers(model)

AssertionError: 