In [1]:
from PIL import Image
from torchvision import transforms
from src.backbone import Backbone
from src.fpn import FPN
from src.rpn import RPN
import torch

In [2]:
# Maps each feature-map name to the backbone layer index it is tapped from.
# NOTE(review): the indices presumably address positions in the backbone's
# sequential layer list — confirm against src.backbone.
output_layer_map = {
    'conv3': 16,
    'conv4': 23,
    'conv5': 30
}

backbone = Backbone(output_layer_map)

# Image.open is lazy and keeps the file handle open until the pixel data is
# read. The context manager closes the handle deterministically, and
# convert("RGB") both forces the pixels to load and guarantees exactly three
# channels, so the 3-channel Normalize in the preprocessing cell cannot fail
# on RGBA / palette / grayscale PNGs.
with Image.open("../data/dog.png") as source_image:
    dog_img = source_image.convert("RGB")



conv3 16
conv4 23
conv5 30


In [3]:
# ImageNet per-channel statistics used to normalise the input tensor.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Resize -> centre crop -> tensor conversion -> channel-wise normalisation.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Run the pipeline on the image, then prepend a batch axis of size 1.
image_tensor = preprocess(dog_img)
preprocessed_img = image_tensor.unsqueeze(0)

# Displayed as the cell result to sanity-check the input shape.
preprocessed_img.size()

torch.Size([1, 3, 224, 224])

In [4]:
# Forward pass through the backbone. The result is a mapping from feature-map
# name to tensor (its keys and sizes are inspected in the following cells).
features = backbone(preprocessed_img)

In [5]:
# Show which named feature maps the backbone returned.
features.keys()

dict_keys(['conv3', 'conv4', 'conv5'])

In [6]:
# Print the shape of each backbone feature map, one per line.
# A loop is used instead of a tuple of print() calls: the tuple form made the
# cell's last expression evaluate to (None, None, None), which the notebook
# then displayed as a meaningless result.
for level in ('conv3', 'conv4', 'conv5'):
    print(features[level].size())

torch.Size([1, 256, 28, 28])
torch.Size([1, 512, 14, 14])
torch.Size([1, 512, 7, 7])


(None, None, None)

In [7]:
# Build the feature pyramid network and run it over the backbone features.
fpn = FPN()
aligned_features = fpn(features)

# Print the shape of each FPN output level, one per line.
# A loop is used instead of a tuple of print() calls: the tuple form made the
# cell's last expression evaluate to (None, None, None), which the notebook
# then displayed as a meaningless result.
for level in ('conv3', 'conv4', 'conv5'):
    print(aligned_features[level].size())

torch.Size([1, 256, 28, 28])
torch.Size([1, 256, 14, 14])
torch.Size([1, 256, 7, 7])


(None, None, None)

In [10]:
# A single RPN instance is applied to every pyramid level; results are keyed
# by the same level names as aligned_features.
rpn = RPN()
rpn_out = {
    level_name: rpn(level_features)
    for level_name, level_features in aligned_features.items()
}

In [13]:
# Shapes of both RPN outputs for every level: all levels for output 0 first,
# then all levels for output 1.
# NOTE(review): output 0 / output 1 are presumably objectness scores and box
# regression deltas respectively — confirm against src.rpn.
tuple(
    rpn_out[level_name][output_index].size()
    for output_index in (0, 1)
    for level_name in ('conv3', 'conv4', 'conv5')
)

(torch.Size([1, 18, 28, 28]),
 torch.Size([1, 18, 14, 14]),
 torch.Size([1, 18, 7, 7]),
 torch.Size([1, 36, 28, 28]),
 torch.Size([1, 36, 14, 14]),
 torch.Size([1, 36, 7, 7]))