# 1. Backbone

In [3]:
import torch
from PIL import Image
from torchvision import transforms

# Per-channel statistics used by Normalize (presumably the usual ImageNet
# RGB mean / std -- confirm against the checkpoint that is being evaluated).
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline: resize -> float tensor -> channel-wise normalisation.
# The recorded output of the next cell is 224 x 398, i.e. the resized image is
# not square.
transform = transforms.Compose([
    transforms.Resize(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

def preprocess(image_path: str, device: str = 'cpu') -> torch.Tensor:
    """Load an image file and convert it into a batched, normalised tensor.

    Args:
        image_path: path of the image file to open.
        device: device the returned tensor is placed on (default ``'cpu'``).

    Returns:
        The output of the module-level ``transform`` pipeline with a leading
        batch dimension of size 1 added, moved to ``device``.
    """
    # The context manager releases the file handle as soon as the pixel data
    # has been copied into the converted RGB image.
    with Image.open(image_path, mode='r') as image_file:
        image = image_file.convert('RGB')

    sample = transform(image)
    print(sample.shape)

    # Add the batch dimension and honour the requested device; the parameter
    # was previously accepted but silently ignored.
    return sample.unsqueeze(dim=0).to(device)

In [4]:
# NOTE(review): machine-specific absolute path -- point this at a local image
# before re-running the notebook.
IMAGE_PATH = '/home/phungpx/Downloads/bird.jpg'
sample = preprocess(image_path=IMAGE_PATH)

torch.Size([3, 224, 398])


In [5]:
device = 'cpu'
# NOTE(review): `model` is not defined by any cell shown above this one; it
# has to come from an earlier kernel state -- confirm which classifier is meant.
model.eval().to(device)

with torch.no_grad():
    # The input must live on the same device as the model, otherwise changing
    # `device` above raises a device-mismatch error.
    preds = model(sample.to(device))

# Softmax does not depend on the loop variable, so compute it once.
probs = torch.softmax(preds, dim=1)[0]

# Report the five highest-scoring classes as percentages.
for class_id in torch.topk(preds, k=5).indices.squeeze(0).tolist():
    score = probs[class_id].item()
    print(f'Prediction: class_id={class_id}, score={100 * score}')

Prediction: class_id=10, score=12.35792338848114
Prediction: class_id=95, score=9.344076365232468
Prediction: class_id=11, score=6.44075945019722
Prediction: class_id=15, score=3.7864774465560913
Prediction: class_id=16, score=2.5017768144607544


## 1. Backbone - EfficientNet

In [10]:
from EfficientDet.EfficientNet.back_bone import EfficientNetBackBone
# Table handed to the backbone as `R_input`, one entry per compound
# coefficient 0..8 (presumably the network input resolution -- confirm in
# EfficientNetBackBone).
R_INPUT = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]
PRETRAINED_WEIGHT_PATH = '../../../checkpoint/efficientnet_pretrained_weight/efficientnet-b0-355c32eb.pth'

feature_extractor = EfficientNetBackBone(
    compound_coef=0,
    R_input=R_INPUT,
    weight_path=PRETRAINED_WEIGHT_PATH,
)

# Count only the parameters that receive gradients.
n_backbone_params = sum(p.numel() for p in feature_extractor.parameters() if p.requires_grad)
print(f'Number of Parameters: {n_backbone_params}')

Loaded pretrained weights for efficientnet-b0
Number of Parameters: 3595388


In [33]:
import torch
# Random dummy batch: 1 image, 3 channels, 512 x 512 pixels.
input_shape = (1, 3, 512, 512)
inputs = torch.randn(input_shape)

# Run the backbone; the next cell prints the shape of every returned map.
feature_maps = feature_extractor(inputs)

In [13]:
# Show the shape of every backbone output, numbered from 1.
for level, backbone_map in enumerate(feature_maps, start=1):
    print(f'P#{level}: {backbone_map.shape}')

P#1: torch.Size([1, 16, 256, 256])
P#2: torch.Size([1, 24, 128, 128])
P#3: torch.Size([1, 40, 64, 64])
P#4: torch.Size([1, 112, 32, 32])
P#5: torch.Size([1, 320, 16, 16])


## 2. BiFPN

In [14]:
from EfficientDet.BiFPN.bifpn import BiFPN

In [15]:
# Channel counts of the three backbone maps fed to the BiFPN, keyed by
# compound coefficient.
backbone_out_channels = {
    0: [40, 112, 320],
    1: [40, 112, 320],
    2: [48, 120, 352],
    3: [48, 136, 384],
    4: [56, 160, 448],
    5: [64, 176, 512],
    6: [72, 200, 576],
    7: [72, 200, 576],
    8: [80, 224, 640],
}

# Per-coefficient tables passed as W_bifpn / D_bifpn (presumably BiFPN channel
# width and number of repeated layers -- confirm in BiFPN).
bifpn_widths = [64, 88, 112, 160, 224, 288, 384, 384, 384]
bifpn_depths = [3, 4, 5, 6, 7, 7, 8, 8, 8]

bifpn = BiFPN(
    compound_coef=0,
    backbone_out_channels=backbone_out_channels,
    W_bifpn=bifpn_widths,
    D_bifpn=bifpn_depths,
    onnx_export=False,
    epsilon=1e-4,
)

# Count only the parameters that receive gradients.
n_bifpn_params = sum(p.numel() for p in bifpn.parameters() if p.requires_grad)
print(f'Number of Parameters: {n_bifpn_params}')

Number of Parameters: 168249


In [16]:
# Only the last three backbone maps are fed to the BiFPN.
bifpn_inputs = tuple(feature_maps[-3:])
P3, P4, P5 = bifpn_inputs
pyramid_features = bifpn(feature_maps=bifpn_inputs)

In [17]:
# Show the shape of every BiFPN output, numbering the levels from 3.
for level, pyramid_map in enumerate(pyramid_features, start=3):
    print(f'P#{level}: {pyramid_map.shape}')

P#3: torch.Size([1, 64, 64, 64])
P#4: torch.Size([1, 64, 32, 32])
P#5: torch.Size([1, 64, 16, 16])
P#6: torch.Size([1, 64, 8, 8])
P#7: torch.Size([1, 64, 4, 4])


## 3. Head

In [18]:
from EfficientDet.Head.regressor import Regressor
from EfficientDet.Head.classifier import Classifier

In [19]:
# Both heads are built from the same per-coefficient tables; each head gets
# its own copy of the lists so they never share a mutable object.
N_ANCHORS = 9
HEAD_DEPTHS = [3, 3, 3, 4, 4, 4, 5, 5, 5]
HEAD_WIDTHS = [64, 88, 112, 160, 224, 288, 384, 384, 384]

classifier = Classifier(
    n_anchors=N_ANCHORS,
    n_classes=10,
    compound_coef=0,
    D_class=list(HEAD_DEPTHS),
    W_pred=list(HEAD_WIDTHS),
    onnx_export=False,
)

regressor = Regressor(
    n_anchors=N_ANCHORS,
    compound_coef=0,
    D_box=list(HEAD_DEPTHS),
    W_pred=list(HEAD_WIDTHS),
    onnx_export=False,
)

# Count only the parameters that receive gradients.
n_classifier_params = sum(p.numel() for p in classifier.parameters() if p.requires_grad)
n_regressor_params = sum(p.numel() for p in regressor.parameters() if p.requires_grad)
print(f'Number of Parameters of Classifier: {n_classifier_params}')
print(f'Number of Parameters of Regressor: {n_regressor_params}')

Number of Parameters of Classifier: 22554
Number of Parameters of Regressor: 19044


In [23]:
# Run the classification head first, then the box-regression head, on the
# same BiFPN outputs.
cls_preds, loc_preds = (head(pyramid_features) for head in (classifier, regressor))

print(f'classification: {cls_preds.shape}')
print(f'regression: {loc_preds.shape}')

classification: torch.Size([1, 49104, 10])
regression: torch.Size([1, 49104, 4])


In [25]:
# Number of anchor boxes, i.e. the size of dimension 1 of cls_preds / loc_preds.
import numpy as np

# Network input resolution per compound coefficient (same table that is passed
# to the backbone as `R_input`). The former closed form 512 + 128 * coef
# disagrees with this table for coefficients >= 5 (e.g. 1152 instead of 1280).
INPUT_SIZES = [512, 640, 768, 896, 1024, 1280, 1280, 1536, 1536]


def count_anchor_boxes(compound_coef: int, num_anchors: int = 9) -> float:
    """Return the total number of anchor boxes over all pyramid levels.

    Args:
        compound_coef: compound coefficient, an index into ``INPUT_SIZES``.
        num_anchors: anchors placed at every feature-map location.

    Returns:
        Sum over the pyramid levels of (input_size / 2 ** level) ** 2 * num_anchors,
        as a float.

    Raises:
        ValueError: if ``compound_coef`` is outside the table.
    """
    if not 0 <= compound_coef < len(INPUT_SIZES):
        raise ValueError(f'compound_coef must be in [0, {len(INPUT_SIZES) - 1}], got {compound_coef}.')

    input_size = INPUT_SIZES[compound_coef]
    # Levels 3..7 for coefficients up to 7, levels 3..8 above that.
    pyramid_levels = np.arange(3, 8) if compound_coef <= 7 else np.arange(3, 9)
    # Side length of the (square) feature map at every level.
    grid_sizes = input_size / 2 ** pyramid_levels
    return float((grid_sizes * grid_sizes * num_anchors).sum())


compound_coef = 0
num_anchors = 9

input_size = INPUT_SIZES[compound_coef]

pyramid_levels = np.arange(3, 8) if compound_coef <= 7 else np.arange(3, 9)

out_channels = count_anchor_boxes(compound_coef, num_anchors)

print(out_channels)

49104.0


## 5. EfficientDet

In [26]:
from EfficientDet.model import Model

In [27]:
# Anchor configuration handed to the detector: three scales and three
# aspect-ratio pairs.
anchor_scales = [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]
anchor_aspect_ratios = [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)]

model = Model(
    num_classes=1,
    compound_coef=0,
    scales=anchor_scales,
    aspect_ratios=anchor_aspect_ratios,
)

Loaded pretrained weights for efficientnet-b0


In [28]:
# Count only the parameters that receive gradients.
n_model_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of Parameters: {n_model_params}')

Number of Parameters: 3799970


In [29]:
import torch
# Forward a random 512 x 512 batch through the full detector and show the
# shape of each of its three outputs.
dummy_batch = torch.rand(1, 3, 512, 512)
cls_preds, loc_preds, anchors = model(inputs=dummy_batch)

for detector_output in (cls_preds, loc_preds, anchors):
    print(detector_output.shape)

torch.Size([1, 49104, 1])
torch.Size([1, 49104, 4])
torch.Size([1, 49104, 4])


## Anchor Generation

In [1]:
import torch
import itertools
import numpy as np
from torch import nn
from typing import List, Tuple


class AnchorGeneration(nn.Module):
    """Generate anchor boxes for every location of every pyramid feature map.

    For each feature map the stride is ``image_size // grid_size``. One anchor
    is centred on every stride cell for each (scale, aspect_ratio) pair, with
    base side ``anchor_scale * stride * scale``; the width is multiplied by
    ``sqrt(aspect_ratio)`` and the height divided by it.

    Args:
        debug: when True, draw the anchors with OpenCV and show them in a
            blocking window after every (scale, aspect_ratio) pair.
        compound_coef: compound coefficient; 7 selects an anchor scale of 5,
            every other value an anchor scale of 4.
        scales: multipliers applied to the base anchor size.
        aspect_ratios: width / height ratios of the anchors.
    """

    def __init__(self,
                 debug: bool = False,
                 compound_coef: int = 0,
                 scales: List[float] = [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)],
                 aspect_ratios: List[float] = [0.5, 1., 2.]) -> None:
        super(AnchorGeneration, self).__init__()
        self.anchor_scale = 4. if compound_coef != 7 else 5.

        self.debug = debug
        # Copy the sequences so an instance never aliases the shared default
        # lists (mutating one instance would otherwise change the defaults).
        self.scales = list(scales)
        self.aspect_ratios = list(aspect_ratios)

    def forward(self, inputs: torch.Tensor, pyramid_features: Tuple[torch.Tensor]) -> torch.Tensor:
        """Build the anchors for ``inputs`` and the given feature maps.

        Args:
            inputs: tensor whose last two dimensions are (height, width) of
                the image.
            pyramid_features: sequence of tensors whose last two dimensions
                are the grid (height, width) of each pyramid level.

        Returns:
            Tensor of shape (1, N, 4) holding ``(y1, x1, y2, x2)`` boxes, with
            the dtype and device of the first feature map. Anchors are
            concatenated level by level; inside a level they are grouped by
            (scale, aspect_ratio) pair and, inside a group, ordered with x
            varying slowest.
            NOTE(review): confirm this ordering matches how the prediction
            heads flatten their outputs.

        Raises:
            ValueError: if no feature map is given, if a feature map is empty
                or larger than the input, or if the input size is not a
                multiple of the resulting stride.
        """
        if len(pyramid_features) == 0:
            raise ValueError('pyramid_features must contain at least one feature map.')

        image_height, image_width = inputs.shape[-2:]
        dtype, device = pyramid_features[0].dtype, pyramid_features[0].device

        if self.debug:
            import cv2
            visual_image = np.zeros(shape=(image_height, image_width, 3), dtype=np.uint8)

        anchors_over_all_pyramid_features = []
        for pyramid_feature in pyramid_features:
            grid_height, grid_width = pyramid_feature.shape[-2:]

            # A zero-sized or oversized grid would produce a zero stride (or a
            # division by zero); fail with a clear message instead.
            if not (0 < grid_height <= image_height and 0 < grid_width <= image_width):
                raise ValueError('feature map size must be positive and not larger than the input size.')

            stride_height = image_height // grid_height
            stride_width = image_width // grid_width
            stride = [stride_height, stride_width]

            # Stride-level check, done once per level rather than per anchor shape.
            if (image_height % stride_height != 0) or (image_width % stride_width != 0):
                raise ValueError('input size must be divided by the stride.')

            # Anchor centres depend only on the stride, so build them once per level.
            shift_x = torch.arange(
                start=stride_width / 2, end=image_width, step=stride_width,
                dtype=torch.float32, device=device
            )
            shift_y = torch.arange(
                start=stride_height / 2, end=image_height, step=stride_height,
                dtype=torch.float32, device=device
            )
            # Same values and order as flattening torch.meshgrid(shift_x, shift_y)
            # with 'ij' indexing, without relying on its deprecated default.
            center_x = shift_x.repeat_interleave(shift_y.numel())
            center_y = shift_y.repeat(shift_x.numel())

            anchors_per_pyramid_feature = []
            for scale, aspect_ratio in itertools.product(self.scales, self.aspect_ratios):
                base_anchor_width = self.anchor_scale * stride_width * scale
                base_anchor_height = self.anchor_scale * stride_height * scale

                ratio_sqrt = float(np.sqrt(aspect_ratio))
                anchor_width = base_anchor_width * ratio_sqrt
                anchor_height = base_anchor_height * (1 / ratio_sqrt)

                # y1, x1, y2, x2
                anchors = torch.stack(
                    (center_y - anchor_height / 2.,
                     center_x - anchor_width / 2.,
                     center_y + anchor_height / 2.,
                     center_x + anchor_width / 2.),
                    dim=1
                )

                anchors_per_pyramid_feature.append(anchors)

                if self.debug:
                    # Move to host memory first so drawing also works for
                    # anchors that were created on a GPU.
                    for y1, x1, y2, x2 in anchors.detach().cpu().tolist():
                        cv2.rectangle(
                            img=visual_image,
                            pt1=(int(round(x1)), int(round(y1))),
                            pt2=(int(round(x2)), int(round(y2))),
                            color=(255, 255, 255),
                            thickness=1
                        )
                    cv2.imshow(f'visual_at_stride_#{stride}', visual_image)
                    cv2.waitKey()
                    cv2.destroyAllWindows()

            anchors_over_all_pyramid_features.append(torch.cat(anchors_per_pyramid_feature, dim=0))

        anchor_boxes = torch.cat(anchors_over_all_pyramid_features, dim=0).to(device=device, dtype=dtype)

        return anchor_boxes.unsqueeze(0)

In [3]:
# Visual check of the generator: debug=True opens a blocking OpenCV window
# after every (scale, aspect_ratio) pair, so this cell needs a display.
anchor_generator = AnchorGeneration(
    debug=True,
    compound_coef=0,
    scales=[1 / 16, 1 / 8, 1 / 4],
    aspect_ratios=[1 / 3, 0.5, 1., 2., 3.],
)

# Dummy 512 x 512 image with two coarse feature maps (4 x 4 and 8 x 8 grids).
inputs = torch.rand(1, 3, 512, 512)
pyramid_features = [torch.rand(1, 3, grid, grid) for grid in (4, 8)]

anchor_boxes = anchor_generator(inputs, pyramid_features)