## DataLoader

In [1]:
import imgaug.augmenters as iaa
from torch.utils.data import DataLoader
from flame.core.data.pascal_dataset import PascalDataset

In [2]:
# setting
imsize = 416  # side length (pixels) passed to PascalDataset as `imsize`

# PASCAL VOC 2012: image folder, annotation folder and the split file listing sample ids.
VOC2012 = {
    'image_dir': '../efficient_det_pytorch/dataset/PASCALVOC2012/JPEGImages/',
    'label_dir': '../efficient_det_pytorch/dataset/PASCALVOC2012/Annotations/',
    'txt_path': '../efficient_det_pytorch/dataset/PASCALVOC2012/ImageSets/Segmentation/train.txt',
}

# PASCAL VOC 2007: no 'txt_path' is given here.
# NOTE(review): presumably PascalDataset then uses every file in the folders - confirm.
VOC2007 = {
    'image_dir': '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/',
    'label_dir': '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Annotations/',
}
image_extent = '.jpg'
label_extent = '.xml'

# Per-channel normalization constants handed to the dataset.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# class name -> class index
classes = {
    'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4,
    'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9,
    'diningtable': 10, 'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14,
    'pottedplant': 15, 'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19,
}

# Grid sizes of the three prediction scales, derived from `imsize` so that they
# stay consistent if the input size changes (416 -> [13, 26, 52]).
S = [imsize // 32, imsize // 16, imsize // 8]

# Number of classes, derived from the mapping above instead of being repeated.
C = len(classes)

# Anchor (width, height) pairs, one row of three anchors per scale.
anchors = [[[0.28, 0.22], [0.38, 0.48], [0.9, 0.78]],  # S = 13
           [[0.07, 0.15], [0.15, 0.11], [0.14, 0.29]],  # S = 26
           [[0.02, 0.03], [0.04, 0.07], [0.08, 0.06]]]  # S = 52

# Augmentations: horizontal flip with probability 0.5.
transforms = [
    iaa.Fliplr(p=0.5)
]

In [3]:
# Training dataset built from the settings cell above; both VOC releases are
# passed to the same PascalDataset instance.
train_dataset = PascalDataset(
    VOC2007=VOC2007,
    VOC2012=VOC2012,
    image_extent=image_extent,
    label_extent=label_extent,
    classes=classes,
    mean=mean,
    std=std,
    imsize=imsize,
    S=S,
    anchors=anchors,
    transforms=transforms,
)

- train:
	 VOC2007: 5011
	 VOC2012: 1464
	 Total: 6475


In [4]:
# Small shuffled batches are enough for visually checking the targets.
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

### Visualization

In [7]:
import cv2
from flame.core.data.visualize import to_image, draw_box

# Inverse of `classes`: class index -> class name, used to label drawn boxes.
idx2class = dict(zip(classes.values(), classes.keys()))

# Keep one iterator around so the next cell can be re-run to fetch a new batch.
iter_loader = iter(train_loader)

In [24]:
# Fetch the next batch. The built-in next() is used because Python 3 iterators
# expose __next__ and are not guaranteed to have a .next() method.
samples, targets, infos = next(iter_loader)

s1, s2, s3 = targets  # s1, s2, s3: N x 3 x S x S x 6

for sample, target in zip(samples, s3):  # choose s1 or s2 or s3
    # pixels per grid cell (sample width divided by the number of cells)
    grid_size = sample.shape[2] / target.shape[2]

    image = to_image(sample)

    # cells whose first channel equals 1 hold a box
    indices = target[:, :, :, 0] == 1  # 3 x S x S
    boxes = target[indices]  # n_boxes x 6 (boolean indexing returns a copy)

    # Row / column of every selected cell, taken directly from the mask.
    # nonzero() and boolean indexing both enumerate cells in the same row-major
    # order, so y[k], x[k] belong to boxes[k]. Matching on float values instead
    # breaks as soon as two cells share the same offset.
    _, y, x = indices.nonzero(as_tuple=True)

    boxes[:, 1] += x  # cell-relative x -> grid x
    boxes[:, 2] += y  # cell-relative y -> grid y
    boxes[:, 1:5] *= grid_size  # grid units -> pixels
    boxes[:, [1, 2]] -= boxes[:, [3, 4]] / 2  # x1, y1 = x - w / 2, y - h / 2
    boxes[:, [3, 4]] += boxes[:, [1, 2]]  # x2 = x1 + w, y2 = y1 + h

    # .int() converts to int32 without needing the `torch` name in this cell
    boxes = boxes.int().cpu().numpy().tolist()

    for box in boxes:
        draw_box(
            image, box=box[1:5], name=idx2class[box[5]],
            box_color=(0, 0, 255), text_color=(255, 0, 0)
        )
        cv2.imshow(f'visual S={target.shape[2]}', image)
        cv2.waitKey()
        cv2.destroyAllWindows()

## Model

## Anchor Generation

In [1]:
# parameters for anchor generation

# tensors created during anchor generation live on this device
device = 'cpu'

# all predicting scales (number of grid cells per side)
scales = [13, 26, 52]

# canvas size as (w, h)
image_size = (3 * 416, 3 * 416)

# width and height of the three anchor boxes used at each scale
anchors = dict(zip(
    scales,
    (
        [[116, 90], [156, 198], [373, 326]],
        [[30, 61], [62, 45], [59, 119]],
        [[10, 13], [16, 30], [33, 23]],
    ),
))

In [2]:
import torch

# scale -> tensor of anchor boxes (x1, y1, x2, y2), one box per anchor shape and grid cell
anchor_boxes = {}

for scale in scales:
    # Size of one grid cell at THIS scale. Using a fixed S here would give every
    # scale the same 13 x 13 grid.
    grid_size_x, grid_size_y = image_size[0] / scale, image_size[1] / scale

    anchor_sizes = torch.tensor(anchors[scale], dtype=torch.float, device=device)  # n_anchors x 2

    w = anchor_sizes[:, 0].view(-1, 1, 1)  # n_anchors x 1 x 1
    h = anchor_sizes[:, 1].view(-1, 1, 1)  # n_anchors x 1 x 1

    # Cell centres: (index + 0.5) * cell size. Building them from integer indices
    # always yields exactly `scale` values, which a float-step arange does not guarantee.
    cell_ids = torch.arange(scale, dtype=torch.float, device=device)
    cx = (cell_ids + 0.5) * grid_size_x  # scale
    cy = (cell_ids + 0.5) * grid_size_y  # scale

    # default ('ij') indexing: cx varies along the first grid axis, cy along the second
    cx, cy = torch.meshgrid(cx, cy)  # cx: scale x scale, cy: scale x scale  (coordinates)
    cx, cy = cx.unsqueeze(dim=0), cy.unsqueeze(dim=0)  # 1 x scale x scale

    x1, y1 = cx - w / 2, cy - h / 2  # n_anchors x scale x scale
    x2, y2 = cx + w / 2, cy + h / 2  # n_anchors x scale x scale

    boxes = torch.stack([x1, y1, x2, y2], dim=3)  # n_anchors x scale x scale x 4

    anchor_boxes[scale] = boxes


torch.Size([3, 13, 13, 4])


In [3]:
import cv2
import numpy as np

# image_size is (w, h) but a numpy image is indexed (rows, cols) = (h, w).
image = np.zeros(shape=(image_size[1], image_size[0], 3), dtype=np.uint8)

# anchor_boxes is a dict keyed by scale, so each scale's tensor is converted separately.
for scale, scale_boxes in anchor_boxes.items():
    # n_anchors x (scale * scale) x 4
    all_boxes = scale_boxes.cpu().numpy().reshape(scale_boxes.shape[0], -1, 4)

    for boxes in all_boxes:  # one anchor shape at a time
        for box in boxes:
            # plain Python ints: some OpenCV builds reject numpy integer scalars in points
            x1, y1, x2, y2 = np.int32(box).tolist()
            cv2.rectangle(
                img=image,
                pt1=(x1, y1),
                pt2=(x2, y2),
                color=(0, 0, 255),
                thickness=1
            )
            # mark the box centre
            cv2.circle(
                img=image,
                center=(int((x1 + x2) / 2),
                        int((y1 + y2) / 2)),
                radius=1,
                color=(0, 255, 0),
                thickness=-1
            )
        # boxes accumulate on the same canvas; one key press per anchor shape
        cv2.imshow('a', image)
        cv2.waitKey()
        cv2.destroyAllWindows()

## Model

In [1]:
import torch
import torch.nn as nn

# Architecture description consumed by YOLOv3._create_conv_layers:
#   tuple      -> (filters, kernel_size, stride); every conv is a "same" convolution
#   ["B", n]   -> residual block repeated n times
#   "S"        -> scale prediction block (its output is used to compute the yolo loss)
#   "U"        -> upsample the feature map and concatenate with a previous layer
config = (
    # backbone (up to the last residual block this is Darknet-53)
    [
        (32, 3, 1),
        (64, 3, 2),
        ["B", 1],
        (128, 3, 2),
        ["B", 2],
        (256, 3, 2),
        ["B", 8],
        (512, 3, 2),
        ["B", 8],
        (1024, 3, 2),
        ["B", 4],
    ]
    # first prediction branch
    + [(512, 1, 1), (1024, 3, 1), "S"]
    # upsample, then second prediction branch
    + [(256, 1, 1), "U", (256, 1, 1), (512, 3, 1), "S"]
    # upsample, then third prediction branch
    + [(128, 1, 1), "U", (128, 1, 1), (256, 3, 1), "S"]
)


class CNNBlock(nn.Module):
    """Convolution optionally followed by batch norm and LeakyReLU(0.1).

    Args:
        in_channels: number of input channels of the convolution.
        out_channels: number of output channels of the convolution.
        bn_act: when True the convolution has no bias and ``forward`` applies
            conv -> batch norm -> leaky ReLU; when False the convolution keeps
            its bias and ``forward`` returns the raw convolution output.
        **kwargs: forwarded to ``nn.Conv2d`` (kernel_size, stride, padding, ...).

    The batch-norm and activation modules are always constructed, even when
    ``bn_act`` is False, so the module's state_dict layout does not depend on it.
    """

    def __init__(self, in_channels, out_channels, bn_act=True, **kwargs):
        super().__init__()
        self.use_bn_act = bn_act
        # a bias would be redundant in front of batch norm
        self.conv = nn.Conv2d(
            in_channels=in_channels, out_channels=out_channels, bias=not bn_act, **kwargs
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels)
        self.leaky = nn.LeakyReLU(negative_slope=0.1)

    def forward(self, x):
        features = self.conv(x)
        if not self.use_bn_act:
            return features
        return self.leaky(self.bn(features))


class ResidualBlock(nn.Module):
    """Stack of ``num_repeats`` bottleneck units (1x1 conv halving the channels,
    then 3x3 conv restoring them).

    Args:
        channels: channel count of both the input and the output.
        use_residual: when True each unit's output is added to its input;
            when False the units are simply applied one after another.
        num_repeats: number of units in the stack.
    """

    def __init__(self, channels, use_residual=True, num_repeats=1):
        super().__init__()
        self.use_residual = use_residual
        self.num_repeats = num_repeats

        units = [
            nn.Sequential(
                CNNBlock(channels, channels // 2, kernel_size=1),
                CNNBlock(channels // 2, channels, kernel_size=3, padding=1),
            )
            for _ in range(num_repeats)
        ]
        self.layers = nn.ModuleList(units)

    def forward(self, x):
        for unit in self.layers:
            out = unit(x)
            x = x + out if self.use_residual else out

        return x


class ScalePrediction(nn.Module):
    """Prediction head for one scale.

    A 3x3 CNNBlock doubles the channels, then a 1x1 convolution (no batch norm /
    activation) produces ``3 * (num_classes + 5)`` channels, which ``forward``
    rearranges into ``N x 3 x H x W x (num_classes + 5)``.
    """

    def __init__(self, in_channels, num_classes):
        super().__init__()
        self.num_classes = num_classes
        hidden_channels = 2 * in_channels
        self.pred = nn.Sequential(
            CNNBlock(in_channels, hidden_channels, kernel_size=3, padding=1),
            CNNBlock(hidden_channels, (num_classes + 5) * 3, bn_act=False, kernel_size=1),
        )

    def forward(self, x):
        batch_size, height, width = x.shape[0], x.shape[2], x.shape[3]
        raw = self.pred(x)
        # split the channel axis into (3 anchors, num_classes + 5 values) ...
        raw = raw.reshape(batch_size, 3, self.num_classes + 5, height, width)
        # ... and move the per-anchor values to the last axis
        return raw.permute(0, 1, 3, 4, 2)


class YOLOv3(nn.Module):
    """YOLOv3 network assembled from the module-level ``config`` list.

    ``forward`` returns a list with one tensor per ScalePrediction layer, in the
    order those layers appear in ``config``.
    """

    def __init__(self, in_channels=3, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.layers = self._create_conv_layers()

    def forward(self, x):
        """Run the layers in order and collect the output of every prediction head."""
        outputs = []  # one entry per scale
        skips = []  # feature maps saved for the later concatenations

        for layer in self.layers:
            # a prediction head branches off: its result is stored and x is left untouched
            if isinstance(layer, ScalePrediction):
                outputs.append(layer(x))
            else:
                x = layer(x)

                if isinstance(layer, ResidualBlock) and layer.num_repeats == 8:
                    # outputs of the 8-repeat residual stacks are the route connections
                    skips.append(x)
                elif isinstance(layer, nn.Upsample):
                    # after upsampling, concatenate with the most recently saved feature map
                    x = torch.cat([x, skips.pop()], dim=1)

        return outputs

    def _create_conv_layers(self):
        """Translate ``config`` into an ``nn.ModuleList``, tracking the channel count."""
        layers = nn.ModuleList()
        channels = self.in_channels

        for spec in config:
            if isinstance(spec, tuple):
                # plain convolution: (filters, kernel_size, stride)
                filters, kernel_size, stride = spec
                padding = 1 if kernel_size == 3 else 0
                layers.append(
                    CNNBlock(
                        channels, filters,
                        kernel_size=kernel_size, stride=stride, padding=padding,
                    )
                )
                channels = filters

            elif isinstance(spec, list):
                # ["B", n]: residual stack with n repeats, channel count unchanged
                layers.append(ResidualBlock(channels, num_repeats=spec[1]))

            elif spec == "S":
                # non-residual block, 1x1 conv halving the channels, then the head
                half = channels // 2
                layers.extend([
                    ResidualBlock(channels, use_residual=False, num_repeats=1),
                    CNNBlock(channels, half, kernel_size=1),
                    ScalePrediction(half, num_classes=self.num_classes),
                ])
                channels = half

            elif spec == "U":
                layers.append(nn.Upsample(scale_factor=2))
                # forward() concatenates a saved feature map after upsampling;
                # the channel count is tripled to account for it
                channels = channels * 3

        return layers

In [2]:
# test model: a random batch must produce one output per stride with the expected shape
model = YOLOv3(num_classes=20)
dummy_tensor = torch.randn((2, 3, 416, 416))
outputs = model(dummy_tensor)

for idx, stride in enumerate((32, 16, 8)):
    assert outputs[idx].shape == (2, 3, 416 // stride, 416 // stride, 20 + 5)

print("Success!")

Success!


# Inference

## preprocessing

In [10]:
import cv2
import torch
import numpy as np

def _resize(image: np.ndarray, imsize=416) -> np.ndarray:
    """Scale ``image`` uniformly by ``imsize / longer side``.

    Args:
        image: array whose first two axes are (height, width).
        imsize: target length of the longer spatial side.

    Returns:
        The resized image; the aspect ratio is kept because the same factor is
        used for both axes.
    """
    # Only the spatial axes take part in the ratio; the channel axis must not,
    # which also matches how `preprocess` computes its scales.
    ratio = imsize / max(image.shape[:2])
    image = cv2.resize(image, (0, 0), fx=ratio, fy=ratio)
    return image

def _pad_to_square(image: np.ndarray) -> np.ndarray:
    height, width = image.shape[:2]
    max_size = max(height, width)
    image = np.pad(image, ((0, max_size - height), (0, max_size - width), (0, 0)))
    return image

def preprocess(images, imsize=416, mean=[0, 0, 0], std=[1, 1, 1], device='cpu'):
    """Turn a list of BGR images into a normalized batch tensor.

    Each image is resized, zero-padded to a square, converted BGR -> RGB, and the
    results are stacked into an ``N x 3 x H x W`` float tensor computed as
    ``(pixel / 255 - mean) / std``.

    Args:
        images: sequence of H x W x 3 BGR arrays.
        imsize: target length of the longer side after resizing.
        mean, std: three per-channel normalization values.
        device: device the batch and the normalization tensors are placed on.

    Returns:
        (images, scales, samples): the untouched input images, for every image
        the factor ``longer side / imsize``, and the batch tensor.
    """
    mean = torch.tensor(mean, dtype=torch.float, device=device).view(1, 3, 1, 1)
    std = torch.tensor(std, dtype=torch.float, device=device).view(1, 3, 1, 1)

    tensors = []
    for image in images:
        sample = _resize(image, imsize=imsize)
        sample = _pad_to_square(sample)
        sample = cv2.cvtColor(sample, cv2.COLOR_BGR2RGB)
        tensors.append(torch.from_numpy(sample))

    batch = torch.stack(tensors, dim=0).to(device)  # N x H x W x 3
    batch = batch.permute(0, 3, 1, 2).contiguous()  # N x 3 x H x W
    samples = (batch.float().div(255.) - mean) / std

    # factor that maps coordinates in the resized image back to the original image
    scales = [max(image.shape[:2]) / imsize for image in images]

    return images, scales, samples

## postprocessing

In [14]:
import torch
from torchvision import ops
from typing import Tuple, List

# postprocessing
# Anchor (width, height) pairs, three per prediction scale, in the same order as
# the model outputs.
anchors = [
    [(0.28, 0.22), (0.38, 0.48), (0.9, 0.78)],  # S = 13
    [(0.07, 0.15), (0.15, 0.11), (0.14, 0.29)],  # S = 26
    [(0.02, 0.03), (0.04, 0.07), (0.08, 0.06)],  # S = 52
]

def _empty_detection(device) -> dict:
    """Placeholder result (one dummy box, label -1, score 0) for an image without detections."""
    return {
        'boxes': torch.tensor([[0, 0, 1, 1]], dtype=torch.float, device=device),
        'labels': torch.tensor([-1], dtype=torch.int64, device=device),
        'scores': torch.tensor([0], dtype=torch.float, device=device)
    }


def inference(
        predictions: Tuple[torch.Tensor],
        anchors: List[List[Tuple[float, float]]],
        image_size: int = 416,
        iou_threshold: float = 0.5,
        score_threshold: float = 0.5
    ) -> List[dict]:
    '''Decode raw YOLO outputs into boxes, then apply score filtering and per-class NMS.

    Args:
        predictions: one tensor per scale, each N x A x S x S x (5 + n_classes)
            with last-axis layout (tp, tx, ty, tw, th, class scores...). The grid
            is assumed square (S is read from axis 2 only).
        anchors: ``anchors[i]`` holds the A (pw, ph) pairs for ``predictions[i]``,
            expressed as fractions of ``image_size``.
        image_size: side length in pixels used to scale and clamp the boxes.
        iou_threshold: IoU threshold passed to the batched NMS.
        score_threshold: only boxes with sigmoid(tp) strictly above it are kept.

    Returns:
        A list with one dict per image, each with keys
            'boxes':  n x 4 float tensor, [x1 y1 x2 y2] in pixels of the network input,
            'labels': n int64 tensor (argmax over the class scores),
            'scores': n float tensor (sigmoid of the objectness tp).
        An image without any detection gets a single placeholder entry with
        box [0, 0, 1, 1], label -1 and score 0.
    '''
    device = predictions[0].device
    batch_size = predictions[0].shape[0]

    batch_boxes, batch_labels, batch_scores = [], [], []

    for i, pred in enumerate(predictions):
        # number of anchors and grid size are read from the tensor instead of
        # being hard-coded, so any anchor count per scale is supported
        num_anchors, S = pred.shape[1], pred.shape[2]

        # anchor: 1 x A x 1 x 1 x 2
        anchor = torch.tensor(anchors[i], device=device, dtype=torch.float)  # A x 2
        anchor = anchor.reshape(1, num_anchors, 1, 1, 2)

        # cx, cy: N x A x S x S; cx holds the column index, cy the row index of each cell
        cx = torch.arange(S, device=device).repeat(batch_size, num_anchors, S, 1)
        cy = cx.permute(0, 1, 3, 2)

        # N x A x S x S -> reshape: N x (A * S * S)
        # score = sigmoid(tp)
        scores = torch.sigmoid(pred[..., 0]).reshape(batch_size, -1)

        # N x A x S x S -> reshape: N x (A * S * S)
        labels = torch.argmax(pred[..., 5:], dim=-1).reshape(batch_size, -1)

        # xy: N x A x S x S x 2 (center of bboxes, in pixels)
        # bx = sigmoid(tx) + cx, by = sigmoid(ty) + cy
        cell_size = image_size / S
        bx = (torch.sigmoid(pred[..., 1]) + cx) * cell_size
        by = (torch.sigmoid(pred[..., 2]) + cy) * cell_size
        bxy = torch.stack([bx, by], dim=-1)

        # wh: N x A x S x S x 2 (width, height of bboxes, in pixels)
        # bw = pw * e ^ tw, bh = ph * e ^ th
        bwh = (image_size * anchor) * torch.exp(pred[..., 3:5])

        # boxes (x1 y1 x2 y2 type): N x (A * S * S) x 4, clipped to the image
        boxes = torch.cat([bxy - bwh / 2, bxy + bwh / 2], dim=-1).reshape(batch_size, -1, 4)
        boxes = torch.clamp(boxes, min=0, max=image_size)

        batch_boxes.append(boxes)
        batch_labels.append(labels)
        batch_scores.append(scores)

    # concatenate all scales along the box axis
    batch_labels = torch.cat(batch_labels, dim=1)  # N x n_total
    batch_scores = torch.cat(batch_scores, dim=1)  # N x n_total
    batch_boxes = torch.cat(batch_boxes, dim=1)  # N x n_total x 4

    detections = []

    for batch_id in range(batch_size):
        score_indices = batch_scores[batch_id, :] > score_threshold

        if score_indices.sum() == 0:
            detections.append(_empty_detection(device))
            continue

        boxes = batch_boxes[batch_id, score_indices, :]  # n_boxes x 4
        labels = batch_labels[batch_id, score_indices]  # n_labels
        scores = batch_scores[batch_id, score_indices]  # n_scores

        # NMS is applied independently for every class label
        nms_indices = ops.boxes.batched_nms(
            boxes=boxes, scores=scores, idxs=labels,
            iou_threshold=iou_threshold
        )

        if nms_indices.shape[0] != 0:
            detections.append(
                {
                    'boxes': boxes[nms_indices, :],
                    'labels': labels[nms_indices],
                    'scores': scores[nms_indices]
                }
            )
        else:
            detections.append(_empty_detection(device))

    return detections

In [7]:
# load pretrained weight
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load(
    f='checkpoint/pretrained_weight/78.1map_0.2threshold_PASCAL.tar',
    map_location='cpu',
)

model = YOLOv3(num_classes=20)
# the checkpoint stores the weights under the 'state_dict' key
model.load_state_dict(state_dict=state_dict['state_dict'])
# switch to evaluation mode for inference
model = model.eval()

In [20]:
# preprocessing
import cv2
# NOTE(review): the first path has no file name / extension ('.../JPEGImages/000') -
# it looks truncated; confirm the intended image.
image_paths = [
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000',
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000012.jpg',
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000016.jpg',
]

# cv2.imread returns None instead of raising when a file cannot be read; fail
# here with the offending path rather than later inside preprocess().
images = []
for image_path in image_paths:
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f'cannot read image: {image_path}')
    images.append(image)

images, scales, samples = preprocess(images)

In [21]:
# prediction (no gradients are needed for inference)
with torch.no_grad():
    preds = model(samples)

print(f'Input Shape: {samples.shape}')
# one line per prediction scale
for idx in range(3):
    print(f'Output Shape at S={preds[idx].shape[2]}: {preds[idx].shape}')

Input Shape: torch.Size([3, 3, 416, 416])
Output Shape at S=13: torch.Size([3, 3, 13, 13, 25])
Output Shape at S=26: torch.Size([3, 3, 26, 26, 25])
Output Shape at S=52: torch.Size([3, 3, 52, 52, 25])


In [22]:
# post processing: decode the raw outputs, filter by score, then apply NMS
predictions = inference(
    preds,
    anchors,
    image_size=416,
    iou_threshold=0.5,
    score_threshold=0.5,
)

In [23]:
# Display the per-image result dicts ('boxes', 'labels', 'scores') returned by inference().
predictions

[{'boxes': tensor([[  9.4115,  11.4099, 237.5261, 397.4678],
          [178.3522, 178.9196, 293.7810, 403.9183],
          [  3.0243, 170.0002, 227.6837, 416.0000],
          [177.9559,  20.7553, 292.5116, 388.3447],
          [  0.0000,   6.5021, 101.8935, 352.0404],
          [  0.0000, 190.1166,  91.4532, 402.8295],
          [194.3734,  22.5865, 275.5323, 272.7602]]),
  'labels': tensor([14,  1,  1, 14, 14,  1, 14]),
  'scores': tensor([0.8944, 0.8688, 0.8530, 0.8383, 0.8261, 0.8107, 0.5676])},
 {'boxes': tensor([[126.4862,  70.2691, 286.6494, 227.7021]]),
  'labels': tensor([6]),
  'scores': tensor([0.8998])},
 {'boxes': tensor([[ 72.8772,  36.9785, 282.7292, 398.4085]]),
  'labels': tensor([1]),
  'scores': tensor([0.8798])}]

In [24]:
# Class names ordered by class index; the name -> index mapping is derived from it.
classes = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]
classes2idx = {class_name: idx for idx, class_name in enumerate(classes)}

In [25]:
# Draw every detection on its original image. `scale` maps coordinates of the
# network input back to the original image size.
for image, scale, pred in zip(images, scales, predictions):
    # drawing sizes follow the longer spatial side (the channel axis is excluded);
    # thickness is kept >= 1 so images smaller than 500 px still get visible lines
    longest_side = max(image.shape[:2])
    thickness = max(1, longest_side // 500)
    fontscale = longest_side / 500

    boxes = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()
    scores = pred['scores'].cpu().numpy()
    boxes = (boxes * scale).astype(np.int32)

    for box, label, score in zip(boxes, labels, scores):
        # label -1 is the "no detection" placeholder from inference(); indexing
        # classes[-1] would silently draw it as the last class
        if label == -1:
            continue

        class_name = classes[label]
        color = (np.random.randint(200, 255),
                 np.random.randint(50, 200),
                 np.random.randint(0, 150))

        # plain Python ints: some OpenCV builds reject numpy integer scalars in points
        x1, y1, x2, y2 = box.tolist()

        cv2.rectangle(
            img=image,
            pt1=(x1, y1),
            pt2=(x2, y2),
            color=color,
            thickness=thickness
        )

        cv2.putText(
            img=image,
            text=f'{class_name}: {score: .4f}',
            org=(x1, y1),
            fontFace=cv2.FONT_HERSHEY_PLAIN,
            fontScale=fontscale,
            color=color,
            thickness=thickness,
            lineType=cv2.LINE_AA)

        # the window is shown after each box; press a key to continue
        cv2.imshow(class_name, image)
        cv2.waitKey()
        cv2.destroyAllWindows()