# 1. DataLoader

## 1.1 Dataset

In [1]:
import imgaug.augmenters as iaa
from torch.utils.data import DataLoader
from flame.core.data.pascal_dataset import PascalDataset

In [2]:
# setting
imsize = 416  # side length of the square network input, in pixels

# PASCAL VOC 2012: image folder, annotation folder and a txt file under ImageSets/
VOC2012 = {
    'image_dir': '../efficient_det_pytorch/dataset/PASCALVOC2012/JPEGImages/',
    'label_dir': '../efficient_det_pytorch/dataset/PASCALVOC2012/Annotations/',
    'txt_path': '../efficient_det_pytorch/dataset/PASCALVOC2012/ImageSets/Segmentation/train.txt',
}

# PASCAL VOC 2007 trainval: image and annotation folders only (no 'txt_path' entry)
VOC2007 = {
    'image_dir': '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/',
    'label_dir': '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Annotations/',
}
image_extent = '.jpg'
label_extent = '.xml'
# per-channel normalisation statistics (presumably the ImageNet RGB mean / std - TODO confirm)
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
# class name -> class index
classes = {
    'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4,
    'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9,
    'diningtable': 10, 'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14,
    'pottedplant': 15, 'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19,
}
# grid sizes of the three prediction scales; derived from imsize so that the
# two settings cannot drift apart when the input size is changed
S = [imsize // 32, imsize // 16, imsize // 8]  # [13, 26, 52] for imsize = 416
# three anchor sizes per scale, listed in the same order as S
anchors = [
    [[116, 90], [156, 198], [373, 326]],  # S = 13
    [[30, 61], [62, 45], [59, 119]],      # S = 26
    [[10, 13], [16, 30], [33, 23]],       # S = 52
]
# augmentation list handed to the dataset: a single Fliplr augmenter with p=0.5
transforms = [iaa.Fliplr(p=0.5)]

In [3]:
# Build the training dataset from both the VOC2007 and VOC2012 settings defined above.
# The constructor reports how many samples it collected per source (see the cell output).
train_dataset = PascalDataset(
    VOC2007=VOC2007, VOC2012=VOC2012,
    image_extent=image_extent, label_extent=label_extent,
    classes=classes, mean=mean, std=std,
    imsize=imsize, S=S, anchors=anchors, transforms=transforms
)

- train:
	 VOC2007: 5011
	 VOC2012: 1464
	 Total: 6475


## 1.2 DataLoader

In [4]:
def collate_fn(batch):
    '''Regroup a batch from "one tuple per sample" into "one tuple per field".

    Each dataset item is a tuple of fields; the loader therefore yields
    (samples, targets, image_infos, boxes_infos), each of length batch_size.
    A named module-level function is used instead of a lambda so that the
    loader stays picklable if worker processes are enabled later.
    '''
    return tuple(zip(*batch))


train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn,
)

## 1.3 Visualization

In [5]:
import cv2
import torch
from flame.core.data.visualize import to_image, draw_box

# reverse lookup: class index -> class name
idx2class = dict(zip(classes.values(), classes.keys()))

# a single iterator over the loader, so batches can be pulled one at a time
iter_loader = iter(train_loader)

In [9]:
# Pull one batch. The builtin next() is the Python 3 iterator protocol; the
# Python 2 style `.next()` method is not available on current DataLoader iterators.
samples, targets, image_infos, boxes_infos = next(iter_loader)

In [11]:
# regroup the per-sample targets by scale and stack every scale into one batch tensor
targets = [torch.stack(per_scale, dim=0).to('cpu') for per_scale in zip(*targets)]
for scale_idx in range(3):
    print(f's{scale_idx + 1} shape: {targets[scale_idx].shape}')

s1 shape: torch.Size([2, 3, 13, 13, 6])
s2 shape: torch.Size([2, 3, 26, 26, 6])
s3 shape: torch.Size([2, 3, 52, 52, 6])


In [12]:
# stack the per-sample image tensors along a new leading batch axis, on the CPU
samples = torch.stack(samples).cpu()

In [13]:
image_infos

(['../efficient_det_pytorch/dataset/PASCALVOC2012/JPEGImages/2011_002300.jpg',
  (500, 375)],
 ['../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/009762.jpg',
  (500, 375)])

In [14]:
s1, s2, s3 = targets  # s1, s2, s3: N x 3 x S x S x 6
print(f's1 shape: {s1.shape}')
print(f's2 shape: {s2.shape}')
print(f's3 shape: {s3.shape}')

for sample, target in zip(samples, s3):  # choose s1 or s2 or s3
    # size of one grid cell in pixels: image width / number of grid columns
    grid_size = sample.shape[2] / target.shape[2]

    image = to_image(sample)

    # cells whose first channel equals 1 carry a box (presumably the objectness flag)
    indices = target[..., 0] == 1  # 3 x S x S
    boxes = target[indices]  # n_boxes x 6, a copy: `target` itself is not modified

    # Take the cell coordinates straight from the mask. Mask indexing and
    # nonzero() enumerate the True cells in the same order, so row i of `boxes`
    # belongs to (y[i], x[i]). Searching the grid for equal offset values instead
    # breaks when two boxes share an offset or when an offset is exactly 0.
    y, x = indices.nonzero(as_tuple=True)[1:]  # rows, columns

    boxes[:, 1] += x  # offset inside the cell -> position in grid units
    boxes[:, 2] += y
    boxes[:, 1:5] *= grid_size  # grid units -> pixels
    boxes[:, [1, 2]] -= boxes[:, [3, 4]] / 2  # x1, y1 = x - w / 2, y - h / 2
    boxes[:, [3, 4]] += boxes[:, [1, 2]]  # x2 = x1 + w, y2 = y1 + h

    boxes = boxes.to(torch.int32).cpu().numpy().tolist()

    for box in boxes:
        draw_box(
            image, box=box[1:5], name=idx2class[box[5]],
            box_color=(0, 0, 255), text_color=(255, 0, 0)
        )
        # NOTE: the window is shown once per drawn box (one key press per box)
        cv2.imshow(f'visual S={target.shape[2]}', image)
        cv2.waitKey()
        cv2.destroyAllWindows()

s1 shape: torch.Size([2, 3, 13, 13, 6])
s2 shape: torch.Size([2, 3, 26, 26, 6])
s3 shape: torch.Size([2, 3, 52, 52, 6])


In [15]:
image_infos

(['../efficient_det_pytorch/dataset/PASCALVOC2012/JPEGImages/2011_002300.jpg',
  (500, 375)],
 ['../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/009762.jpg',
  (500, 375)])

In [17]:
import numpy as np
# three anchors per scale, two values per anchor -> array of shape 3 x 3 x 2
ANCHORS = np.array([
    0.28, 0.22, 0.38, 0.48, 0.9, 0.78,
    0.07, 0.15, 0.15, 0.11, 0.14, 0.29,
    0.02, 0.03, 0.04, 0.07, 0.08, 0.06,
]).reshape(3, 3, 2)  # Note these have been rescaled to be between [0, 1]

In [18]:
ANCHORS * 416

array([[[116.48,  91.52],
        [158.08, 199.68],
        [374.4 , 324.48]],

       [[ 29.12,  62.4 ],
        [ 62.4 ,  45.76],
        [ 58.24, 120.64]],

       [[  8.32,  12.48],
        [ 16.64,  29.12],
        [ 33.28,  24.96]]])

# 2. Model

In [16]:
import torch
from flame.core.model.model import Model

model = Model(
    in_channels=3,
    num_classes=20,
    weight_path='checkpoint/pretrained_weight/78.1map_0.2threshold_PASCAL.tar',
    iou_threshold=0.5,
    score_threshold=0.5,
    anchors = [
      [[116, 90], [156, 198], [373, 326]],  # S = 13
      [[30, 61], [62, 45], [59, 119]],      # S = 26
      [[10, 13], [16, 30], [33, 23]],       # S = 52
    ]
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.train().to(device)

# Random, fully initialised input created directly on `device`.
# torch.FloatTensor(sizes) returns uninitialised memory (possibly NaN / inf) and,
# being a CPU tensor type, cannot be constructed with device='cuda'.
dummy_tensor = torch.rand(2, 3, 416, 416, device=device)

# forward pass, only used to check the output shapes of the three scales
preds = model(dummy_tensor)

print(preds[0].shape)  # N x 3 x S x S x (5 + C)
print(preds[1].shape)
print(preds[2].shape)

torch.Size([2, 3, 13, 13, 25])
torch.Size([2, 3, 26, 26, 25])
torch.Size([2, 3, 52, 52, 25])


# 3. Loss

## 3.1 Anchor Generation

In [11]:
# parameters for anchor generation
image_size = (416, 416)  # w, h

# all predicting scales
scales = [13, 26, 52]

# width and height of each anchor boxes at each scales, keyed by scale
anchors = dict(zip(scales, (
    [[116, 90], [156, 198], [373, 326]],
    [[30, 61], [62, 45], [59, 119]],
    [[10, 13], [16, 30], [33, 23]],
)))

device = 'cpu'

In [12]:
import torch

# scale -> tensor of anchor boxes [x1, y1, x2, y2], one box per (anchor, cell)
anchor_boxes = {}

for scale in scales:
    # size of one grid cell in pixels along x and y
    grid_size_x, grid_size_y = (image_size[0] / scale, image_size[1] / scale)

    anchor_sizes = torch.tensor(anchors[scale], dtype=torch.float, device=device)  # A x 2

    w = anchor_sizes[:, 0].view(-1, 1, 1)  # A x 1 x 1
    h = anchor_sizes[:, 1].view(-1, 1, 1)  # A x 1 x 1

    # Cell centres as (index + 0.5) * cell size. This always yields exactly
    # `scale` centres (a float-step arange can be off by one when the image size
    # is not a multiple of the scale) and keeps the grid on the same device as
    # the anchor sizes.
    cell_index = torch.arange(scale, dtype=torch.float, device=device)
    cx = (cell_index + 0.5) * grid_size_x  # scale
    cy = (cell_index + 0.5) * grid_size_y  # scale

    # Broadcast to a full grid: axis 1 runs over x positions, axis 2 over y
    # positions (the layout torch.meshgrid produces with 'ij' indexing).
    cx = cx.view(1, scale, 1).expand(1, scale, scale)  # 1 x scale x scale
    cy = cy.view(1, 1, scale).expand(1, scale, scale)  # 1 x scale x scale

    x1, y1 = cx - w / 2, cy - h / 2  # A x scale x scale
    x2, y2 = cx + w / 2, cy + h / 2  # A x scale x scale

    boxes = torch.stack([x1, y1, x2, y2], dim=3)  # A x scale x scale x 4

    anchor_boxes[scale] = boxes

In [None]:
# test anchor generation for each scales
import cv2
import numpy as np

# image_size is (w, h) whereas a numpy image is indexed (rows, cols) = (h, w)
image = np.zeros(shape=(image_size[1], image_size[0], 3), dtype=np.uint8)

for scale in scales:
    scale_boxes = anchor_boxes[scale]
    # one group of boxes per anchor size: A x (scale * scale) x 4
    all_boxes = scale_boxes.cpu().numpy().reshape(scale_boxes.shape[0], -1, 4)
    for boxes in all_boxes:
        for box in boxes:
            # plain Python ints: OpenCV point arguments do not reliably accept numpy scalars
            left, top, right, bottom = (int(value) for value in box)
            cv2.rectangle(
                img=image,
                pt1=(left, top),
                pt2=(right, bottom),
                color=(0, 0, 255),
                thickness=1
            )
            # mark the centre of the anchor box
            cv2.circle(
                img=image,
                center=(int((left + right) / 2),
                        int((top + bottom) / 2)),
                radius=1,
                color=(0, 255, 0),
                thickness=-1
            )
        # show the canvas after each anchor size; drawings accumulate on the same image
        cv2.imshow('a', image)
        cv2.waitKey()
        cv2.destroyAllWindows()

In [21]:
import torch


def compute_iou(anchors: torch.Tensor, boxes: torch.Tensor) -> torch.Tensor:
    '''Compute the IoU between every anchor box and every groundtruth box.

    Note that the two inputs use different coordinate orders.

    Args:
        anchors: [num_anchors_per_scale, S, S, 4] (S = 13 or 26 or 52), each box
            stored as [y1, x1, y2, x2]. Any leading grid shape is accepted, the
            grid does not have to be square.
        boxes: [num_boxes, 4], each box stored as [x1, y1, x2, y2]
    Output:
        ious: [num_anchors_per_scale, S, S, num_boxes], i.e. the leading shape of
            `anchors` followed by num_boxes
    references: https://pytorch.org/docs/stable/notes/broadcasting.html#broadcasting-semantics
    '''
    # remember the grid layout and flatten the anchors to [num_anchors, 4]
    grid_shape = anchors.shape[:-1]
    num_boxes = boxes.shape[0]
    anchors = anchors.reshape(-1, 4)

    # anchor coordinates as columns (num_anchors x 1) and box coordinates as rows
    # (num_boxes), so every binary operation below broadcasts to num_anchors x num_boxes
    a_y1, a_x1, a_y2, a_x2 = (anchors[:, i].unsqueeze(dim=1) for i in range(4))
    b_x1, b_y1, b_x2, b_y2 = (boxes[:, i] for i in range(4))

    # intersection: overlap along each axis, clamped at 0 for disjoint pairs
    inter_width = torch.clamp(torch.min(a_x2, b_x2) - torch.max(a_x1, b_x1), min=0.)
    inter_height = torch.clamp(torch.min(a_y2, b_y2) - torch.max(a_y1, b_y1), min=0.)
    inter_areas = inter_width * inter_height  # num_anchors x num_boxes

    # union = area(anchor) + area(box) - intersection, kept away from 0 to avoid division by zero
    area_anchors = (a_y2 - a_y1) * (a_x2 - a_x1)  # num_anchors x 1
    area_boxes = (b_x2 - b_x1) * (b_y2 - b_y1)  # num_boxes
    union_areas = torch.clamp(area_anchors + area_boxes - inter_areas, min=1e-8)

    ious = inter_areas / union_areas  # num_anchors x num_boxes

    # restore the grid layout of the anchors
    return ious.reshape(*grid_shape, num_boxes)

# test compute iou: only the output shapes are checked, so random values are enough.
# torch.rand is used because torch.FloatTensor(sizes) returns uninitialised memory.
anchors = torch.rand(3, 13, 13, 4)
boxes = torch.rand(2, 4)
ious = compute_iou(anchors, boxes)

print(f'Anchors at Scale Shape: {anchors.shape}')
print(f'Groundtruth Boxes: {boxes.shape}')
print(f'IoUs: {ious.shape}')

Anchors at Scale Shape: torch.Size([3, 13, 13, 4])
Groundtruth Boxes: torch.Size([2, 4])
IoUs: torch.Size([3, 13, 13, 2])


# 4. Inference

## 4.1 Preprocessing

In [1]:
import cv2
import torch
import numpy as np

def _resize(image: np.ndarray, imsize=416) -> np.ndarray:
    '''Resize an image, keeping its aspect ratio, so that its longer side is about `imsize`.

    Args:
        image: array whose first two axes are height and width
        imsize: target length of the longer side, in pixels
    Returns:
        the resized image
    '''
    # only height and width take part in the ratio; the channel axis is excluded,
    # matching how `preprocess` computes its scales
    ratio = imsize / max(image.shape[:2])
    image = cv2.resize(image, (0, 0), fx=ratio, fy=ratio)
    return image

def _pad_to_square(image: np.ndarray) -> np.ndarray:
    height, width = image.shape[:2]
    max_size = max(height, width)
    image = np.pad(image, ((0, max_size - height), (0, max_size - width), (0, 0)))
    return image

def preprocess(images, imsize=416, mean=[0, 0, 0], std=[1, 1, 1], device='cpu'):
    '''Turn a list of BGR images into one normalised batch tensor.

    Args:
        images: list of H x W x 3 BGR arrays (as returned by cv2.imread)
        imsize: target length of the longer image side
        mean, std: per-channel statistics applied after scaling pixels to [0, 1]
        device: device that holds the resulting batch
    Returns:
        images: the input list, unchanged
        scales: per image, longer original side / imsize
        samples: float tensor of shape N x 3 x H x W
    '''
    mean = torch.tensor(mean, dtype=torch.float, device=device).view(1, 3, 1, 1)
    std = torch.tensor(std, dtype=torch.float, device=device).view(1, 3, 1, 1)

    def _to_tensor(image):
        # resize -> pad to square -> BGR to RGB -> torch tensor (H x W x 3)
        squared = _pad_to_square(_resize(image, imsize=imsize))
        return torch.from_numpy(cv2.cvtColor(squared, cv2.COLOR_BGR2RGB))

    batch = torch.stack([_to_tensor(image) for image in images], dim=0).to(device)  # N x H x W x 3
    batch = batch.permute(0, 3, 1, 2).contiguous()  # N x 3 x H x W
    samples = (batch.float().div(255.) - mean) / std

    scales = [max(image.shape[:2]) / imsize for image in images]

    return images, scales, samples

## 4.2 Run

In [2]:
import torch
from flame.core.model.model import Model

# Build the detector with weight_path=None; the checkpoint is loaded explicitly below.
model = Model(
    in_channels=3,
    num_classes=20,
    weight_path=None,
    iou_threshold=0.5,
    score_threshold=0.5,
    anchors = [
      [[116, 90], [156, 198], [373, 326]],  # S = 13
      [[30, 61], [62, 45], [59, 119]],      # S = 26
      [[10, 13], [16, 30], [33, 23]],       # S = 52
    ]
)

# Read the checkpoint onto the CPU and load its 'state_dict' entry into the model.
weight_path = 'checkpoint/pretrained_weight/78.1map_0.2threshold_PASCAL.tar'
state_dict = torch.load(f=weight_path, map_location='cpu')
model.load_state_dict(state_dict=state_dict['state_dict'])
model = model.eval()  # evaluation mode for inference

In [3]:
# preprocessing
import cv2
image_paths = [
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000005.jpg',
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000012.jpg',
    '../efficient_det_pytorch/dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/000016.jpg',
]

images = []
for image_path in image_paths:
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when a file is missing or unreadable;
    # fail here with the offending path rather than later inside preprocess
    if image is None:
        raise FileNotFoundError(f'cannot read image: {image_path}')
    images.append(image)

# images: the original BGR arrays, scales: original / network size ratios, samples: batch tensor
images, scales, samples = preprocess(images)

In [8]:
# prediction: run the model on the preprocessed batch with gradient tracking disabled
with torch.no_grad():
    predictions = model.predict(samples)

In [9]:
predictions

[{'boxes': tensor([[135.9501, 214.1852, 221.9094, 306.2251],
          [204.0586, 162.0790, 262.1912, 271.0089],
          [  3.7298, 196.1347,  64.0971, 306.6113],
          [229.0565, 154.5116, 262.0498, 182.1058],
          [261.2530, 150.9323, 289.4495, 178.0880],
          [194.4179, 162.5784, 245.5823, 246.1445],
          [179.7555, 159.3967, 226.5517, 216.8225],
          [216.3913, 162.4353, 269.0368, 230.1576],
          [231.6277, 153.7339, 253.8889, 174.0897],
          [279.6370, 153.8638, 305.1120, 176.5713],
          [377.8582, 157.4993, 413.6432, 313.5740],
          [253.8978, 156.4704, 282.8614, 184.9576],
          [378.5215, 148.7126, 413.3721, 307.0582],
          [266.7535, 156.9698, 286.7319, 182.9836],
          [220.2417, 156.5490, 265.9907, 201.1744]]),
  'labels': tensor([ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, 14,  8,  8]),
  'scores': tensor([0.8458, 0.8184, 0.8114, 0.7521, 0.7256, 0.6656, 0.6551, 0.6264, 0.5843,
          0.5616, 0.5572, 0.5501, 0

In [10]:
# class name -> class index
classes2idx = {'aeroplane': 0, 'bicycle': 1, 'bird': 2, 'boat': 3, 'bottle': 4,
               'bus': 5, 'car': 6, 'cat': 7, 'chair': 8, 'cow': 9, 'diningtable': 10,
               'dog': 11, 'horse': 12, 'motorbike': 13, 'person': 14, 'pottedplant': 15,
               'sheep': 16, 'sofa': 17, 'train': 18, 'tvmonitor': 19}
# Class names ordered by their index, so that classes[label] is the name of `label`.
# Sorting by index keeps the lookup correct even if the dict above is reordered.
classes = sorted(classes2idx, key=classes2idx.get)

In [13]:
for image, scale, pred in zip(images, scales, predictions):
    # line width and font size grow with the image; the width never drops below 1 px
    longer_side = max(image.shape[:2])
    thickness = max(1, longer_side // 500)
    fontscale = longer_side / 500

    boxes = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()
    scores = pred['scores'].cpu().numpy()
    class_names = [classes[label] for label in labels]
    # map the boxes from network-input coordinates back to the original image
    boxes = (boxes * scale).astype(np.int32)

    # the boxes are drawn in place on `image`
    for box, score, class_name in zip(boxes, scores, class_names):
        color = (np.random.randint(200, 255),
                 np.random.randint(50, 200),
                 np.random.randint(0, 150))
        # plain Python ints: OpenCV point arguments do not reliably accept numpy scalars
        left, top, right, bottom = box.tolist()

        cv2.rectangle(
            img=image,
            pt1=(left, top),
            pt2=(right, bottom),
            color=color,
            thickness=thickness
        )

        cv2.putText(
            img=image,
            text=f'{class_name}: {score: .4f}',
            org=(left, top),
            fontFace=cv2.FONT_HERSHEY_PLAIN,
            fontScale=fontscale,
            color=color,
            thickness=thickness,
            lineType=cv2.LINE_AA)

    # A fixed window title: the loop variable `class_name` is undefined (or left
    # over from a previous image) when an image has no detections.
    cv2.imshow('prediction', image)
    cv2.waitKey()
    cv2.destroyAllWindows()