In [2]:
import cv2
import time
import torch
import torchvision
import numpy as np
import imgaug.augmenters as iaa

In [3]:
# Build torchvision's Mask R-CNN (ResNet-50 + FPN) with pretrained weights and
# switch it to evaluation mode for inference.
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model = model.eval()

# imgaug augmenter used by preprocess() to pad images to a square ('right-bottom' position).
pad_to_square = iaa.PadToSquare(position='right-bottom')

def preprocess(image, image_size=(800, 800)):
    """Turn one BGR image into a 1 x 3 x H x W float batch.

    The image is converted to RGB, padded to a square with the module-level
    `pad_to_square`, resized to `image_size`, moved to channels-first layout
    and divided by 255.
    """
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    squared = pad_to_square(image=rgb)
    resized = cv2.resize(squared, dsize=image_size)
    # HWC array -> 1 x C x H x W tensor
    batch = torch.from_numpy(resized).unsqueeze(dim=0).permute(0, 3, 1, 2).contiguous()
    return batch.float().div(255.)

def process(samples):
    """Run the module-level `model` on `samples` with autograd disabled and return its output."""
    with torch.no_grad():
        outputs = model(samples)
    return outputs

In [5]:
image = cv2.imread('/home/phungpx/Downloads/dog.jpg')
# cv2.imread does not raise on a missing/unreadable file, it returns None;
# fail here with a clear message instead of deep inside preprocess().
if image is None:
    raise FileNotFoundError('cannot read image: /home/phungpx/Downloads/dog.jpg')
samples = preprocess(image)
t1 = time.time()
preds = process(samples)
t2 = time.time()
print(t2 - t1)

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629427478/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  keep = keep.nonzero().squeeze(1)


2.337468385696411


In [84]:
thresh_iou_nms = 0.5
thresh_score = 0.5
ratio_color_mask_on_image = 0.3  # also read by visualize_bbox() when blending the mask

# Draw on a copy so that re-running this cell does not stack annotations on `image`.
visual = image.copy()

for pred in preds:
    labels, boxes, scores, masks = pred['labels'], pred['boxes'], pred['scores'], pred['masks']

    # Class-agnostic NMS, then drop low-confidence detections.
    indices = torchvision.ops.nms(boxes, scores, thresh_iou_nms)
    labels, boxes, scores, masks = labels[indices], boxes[indices], scores[indices], masks[indices]

    indices = scores > thresh_score
    labels, boxes, scores, masks = labels[indices], boxes[indices], scores[indices], masks[indices]

    labels = labels.detach().cpu().numpy()
    boxes = boxes.detach().cpu().numpy()
    scores = scores.detach().cpu().numpy()
    masks = masks.round().to(torch.float).squeeze(dim=1).detach().cpu().numpy()

    # preprocess() padded the image to a square before resizing, so predictions are
    # mapped back to the padded square first and the padding is cropped afterwards.
    size = max(visual.shape[0], visual.shape[1])
    rw, rh = size / masks.shape[2], size / masks.shape[1]
    for (label, box, score, mask) in zip(labels, boxes, scores, masks):
        mask = cv2.resize(mask, dsize=(size, size), interpolation=cv2.INTER_NEAREST)
        mask = mask[:visual.shape[0], :visual.shape[1]]
        box = np.int32([box[0] * rw, box[1] * rh, box[2] * rw, box[3] * rh])
        # visualize_bbox() requires the mask and performs the colour blending itself.
        visual = visualize_bbox(image=visual, bbox=box, mask=mask, class_name='dog')

    cv2.imshow('image', visual)
    cv2.waitKey()
    cv2.destroyAllWindows()

In [83]:
def visualize_bbox(image, bbox, mask=None, class_name='', bbox_color=(255, 0, 0), mask_color=(0, 255, 0), text_color=(255, 255, 255), thickness=2, mask_ratio=None):
    """Draw a labelled bounding box and, optionally, a blended instance mask on `image`.

    Args:
        image: image array that is drawn on in place.
        bbox: (x1, y1, x2, y2) box corners.
        mask: optional array with the same height/width as `image`; non-zero
            pixels are tinted with `mask_color`. Skipped when None.
        class_name: text written in a filled label above the box.
        bbox_color: colour of the box outline and of the label background.
        mask_color: colour blended into the masked pixels.
        text_color: colour of the label text.
        thickness: outline thickness of the box.
        mask_ratio: blending weight of `mask_color`; when None the module-level
            `ratio_color_mask_on_image` is used.
    Returns:
        The same `image` object, with the drawings applied.
    """
    x1, y1, x2, y2 = bbox
    cv2.rectangle(img=image,
                  pt1=(int(x1), int(y1)),
                  pt2=(int(x2), int(y2)),
                  color=bbox_color,
                  thickness=thickness)
    ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)
    # Filled rectangle just above the top-left corner acts as the label background.
    cv2.rectangle(img=image,
                  pt1=(int(x1), int(y1 - 1.3 * text_height)),
                  pt2=(int(x1 + text_width), int(y1)),
                  color=bbox_color,
                  thickness=-1)
    cv2.putText(img=image,
                text=class_name,
                org=(int(x1), int(y1 - 0.3 * text_height)),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=0.35,
                color=text_color,
                lineType=cv2.LINE_AA)
    if mask is not None:
        ratio = ratio_color_mask_on_image if mask_ratio is None else mask_ratio
        region = mask.astype(dtype=bool)  # computed once and reused for read and write
        image[region] = image[region] * (1 - ratio) + np.array([mask_color], dtype=float) * ratio
    return image

## Pad To Square

In [25]:
import cv2
import numpy as np
import imgaug.augmenters as iaa

def pad_to_square(image, value=0):
    """Pad an image on its right or bottom side so that height equals width.

    Args:
        image: array of shape (H, W) or (H, W, C); any number of channels.
        value: fill value used for the padded area.
    Returns:
        The padded array (same dtype as `image`); `image` itself when it is
        already square.
    """
    height, width = image.shape[:2]
    if height == width:
        return image

    # Keep whatever trailing (channel) dimensions the input has instead of assuming 3.
    trailing_dims = tuple(image.shape[2:])
    if height > width:
        pad = np.full((height, height - width) + trailing_dims, value, dtype=image.dtype)
        return np.concatenate([image, pad], axis=1)

    pad = np.full((width - height, width) + trailing_dims, value, dtype=image.dtype)
    return np.concatenate([image, pad], axis=0)

pad_to_square_iaa = iaa.PadToSquare(position='right-bottom')

In [26]:
image = cv2.imread('/home/phungpx/Downloads/dog.jpg')
# cv2.imread returns None instead of raising when the file cannot be read.
if image is None:
    raise FileNotFoundError('cannot read image: /home/phungpx/Downloads/dog.jpg')
print(image.shape)

(602, 1200, 3)


In [27]:
import time
# Time the NumPy-based pad_to_square() defined above.
t1 = time.time()
padded_image = pad_to_square(image=image)
t2 = time.time()
print(t2 - t1)

# Show the padded result; waitKey() blocks until a key is pressed.
cv2.imshow('padding image', padded_image)
cv2.waitKey()
cv2.destroyAllWindows()

0.002534151077270508


In [28]:
# Time the imgaug PadToSquare augmenter on the same image for comparison.
t1 = time.time()
iaa_padded_image = pad_to_square_iaa(image=image)
t2 = time.time()
print(t2 - t1)

# Show the padded result; waitKey() blocks until a key is pressed.
cv2.imshow('padding image', iaa_padded_image)
cv2.waitKey()
cv2.destroyAllWindows()

0.010089397430419922


In [29]:
# compare results
# A zero sum of the element-wise difference is read as "both paddings produced the same pixels".
(padded_image - iaa_padded_image).sum()

0

## Dataset

In [7]:
# Locations of the PASCAL VOC data handed to YOLOv3Dataset below.
image_dir = 'dataset/PASCAL_VOC/images/'
label_dir = 'dataset/PASCAL_VOC/labels/'
csv_path = 'dataset/PASCAL_VOC/train.csv'
image_size = 416
S = [13, 26, 52]  # image_size // 32, image_size // 16, image_size // 8
C = 20  # presumably the number of classes (PASCAL_CLASSES has 20 entries) -- TODO confirm
# One group of three [w, h] anchors per entry of S.
# NOTE(review): values look normalized to the image size -- confirm against the dataset code.
anchors = [[[0.28, 0.22], [0.38, 0.48], [0.9, 0.78]],
           [[0.07, 0.15], [0.15, 0.11], [0.14, 0.29]],
           [[0.02, 0.03], [0.04, 0.07], [0.08, 0.06]]]
transforms = None

In [8]:
from flame.core.data.voc_dataset import YOLOv3Dataset

dataset = YOLOv3Dataset(image_dir, label_dir, csv_path, image_size, anchors, S, C, transforms=transforms)

In [9]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset=dataset, batch_size=3, shuffle=False)

In [10]:
import torch
import torchvision

def convert_cxcywh2xyxy(bboxes: torch.Tensor, image_width: int, image_height: int) -> torch.Tensor:
    '''Convert normalized centre-format boxes to absolute corner-format boxes.

    Columns 2..5 of `bboxes` are read as
    (center_x / image_width, center_y / image_height, w / image_width, h / image_height)
    and are replaced by (x1, y1, x2, y2) in pixels. Columns 0 and 1 are copied unchanged.

    Args:
        bboxes (Tensor[N, >=6]): boxes to convert; NOT modified.
        image_width: width used to scale x and w.
        image_height: height used to scale y and h.
    Returns:
        Tensor with the same shape as `bboxes` holding the converted boxes.
    '''
    # Work on a copy: writing into `bboxes` would silently change the caller's tensor
    # and convert it a second time if the function were called again on it.
    converted = bboxes.clone()

    center_x = bboxes[:, 2] * image_width
    center_y = bboxes[:, 3] * image_height
    box_width = bboxes[:, 4] * image_width
    box_height = bboxes[:, 5] * image_height

    converted[:, 2] = center_x - box_width / 2
    converted[:, 3] = center_y - box_height / 2
    converted[:, 4] = converted[:, 2] + box_width
    converted[:, 5] = converted[:, 3] + box_height

    return converted

def cells_to_bboxes(bboxes: torch.Tensor, scale_anchors: torch.Tensor,
                    S: int, is_prediction: bool = True) -> torch.Tensor:
    """
    Convert per-cell boxes of one scale into boxes relative to the entire image.
    Args:
        bboxes (Tensor[N, num_anchors, S, S, 5 + ...]): last dimension is read as
            (object score, x, y, w, h, class info...). For predictions the class info
            is one logit per class; for targets it is the class index. NOT modified.
        scale_anchors (Tensor[num_anchors, 2]): anchors of this scale, used to decode
            predicted w, h. Only its first dimension is used when `is_prediction` is False.
        S: the number of cells the image is divided in on the width (and height)
        is_prediction: whether the input is raw predictions (sigmoid / exp decoding is
            applied) or true bounding boxes (values are used as they are)
    Returns:
        converted_bboxes (Tensor[N, num_anchors * S * S, 6]):
            (class index, object score, x, y, w, h) with coordinates divided by S.
    """
    batch_size = bboxes.shape[0]
    num_anchors = scale_anchors.shape[0]
    box_coords = bboxes[..., 1:5]

    if is_prediction:
        scale_anchors = scale_anchors.reshape(1, num_anchors, 1, 1, 2)
        # Decode into new tensors; assigning back into the `box_coords` view would
        # overwrite the caller's predictions.
        xy = torch.sigmoid(box_coords[..., 0:2])  # x, y offsets inside the cell
        wh = torch.exp(box_coords[..., 2:4]) * scale_anchors  # w, h
        scores = torch.sigmoid(bboxes[..., 0:1])
        labels = torch.argmax(bboxes[..., 5:], dim=-1, keepdim=True)
    else:
        xy = box_coords[..., 0:2]
        wh = box_coords[..., 2:4]
        scores = bboxes[..., 0:1]
        labels = bboxes[..., 5:6]

    # Cell indices broadcast over batch and anchors, so any number of anchors works.
    cell_indices = torch.arange(S, device=bboxes.device)
    column_offsets = cell_indices.reshape(1, 1, 1, S, 1)  # varies along the last grid axis (x)
    row_offsets = cell_indices.reshape(1, 1, S, 1, 1)  # varies along the first grid axis (y)

    x = 1 / S * (xy[..., 0:1] + column_offsets)
    y = 1 / S * (xy[..., 1:2] + row_offsets)
    w = 1 / S * wh[..., 0:1]
    h = 1 / S * wh[..., 1:2]

    # argmax yields integer labels; cast explicitly so the concatenation has one dtype.
    labels = labels.to(x.dtype)
    converted = torch.cat([labels, scores, x, y, w, h], dim=-1)

    return converted.reshape(batch_size, num_anchors * S * S, 6)

def batched_nms(boxes: torch.Tensor, scores: torch.Tensor,
                idxs: torch.Tensor, iou_threshold: float) -> torch.Tensor:
    """Category-aware non-maximum suppression.
    Boxes are only suppressed by boxes that share their category index.
    Args:
        boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format with 0 <= x1 < x2 and 0 <= y1 < y2
        scores (Tensor[N]): score of every box
        idxs (Tensor[N]): category index of every box
        iou_threshold (float): boxes overlapping a kept box with IoU > iou_threshold are discarded
    Returns:
        keep (Tensor): int64 indices of the boxes kept by NMS, sorted in decreasing order of scores
    """
    if boxes.numel() > 0:
        # Shift the boxes of each category by a category-specific offset larger than any
        # coordinate, so boxes of different categories can never overlap.
        category_stride = boxes.max() + torch.tensor(data=1).to(boxes)
        per_box_offset = idxs.to(boxes) * category_stride
        shifted_boxes = boxes + per_box_offset[:, None]
        return torchvision.ops.nms(boxes=shifted_boxes, scores=scores, iou_threshold=iou_threshold)

    return torch.empty(size=(0,), dtype=torch.int64, device=boxes.device)

def postprocess_batched_nms(bboxes: torch.Tensor, iou_threshold: float, score_threshold: float) -> torch.Tensor:
    '''Filter boxes by score, then apply per-class NMS.

    Args:
        bboxes (Tensor[N, 6]): (class_id, score, x1, y1, x2, y2)
        iou_threshold (float): boxes overlapping a kept box of the same class with IoU > iou_threshold are dropped
        score_threshold (float): boxes with score < score_threshold are dropped
    Returns:
        bboxes (Tensor[M, 6]): (class_id, score, x1, y1, x2, y2) of the surviving boxes
    '''
    confident = bboxes[bboxes[:, 1] >= score_threshold]
    class_ids, scores, boxes = confident[:, 0], confident[:, 1], confident[:, 2:6]

    keep = batched_nms(boxes=boxes, scores=scores, idxs=class_ids, iou_threshold=iou_threshold)

    return torch.cat([class_ids[keep][:, None], scores[keep][:, None], boxes[keep]], dim=-1)

In [22]:
import cv2
import numpy as np
from pathlib import Path

PASCAL_CLASSES = {"aeroplane": 0, "bicycle": 1, "bird": 2, "boat": 3, "bottle": 4,
                  "bus": 5, "car": 6, "cat": 7, "chair": 8, "cow": 9,
                  "diningtable": 10, "dog": 11, "horse": 12, "motorbike": 13, "person": 14,
                  "pottedplant": 15, "sheep": 16, "sofa": 17, "train": 18, "tvmonitor": 19}

def visualize_targets(batch, multi_scale_anchors, scales=[13, 26, 52], iou_threshold=0, score_threshold=1., classes=PASCAL_CLASSES):
    '''Draw the ground-truth boxes of one batch on its images and show them with OpenCV.

    parameters:
        batch: (batch_samples, tuple_batch_targets, batch_infos) where
            batch_samples: Tensor [batch_size x 3 x image_size x image_size]
            tuple_batch_targets: tuple([batch_size x num_anchors x S1 x S1 x 6],
                                       [batch_size x num_anchors x S2 x S2 x 6],
                                       [batch_size x num_anchors x S3 x S3 x 6])
            batch_infos: sequence whose first item holds the image paths
        multi_scale_anchors: 3 x 3 x 2
        scales: [13, 26, 52]
        iou_threshold, score_threshold: forwarded to postprocess_batched_nms
        classes: mapping class name -> class index
    '''
    idx2class = dict((idx, label_name) for label_name, idx in classes.items())
    anchors_tensor = torch.tensor(multi_scale_anchors)
    scales_tensor = torch.tensor(scales).unsqueeze(dim=1).unsqueeze(dim=1).repeat(1, 3, 2)
    scaled_anchors = anchors_tensor * scales_tensor

    batch_samples, tuple_batch_targets, batch_infos = batch
    batch_image_paths = batch_infos[0]

    # channels-first samples -> channels-last uint8 images
    batch_images = batch_samples.permute(0, 2, 3, 1).contiguous()
    batch_images = batch_images.mul(255).to(torch.uint8).data.detach().cpu().numpy()

    # decode every scale and merge the boxes of all scales per image
    per_scale_boxes = [
        cells_to_bboxes(scale_targets, is_prediction=False, S=scale_targets.shape[2],
                        scale_anchors=scaled_anchors[scale_idx])
        for scale_idx, scale_targets in enumerate(tuple_batch_targets)
    ]
    batch_multi_scale_boxes = torch.cat(per_scale_boxes, dim=1)

    for image, multi_scale_bboxes, image_path in zip(batch_images, batch_multi_scale_boxes, batch_image_paths):
        window_name = Path(image_path).name
        print('\n')
        print(f'image: {window_name}')
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # OpenCV draws and shows BGR images
        image_height, image_width = image.shape[:2]
        bboxes = convert_cxcywh2xyxy(bboxes=multi_scale_bboxes, image_width=image_width, image_height=image_height)
        # num_bboxes: (S1 * S1 + S2 * S2 + S3 * S3) * num_anchor_per_scale
        print(f'number of bboxes before NMS: {bboxes.shape[0]}')
        bboxes = postprocess_batched_nms(bboxes=bboxes, iou_threshold=iou_threshold, score_threshold=score_threshold)
        print(f'number of bboxes after NMS: {bboxes.shape[0]}')

        for bbox in bboxes:
            label = bbox[0].item()
            score = bbox[1].item()
            x1, y1, x2, y2 = bbox[2:].to(torch.int32).data.numpy().tolist()
            print(x1, y1, x2, y2)
            cv2.rectangle(img=image, pt1=(x1, y1), pt2=(x2, y2), color=(0, 0, 255), thickness=2)
            print(f'class name: {idx2class[label]}, score: {score:.3f}, coords: x1={x1}, y1={y1}, x2={x2}, y2={y2}')
            # the window is refreshed (and waits for a key) after every drawn box
            cv2.imshow(window_name, image)
            cv2.waitKey()
            cv2.destroyAllWindows()

In [23]:
data_iter = iter(dataloader)

In [24]:
# Use the built-in next(): iterators are only guaranteed to implement __next__,
# and a `.next()` method is not part of the iterator protocol.
batch_samples, tuple_batch_targets, batch_infos = next(data_iter)
print('samples: ', batch_samples.shape)
print('target 1: ', tuple_batch_targets[0].shape)
print('target 2: ', tuple_batch_targets[1].shape)
print('target 3: ', tuple_batch_targets[2].shape)
print('path infos: ', batch_infos[0][0])
print('width infos: ', batch_infos[1][0][0])
print('height infos: ', batch_infos[1][1][0])

samples:  torch.Size([3, 3, 416, 416])
target 1:  torch.Size([3, 3, 13, 13, 6])
target 2:  torch.Size([3, 3, 26, 26, 6])
target 3:  torch.Size([3, 3, 52, 52, 6])
path infos:  dataset/PASCAL_VOC/images/000007.jpg
width infos:  tensor(500)
height infos:  tensor(333)


In [25]:
batch_infos

[('dataset/PASCAL_VOC/images/000007.jpg',
  'dataset/PASCAL_VOC/images/000009.jpg',
  'dataset/PASCAL_VOC/images/000016.jpg'),
 [tensor([500, 500, 334]), tensor([333, 375, 500])]]

In [26]:
# next(data_iter) instead of data_iter.next(): only __next__ is part of the iterator protocol.
visualize_targets(batch=next(data_iter), multi_scale_anchors=anchors, scales=S)



image: 000019.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 1
8 93 220 214
class name: cat, score: 1.000, coords: x1=8, y1=93, x2=220, y2=214


image: 000020.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 1
26 122 307 345
class name: car, score: 1.000, coords: x1=26, y1=122, x2=307, y2=345


image: 000021.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 3
0 194 150 321
class name: dog, score: 1.000, coords: x1=0, y1=194, x2=150, y2=321
173 29 278 400
class name: person, score: 1.000, coords: x1=173, y1=29, x2=278, y2=400
8 149 117 347
class name: person, score: 1.000, coords: x1=8, y1=149, x2=117, y2=347


## Inference

In [27]:
import cv2
import time
import torch
from pretrained.yolov3 import YOLOv3

weight_path = './pretrained/78.1map_0.2threshold_PASCAL.tar'
# Prefer the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = YOLOv3(in_channels=3, num_classes=20)
# The checkpoint is loaded onto the CPU first; the weights live under its 'state_dict' key.
model.load_state_dict(state_dict=torch.load(f=weight_path, map_location='cpu')['state_dict'])
model.to(device)
# Evaluation mode for inference.
model = model.eval()

In [28]:
image_paths = ['./dataset/PASCAL_VOC/images/000002.jpg', './dataset/PASCAL_VOC/images/000001.jpg']
image_size = (416, 416)

def preprocess(image_paths, image_size=(416, 416)):
    """Load images from disk and build the model input batch.

    Args:
        image_paths: iterable of image file paths.
        image_size: (width, height) every image is resized to.
    Returns:
        images: list of the original BGR images as read by OpenCV.
        samples: float Tensor [num_images x 3 x height x width], RGB, divided by 255.
    Raises:
        FileNotFoundError: if one of the paths cannot be read as an image.
    """
    images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # cv2.imread returns None (no exception) for a missing or unreadable file.
        if image is None:
            raise FileNotFoundError(f'cannot read image: {image_path}')
        images.append(image)

    samples = [cv2.cvtColor(image, cv2.COLOR_BGR2RGB) for image in images]
    samples = [cv2.resize(sample, dsize=image_size) for sample in samples]
    samples = torch.from_numpy(np.array(samples))
    samples = samples.permute(0, 3, 1, 2).contiguous()  # NHWC -> NCHW
    samples = samples.float().div(255.)

    return images, samples

In [29]:
images, samples = preprocess(image_paths=image_paths, image_size=image_size)
t1 = time.time()
# The model was moved to `device`, so the input has to live there as well;
# no_grad() avoids building the autograd graph during inference.
with torch.no_grad():
    predictions = model(samples.to(device))
t2 = time.time()
print(f'predicting time: {t2 - t1:.5f}')

predicting time: 0.89522


In [30]:
print('prediction at scale = 13: ', predictions[0].shape)
print('prediction at scale = 26: ', predictions[1].shape)
print('prediction at scale = 52: ', predictions[2].shape)

prediction at scale = 13:  torch.Size([2, 3, 13, 13, 25])
prediction at scale = 26:  torch.Size([2, 3, 26, 26, 25])
prediction at scale = 52:  torch.Size([2, 3, 52, 52, 25])


In [35]:
from pathlib import Path

PASCAL_CLASSES = {"aeroplane": 0, "bicycle": 1, "bird": 2, "boat": 3, "bottle": 4,
                  "bus": 5, "car": 6, "cat": 7, "chair": 8, "cow": 9,
                  "diningtable": 10, "dog": 11, "horse": 12, "motorbike": 13, "person": 14,
                  "pottedplant": 15, "sheep": 16, "sofa": 17, "train": 18, "tvmonitor": 19}

scales = [13, 26, 52]  # image_size // 32, image_size // 16, image_size // 8
multi_scale_anchors = [[[0.28, 0.22], [0.38, 0.48], [0.9, 0.78]],
                       [[0.07, 0.15], [0.15, 0.11], [0.14, 0.29]],
                       [[0.02, 0.03], [0.04, 0.07], [0.08, 0.06]]]

def module_time(module, module_name, *args):
    """Call `module(*args)`, print how long the call took, and return its output.

    Args:
        module: any callable.
        module_name: label printed in front of the elapsed time.
        *args: positional arguments forwarded to `module`.
    Returns:
        Whatever `module(*args)` returns.
    """
    # perf_counter() is monotonic and high-resolution, unlike time.time(), which
    # follows the wall clock and can jump when the system time is adjusted.
    start = time.perf_counter()
    output = module(*args)
    elapsed = time.perf_counter() - start
    print(f'{module_name}: {elapsed:.4f}s')
    return output

def visualize_predictions(image_paths,
                          multi_scale_anchors, scales=[13, 26, 52],
                          iou_threshold=0.5, score_threshold=0.7, classes=PASCAL_CLASSES):
    '''Run the module-level `model` on the given images and show the detected boxes.

    Args:
        image_paths: list of image file paths
        multi_scale_anchors: 3 x 3 x 2
        scales: [13, 26, 52]
        iou_threshold: IoU threshold forwarded to postprocess_batched_nms
        score_threshold: score threshold forwarded to postprocess_batched_nms
        classes: mapping class name -> class index
    Returns:
        None. Boxes are drawn on the loaded images, printed, and shown in OpenCV windows.
    '''
    idx2class = {idx: label_name for label_name, idx in classes.items()}
    multi_scale_anchors = torch.tensor(multi_scale_anchors)
    scales = torch.tensor(scales).unsqueeze(dim=1).unsqueeze(dim=1).repeat(1, 3, 2)
    multi_scale_anchors = multi_scale_anchors * scales

    images, samples = preprocess(image_paths)

    # The model may have been moved to the GPU: feed it inputs on its own device,
    # skip autograd bookkeeping, and bring the outputs back to the CPU, where the
    # anchors live and where the boxes are converted to NumPy below.
    model_device = next(model.parameters()).device
    with torch.no_grad():
        multi_scale_predictions = module_time(model, 'YOLOv3', samples.to(model_device))
    multi_scale_predictions = [predictions.detach().cpu() for predictions in multi_scale_predictions]

    batch_multi_scale_bboxes = []
    for scale_idx, predictions in enumerate(multi_scale_predictions):
        chosen_scale_anchors = multi_scale_anchors[scale_idx]
        scale = predictions.shape[2]
        # [batch_size, num_anchor_per_scale, scale, scale, 5 + num_classes]
        batch_boxes = cells_to_bboxes(predictions, is_prediction=True, S=scale, scale_anchors=chosen_scale_anchors)
        batch_multi_scale_bboxes.append(batch_boxes)

    batch_multi_scale_bboxes = torch.cat(batch_multi_scale_bboxes, dim=1)

    for image, multi_scale_bboxes, image_path in zip(images, batch_multi_scale_bboxes, image_paths):
        print('\n')
        print(f'image: {Path(image_path).name}')

        # Boxes are scaled with the size of the ORIGINAL image, so they are drawn on it directly.
        image_height, image_width = image.shape[:2]
        bboxes = convert_cxcywh2xyxy(bboxes=multi_scale_bboxes, image_width=image_width, image_height=image_height)

        # num_bboxes: (S1 * S1 + S2 * S2 + S3 * S3) * num_anchor_per_scale
        print(f'number of bboxes before NMS: {bboxes.shape[0]}')
        bboxes = postprocess_batched_nms(bboxes=bboxes, iou_threshold=iou_threshold, score_threshold=score_threshold)
        print(f'number of bboxes after NMS: {bboxes.shape[0]}')

        for bbox in bboxes:
            label, score, [x1, y1, x2, y2] = bbox[0].item(), bbox[1].item(), bbox[2:].to(torch.int32).data.numpy().tolist()
            cv2.rectangle(img=image, pt1=(x1, y1), pt2=(x2, y2), color=(0, 0, 255), thickness=2)
            print(f'class name: {idx2class[label]}, score: {score:.3f}, coords: x1={x1}, y1={y1}, x2={x2}, y2={y2}')
            cv2.imshow(Path(image_path).name, image)
            cv2.waitKey()
            cv2.destroyAllWindows()

In [36]:
visualize_predictions(image_paths=['./dataset/PASCAL_VOC/images/000017.jpg',
                                   './dataset/PASCAL_VOC/images/000001.jpg',
                                   './dataset/PASCAL_VOC/images/000004.jpg'],
                     multi_scale_anchors=multi_scale_anchors,
                     scales=scales, classes=PASCAL_CLASSES)

YOLOv3: 1.4657s


image: 000017.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 2
class name: horse, score: 0.885, coords: x1=78, y1=64, x2=418, y2=343
class name: person, score: 0.865, coords: x1=163, y1=53, x2=306, y2=204


image: 000001.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 2
class name: person, score: 0.930, coords: x1=13, y1=2, x2=345, y2=495
class name: dog, score: 0.788, coords: x1=49, y1=222, x2=186, y2=383


image: 000004.jpg
number of bboxes before NMS: 10647
number of bboxes after NMS: 8
class name: car, score: 0.893, coords: x1=12, y1=308, x2=86, y2=362
class name: car, score: 0.890, coords: x1=359, y1=323, x2=497, y2=390
class name: car, score: 0.877, coords: x1=134, y1=320, x2=189, y2=358
class name: car, score: 0.869, coords: x1=229, y1=326, x2=334, y2=371
class name: car, score: 0.864, coords: x1=107, y1=321, x2=148, y2=353
class name: car, score: 0.839, coords: x1=0, y1=321, x2=21, y2=342
class name: car, score: 0.835, co

## Model

In [7]:
from flame.core.model.darknet53 import YOLOv3

In [8]:
model = YOLOv3(in_channels=3, num_classes=20)

In [9]:
predictions = model(samples)

In [10]:
print('predict 1: ', predictions[0].shape)
print('predict 2: ', predictions[1].shape)
print('predict 3: ', predictions[2].shape)

predict 1:  torch.Size([4, 3, 13, 13, 25])
predict 2:  torch.Size([4, 3, 26, 26, 25])
predict 3:  torch.Size([4, 3, 52, 52, 25])


## Loss Function

In [11]:
from flame.core.loss.yolov3_loss import YOLOv3Loss

loss_fn = YOLOv3Loss(lambda_obj=1, lambda_noobj=10, lambda_bbox=10, lambda_class=1)

In [24]:
import torch

# detach() is the supported way to get a tensor cut off from the autograd graph
# (`.data` bypasses autograd's safety checks); clone() keeps the originals untouched.
preds = predictions[0].detach().clone()
# NOTE(review): `targets` is not defined in any cell visible here -- confirm where it comes from.
trues = targets[0].detach().clone()
# torch.tensor() already builds a fresh tensor from the list, no extra copy needed.
_anchors = torch.tensor(anchors[0])

print(loss_fn(predictions=preds, targets=trues, anchors=_anchors))

tensor(40.5704)


## torchvision 0.9

In [62]:
import time
import torch
import torchvision

In [63]:
model = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn()

In [64]:
# torch.FloatTensor(size=...) returns UNINITIALIZED memory (possibly NaN/inf), which makes
# the run non-reproducible; use a properly initialised random image instead.
x = torch.rand(1, 3, 800, 800)
model.eval()
t1 = time.time()
with torch.no_grad():  # inference only: skip autograd bookkeeping
    a = model(x)
t2 = time.time()
print(round(t2 - t1, 5))

0.36093


In [65]:
backbone = model.backbone
rpn_anchor_generator = model.rpn.anchor_generator
rpn_head = model.rpn.head
box_roi_pool = model.roi_heads.box_roi_pool

In [66]:
box_roi_pool

MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)

In [70]:
mask = torchvision.models.detection.MaskRCNN(backbone=backbone,
                                             num_classes=24,
                                             rpn_anchor_generator=rpn_anchor_generator,
                                             box_roi_pool=box_roi_pool,
                                             min_size=320,
                                             max_size=640)

In [72]:
mask

MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(320,), max_size=640, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (0): ConvBNActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): FrozenBatchNorm2d(16, eps=1e-05)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): ConvBNActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): ReLU(inplace=True)
          )
          (1): ConvBNActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): FrozenBatchNorm2d(16, eps=1e-05)
            (2): Identity()
          )
        )
      )
      (2): InvertedResidual(
   

In [57]:
class MaskrcnnMobileNetV3(torch.nn.Module):
    """Mask R-CNN assembled from the parts of torchvision's Faster R-CNN MobileNetV3-large-320-FPN.

    The backbone, RPN anchor generator and box RoI pooler are taken from
    `fasterrcnn_mobilenet_v3_large_320_fpn` and handed to `torchvision.models.detection.MaskRCNN`.
    The wrapped detector is stored in `self.model`; state-dict handling is delegated to it so
    checkpoints use the detector's own parameter names (without a `model.` prefix).

    Args:
        num_classes: number of classes passed to MaskRCNN.
        pretrained: forwarded to fasterrcnn_mobilenet_v3_large_320_fpn.
        pretrained_backbone: forwarded to fasterrcnn_mobilenet_v3_large_320_fpn.
    """

    def __init__(self, num_classes, pretrained=False, pretrained_backbone=False):
        super().__init__()
        self.model = self._maskrcnn_mobilenet_v3(num_classes, pretrained, pretrained_backbone)

    def _maskrcnn_mobilenet_v3(self, num_classes, pretrained, pretrained_backbone):
        """Build the MaskRCNN detector from the Faster R-CNN MobileNetV3 components."""
        fasterrcnn_mobilenetv3 = torchvision.models.detection.fasterrcnn_mobilenet_v3_large_320_fpn(pretrained=pretrained,
                                                                                                    pretrained_backbone=pretrained_backbone)
        mobilenet_v3 = fasterrcnn_mobilenetv3.backbone
        rpn_anchor_generator = fasterrcnn_mobilenetv3.rpn.anchor_generator
        box_roi_pool = fasterrcnn_mobilenetv3.roi_heads.box_roi_pool

        # The same RoI pooler object is used for both the box head and the mask head.
        model = torchvision.models.detection.MaskRCNN(backbone=mobilenet_v3,
                                                      num_classes=num_classes,
                                                      rpn_anchor_generator=rpn_anchor_generator,
                                                      box_roi_pool=box_roi_pool,
                                                      mask_roi_pool=box_roi_pool)

        return model

    def state_dict(self, *args, **kwargs):
        """Return the wrapped detector's state dict; standard arguments (prefix, keep_vars, ...) are forwarded."""
        return self.model.state_dict(*args, **kwargs)

    def load_state_dict(self, state_dict, *args, **kwargs):
        """Load `state_dict` into the wrapped detector and return its missing/unexpected-keys report."""
        return self.model.load_state_dict(state_dict, *args, **kwargs)

    def forward(self, x, targets=None):
        """Forward `x` (and optional `targets`) to the wrapped detector."""
        return self.model(x, targets)
 

In [58]:
model = MaskrcnnMobileNetV3(num_classes=3)

In [59]:
import time
# torch.FloatTensor(size=...) returns UNINITIALIZED memory; use a defined random input instead.
x = torch.rand(1, 3, 800, 800)
model.eval()
t1 = time.time()
with torch.no_grad():  # inference only: skip autograd bookkeeping
    model(x)
t2 = time.time()
print(t2 - t1)
# The next cell prints `outputs`, so compute the backbone feature maps here (outside the timing).
with torch.no_grad():
    outputs = backbone(x)

0.6656076908111572


In [60]:
print(outputs['0'].shape)
print(outputs['1'].shape)
print(outputs['pool'].shape)

torch.Size([1, 256, 25, 25])
torch.Size([1, 256, 25, 25])
torch.Size([1, 256, 13, 13])
