## 1. Data Loader

In [1]:
import cv2
import torch
import numpy as np
import imgaug.augmenters as iaa
from torch.utils.data import DataLoader
from flame.core.data.pascal_dataset import PascalDataset

# Pascal VOC category names; a name's position in this tuple is its label index.
voc_class_names = (
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
)
classes2idx = {name: index for index, name in enumerate(voc_class_names)}

# imgaug augmenters handed to the dataset through its `transforms` argument.
transforms = [
    iaa.Add(value=(-10, 10), per_channel=True),
    iaa.GaussianBlur(sigma=(0, 1)),
    iaa.MotionBlur(),
    iaa.JpegCompression(compression=(0, 10)),
    iaa.Fliplr(p=0.5),
    iaa.Flipud(p=0.5),
    iaa.Grayscale(alpha=(0.0, 0.1)),
    iaa.Affine(rotate=(-5, 5), shear=(-5, 5), fit_output=True),
    iaa.Crop(percent=(0, 0.1)),
    iaa.Pad(percent=(0, 0.1), keep_size=False),
    iaa.ChangeColorTemperature(),
]

# Locations of each Pascal VOC release on disk.
# VOC2012 additionally points at a split file; VOC2007 does not.
voc2012_source = {
    'image_dir': './dataset/PASCALVOC2012/JPEGImages/',
    'label_dir': './dataset/PASCALVOC2012/Annotations/',
    'txt_path': './dataset/PASCALVOC2012/ImageSets/Segmentation/train.txt',
}
voc2007_source = {
    'image_dir': './dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/JPEGImages/',
    'label_dir': 'dataset/PASCALVOC2007/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Annotations/',
}

pascal_dataset = PascalDataset(
    VOC2012=voc2012_source,
    VOC2007=voc2007_source,
    image_extent='.jpg',
    label_extent='.xml',
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
    compound_coef=0,
    classes=classes2idx,
    transforms=transforms,
)

- train:
	 VOC2007: 5011
	 VOC2012: 1464
	 Total: 6475


In [2]:
pascal_loader = DataLoader(
    pascal_dataset,
    batch_size=2,
    num_workers=2,
    pin_memory=True,
    shuffle=False,
    # Transpose the list of per-item tuples so a batch unpacks as
    # (samples, targets, sample_infos) without stacking anything.
    collate_fn=lambda x: tuple(zip(*x))
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float, device=device).view(3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float, device=device).view(3, 1, 1)
idx2class = {idx: label_name for label_name, idx in classes2idx.items()}

# Number of batches to preview before stopping.
num_preview_batches = 2

for i, pascal_data in enumerate(pascal_loader):
    samples, targets, sample_infos = pascal_data
    for sample, target, sample_info in zip(samples, targets, sample_infos):
        # `mean`/`std` live on `device`; move the sample there as well so the
        # de-normalisation never mixes tensors from different devices.
        image = (sample.to(device) * std + mean) * 255
        # Round and clamp before the uint8 cast so out-of-range values cannot wrap.
        image = image.round().clamp(0, 255).permute(1, 2, 0).contiguous()
        image = image.to(torch.uint8).cpu().numpy()
        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        # OpenCV draws in place and needs a contiguous buffer; do this once per image.
        image = np.ascontiguousarray(image)

        boxes = target['boxes'].data.cpu().numpy().astype(np.int32)
        labels = target['labels'].data.cpu().numpy().astype(np.int32)

        # NOTE(review): sample_info[1] is presumably the image size - confirm against PascalDataset.
        longest_side = max(sample_info[1])
        # Integer division yields 0 for sides below 800; keep at least a 1-pixel stroke.
        thickness = int(max(1, longest_side // 800))
        fontscale = longest_side / 800

        for box, label in zip(boxes, labels):
            # Label -1 is skipped, exactly as in the original check.
            if label == -1:
                continue

            # Plain Python ints are accepted as point coordinates by every OpenCV build.
            top_left = tuple(int(v) for v in box[:2])
            bottom_right = tuple(int(v) for v in box[2:])

            cv2.rectangle(
                img=image,
                pt1=top_left,
                pt2=bottom_right,
                color=(0, 255, 0),
                thickness=thickness
            )
            cv2.putText(
                img=image,
                text=idx2class[label.item()],
                org=top_left,
                fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                fontScale=fontscale,
                color=(0, 0, 255),
                thickness=thickness,
                lineType=cv2.LINE_AA)

        cv2.imshow(sample_info[0], image)
        cv2.waitKey()
        cv2.destroyAllWindows()

    if i + 1 >= num_preview_batches:
        break

## 2. Model

In [3]:
import time
import torch
from flame.core.model.EfficientDetV1.efficientdet import EfficientDet

compound_coef = 0

# Constructor arguments for the detector, collected in keyword order.
detector_kwargs = dict(
    pretrained_weight=f'./checkpoint/efficientdet_pretrained_weight/efficientdet-d{compound_coef}.pth',
    head_only=False,
    num_classes=20,
    compound_coef=compound_coef,
    backbone_pretrained=False,
    scales=[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)],
    aspect_ratios=[(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)],
    iou_threshold=0.2,
    score_threshold=0.2,
)

# Build the detector and switch it to training mode.
model = EfficientDet(**detector_kwargs)
model = model.train()

In [4]:
# Element counts of every parameter that still receives gradients.
trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
num_params = sum(trainable_sizes)
print(f'The Number of Parameters of Version D{compound_coef}: {num_params} parameters')

The Number of Parameters of Version D0: 3839117 parameters


In [5]:
from prettytable import PrettyTable

def count_parameters(model):
    """Print a table of trainable parameter sizes and return their total.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``requires_grad`` and ``numel()``.

    Returns:
        Sum of ``numel()`` over the parameters whose ``requires_grad`` is true.
    """
    table = PrettyTable(["Modules", "Parameters"])
    # Frozen parameters are left out of both the table and the total.
    trainable = [
        (name, parameter.numel())
        for name, parameter in model.named_parameters()
        if parameter.requires_grad
    ]
    for row_name, size in trainable:
        table.add_row([row_name, size])
    total_params = sum(size for _, size in trainable)
    print(table)
    print(f"Total Trainable Params: {total_params}")
    return total_params

In [6]:
pascal_iter = iter(pascal_loader)
# Use the built-in next(): the `.next()` method is not part of the Python 3
# iterator protocol and is absent from recent PyTorch loader iterators.
samples, targets, image_infos = next(pascal_iter)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# The inputs are sent to `device` below, so the model has to live there too.
model = model.to(device)
samples = torch.stack([sample.to(device) for sample in samples], dim=0)

# Time a single forward pass of the training-mode model.
t1 = time.time()
cls_preds, reg_preds, anchors = model(samples)
t2 = time.time()

print(f'Forward Time Processing: {t2 - t1: .4f}s')

print(f'Shape of Input Tensor: {samples.shape}')  # batch_size x 3 x h x w, (h = w = 512 + 128 * compound_coef)
print(f'Shape of Classification Tensor: {cls_preds.shape}')  # batch_size x num_anchors x num_classes
print(f'Shape of Regression Tensor: {reg_preds.shape}')  # batch_size x num_anchors x 4
print(f'Shape of Anchors Tensor: {anchors.shape}')  # 1 x (w/2^7 * h/2^7 + ... + w/2^3 * h/2^3) * 9 x 4

Forward Time Processing:  0.6454s
Shape of Input Tensor: torch.Size([2, 3, 512, 512])
Shape of Classification Tensor: torch.Size([2, 49104, 20])
Shape of Regression Tensor: torch.Size([2, 49104, 4])
Shape of Anchors Tensor: torch.Size([1, 49104, 4])


In [7]:
# Cross-check the anchor count reported by the model: every cell of the
# P3..P7 feature maps (strides 2^3..2^7) contributes 9 anchors.
h = w = 512 + 128 * compound_coef
num_anchor_boxes = 0
for level in range(3, 8):  # P3, P4, P5, P6, P7
    stride = 2 ** level
    num_anchor_boxes += (h / stride) * (w / stride) * 9
print(num_anchor_boxes)

49104.0


## 3. Loss Function

In [8]:
from flame.core.loss.focal_loss import FocalLoss

# The criterion is told which device to use: CUDA when available, CPU otherwise.
criterion_device = 'cuda' if torch.cuda.is_available() else 'cpu'
loss = FocalLoss(alpha=0.25, gamma=2.0, lamda=50.0, device=criterion_device)

In [9]:
# Apply the focal-loss criterion to the forward-pass outputs and the batch targets;
# the two returned values are unpacked as the classification and regression terms.
cls_loss, reg_loss = loss(cls_preds, reg_preds, anchors, targets)

In [10]:
# Combine both terms under a separate name: rebinding `loss` here would overwrite the
# FocalLoss criterion, and re-running the previous cell would then try to call a tensor.
total_loss = cls_loss.mean() + reg_loss.mean()
print(f'loss: {total_loss.item()}')

loss: 14590.7548828125


## 4. Inference

In [11]:
import torch
from flame.core.model.EfficientDetV1.efficientdet import EfficientDet

compound_coef = 0
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Build the 90-class detector without loading weights in the constructor;
# the checkpoint is applied explicitly below.
model = EfficientDet(
    pretrained_weight=None,
    head_only=False,
    num_classes=90,
    compound_coef=compound_coef,
    backbone_pretrained=False,
    iou_threshold=0.5,
    score_threshold=0.3
)

# Load the checkpoint onto the CPU first so it can be read on machines without a GPU.
checkpoint_path = f'./checkpoint/efficientdet_pretrained_weight/efficientdet-d{compound_coef}.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))

# The preprocessing step sends inputs to `device`, so the model must be moved there
# as well; otherwise inference fails with a device mismatch when CUDA is available.
model = model.to(device)
model = model.eval()

In [12]:
# Class names indexed by predicted label id (90 slots).
# Empty strings are placeholder slots that carry no class name.
classes = [
    # 0-9
    'person', 'bicycle', 'car', 'motorcycle', 'airplane',
    'bus', 'train', 'truck', 'boat', 'traffic light',
    # 10-19
    'fire hydrant', '', 'stop sign', 'parking meter', 'bench',
    'bird', 'cat', 'dog', 'horse', 'sheep',
    # 20-29
    'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    '', 'backpack', 'umbrella', '', '',
    # 30-39
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    # 40-49
    'skateboard', 'surfboard', 'tennis racket', 'bottle', '',
    'wine glass', 'cup', 'fork', 'knife', 'spoon',
    # 50-59
    'bowl', 'banana', 'apple', 'sandwich', 'orange',
    'broccoli', 'carrot', 'hot dog', 'pizza', 'donut',
    # 60-69
    'cake', 'chair', 'couch', 'potted plant', 'bed',
    '', 'dining table', '', '', 'toilet',
    # 70-79
    '', 'tv', 'laptop', 'mouse', 'remote',
    'keyboard', 'cell phone', 'microwave', 'oven', 'toaster',
    # 80-89
    'sink', 'refrigerator', '', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

In [13]:
import cv2
import torch
import numpy as np
import imgaug.augmenters as iaa

# Device used by the inference pre-processing.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Network input side length for the selected compound coefficient.
imsize = 512 + 128 * compound_coef

# Augmenter that pads an image to a square before it is resized.
pad_to_square = iaa.PadToSquare(position='right-bottom')

# Per-channel normalisation statistics, shaped (1, 3, 1, 1) to broadcast over a batch.
mean = torch.tensor([0.485, 0.456, 0.406], dtype=torch.float32, device=device).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225], dtype=torch.float32, device=device).view(1, 3, 1, 1)

def preprocess(image_paths, imsize=imsize, mean=mean, std=std, device=device):
    """Load images from disk and turn them into a normalised input batch.

    Args:
        image_paths: iterable of image file paths readable by ``cv2.imread``.
        imsize: side length every padded image is resized to.
        mean: normalisation mean, broadcastable against an (N, 3, H, W) tensor.
        std: normalisation standard deviation, same broadcasting rule as ``mean``.
        device: device the batch is moved to before normalisation.

    Returns:
        A tuple ``(images, scales, samples)``:
            images: the arrays exactly as returned by ``cv2.imread``.
            scales: for each image, its longest spatial side divided by ``imsize``.
            samples: float tensor of shape (N, 3, imsize, imsize), normalised
                with ``mean`` and ``std``.

    Raises:
        FileNotFoundError: if any path cannot be read as an image.
    """
    images = []
    for image_path in image_paths:
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        images.append(image)

    padded_images = [pad_to_square(image=image) for image in images]
    samples = [cv2.cvtColor(padded_image, cv2.COLOR_BGR2RGB) for padded_image in padded_images]
    samples = [cv2.resize(sample, dsize=(imsize, imsize)) for sample in samples]
    samples = [torch.from_numpy(sample) for sample in samples]
    samples = torch.stack(samples, dim=0).to(device)
    # NHWC -> NCHW
    samples = samples.permute(0, 3, 1, 2).contiguous()
    samples = (samples.float().div(255.) - mean) / std

    # Only height and width define the resize factor; leave the channel count out.
    scales = [max(image.shape[:2]) / imsize for image in images]
    return images, scales, samples

In [14]:
# Validation images used for the inference demo.
valid_dir = './dataset/VOC2007/valid'
image_paths = [f'{valid_dir}/{stem}.jpg' for stem in ('000002', '000013', '000030')]

# Keep the original images and per-image scales next to the network-ready batch.
images, scales, samples = preprocess(image_paths=image_paths)

In [15]:
import time
# Time one inference call with gradient tracking disabled.
start = time.time()
with torch.no_grad():
    predictions = model.inference(samples)
elapsed = time.time() - start
print(f'prediction time: {elapsed}s')

prediction time: 0.9432260990142822s


In [16]:
# Show the raw detections: one dict per input image with 'boxes', 'labels' and 'scores'.
predictions

[{'boxes': tensor([[143.4083, 203.5088, 211.8467, 306.7638]]),
  'labels': tensor([6]),
  'scores': tensor([0.7441])},
 {'boxes': tensor([[303.4525, 163.6749, 457.1618, 257.0464]]),
  'labels': tensor([20]),
  'scores': tensor([0.8815])},
 {'boxes': tensor([[300.3644, 137.6334, 460.2697, 296.3701],
          [ 56.8792, 164.4981, 152.5491, 292.8531],
          [ 28.6209, 205.2891, 182.1070, 296.2229]]),
  'labels': tensor([0, 0, 1]),
  'scores': tensor([0.9399, 0.9148, 0.7222])}]

In [17]:
for image_idx, (image, scale, pred) in enumerate(zip(images, scales, predictions)):
    # Draw on a copy so re-running this cell does not stack boxes on the loaded image.
    canvas = image.copy()

    # Integer division yields 0 for sides below 700; keep at least a 1-pixel stroke.
    thickness = max(1, max(image.shape) // 700)
    fontscale = max(image.shape) / 800

    boxes = pred['boxes'].cpu().numpy()
    labels = pred['labels'].cpu().numpy()
    scores = pred['scores'].cpu().numpy()

    # Map boxes from network-input coordinates back to the original image.
    # Multiplying into a new array (instead of assigning into `boxes`) leaves
    # `predictions` untouched, since .numpy() shares memory with the CPU tensor.
    boxes = (boxes * scale).astype(np.int32)

    for box, label, score in zip(boxes, labels, scores):
        # Check the label of *this* box; the previous code read a stale `label`
        # variable left over from the data-loader visualisation cell.
        if label == -1:
            continue

        class_name = classes[label]
        color = (np.random.randint(200, 255),
                 np.random.randint(50, 200),
                 np.random.randint(0, 150))

        # Plain Python ints are accepted as point coordinates by every OpenCV build.
        top_left = tuple(int(v) for v in box[:2])
        bottom_right = tuple(int(v) for v in box[2:])

        cv2.rectangle(
            img=canvas,
            pt1=top_left,
            pt2=bottom_right,
            color=color,
            thickness=thickness
        )

        cv2.putText(
            img=canvas,
            text=f'{class_name}: {score: .4f}',
            org=top_left,
            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
            fontScale=fontscale,
            color=color,
            thickness=thickness,
            lineType=cv2.LINE_AA)

    # Show each image once, after all of its boxes are drawn (previously a window
    # was opened for every single box, and never for images without detections).
    cv2.imshow(f'prediction {image_idx}', canvas)
    cv2.waitKey()
    cv2.destroyAllWindows()