In [None]:
'''imports'''
# setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# stdlib
import os
from collections import OrderedDict
from pprint import pprint

# thirdparty
import numpy as np
import cv2
from matplotlib import pyplot as plt

# detectron2 utils
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog

In [None]:
'''directory setup'''
# Scratch folder, relative to the current working directory. The video is read
# from here and the annotation file is written here further down.
tmp_dir_parts = ('tmp', 'video_annotation')
tmp_dir = os.path.join(*tmp_dir_parts)
# exist_ok keeps this cell safe to re-run.
os.makedirs(tmp_dir, exist_ok=True)
print(tmp_dir)

In [None]:
'''load video'''
video_name = 'highway-4k.mp4'
video_input = os.path.join(tmp_dir, video_name)
video = cv2.VideoCapture(video_input)
# cv2.VideoCapture does not raise when the file is missing or cannot be decoded.
# Fail here with a clear message instead of letting the problem surface later
# as zero-valued properties and an empty frame stream.
if not video.isOpened():
    raise IOError("could not open video file: %s" % video_input)

# Basic stream properties; cv2 reports them as floats, hence the int() casts.
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
print(video_input)
# Report the video by name; formatting the capture object itself only prints its repr.
print("%s, #frames = %d, fps = %.01f, w = %d, h = %d" % (video_name, num_frames, frames_per_second, width, height))

In [None]:
'''frames generator for video processing'''
def _video_frame_generator(cv2_video_capture):
    """Yield frames from a video capture until no more can be read.

    Args:
        cv2_video_capture: object with ``isOpened()`` and ``read()`` methods in
            the style of ``cv2.VideoCapture``; ``read()`` must return a
            ``(success, frame)`` pair.

    Yields:
        Every frame for which ``read()`` reported success, in reading order.
        Iteration ends when the capture is no longer open or a read fails.
    """
    # Poll the capture that was passed in, not the notebook-level `video`
    # global, so the generator works with any capture object.
    while cv2_video_capture.isOpened():
        success, frame = cv2_video_capture.read()
        if not success:
            # A failed read ends the stream.
            break
        yield frame
# frame = next(_video_frame_generator(video))
# print(frame.shape)

In [None]:
'''setup detectron model'''
# The config file and the pretrained checkpoint are both looked up from the
# same model-zoo entry, so name it once.
zoo_entry = "Cityscapes/mask_rcnn_R_50_FPN.yaml"

cfg = get_cfg()
zoo_config_file = model_zoo.get_config_file(zoo_entry)
cfg.merge_from_file(zoo_config_file)
# Optional knob, left disabled:
# cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model

# Weights are resolved through the model zoo; a direct
# https://dl.fbaipublicfiles... checkpoint URL can be assigned instead.
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(zoo_entry)
# Inference runs on the CPU.
cfg.MODEL.DEVICE = 'cpu'
predictor = DefaultPredictor(cfg)

# Show the metadata registered for the first training dataset in the config.
train_dataset_name = cfg.DATASETS.TRAIN[0]
print(MetadataCatalog.get(train_dataset_name))

In [None]:
'''extract next frame'''
# A new generator is built on every run of this cell; it reads from the shared
# `video` capture, and only its first yielded frame is taken.
frame_source = _video_frame_generator(video)
frame = next(frame_source)

# Convert BGR -> RGB for display only; `frame` itself is left untouched.
frame_disp = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
plt.imshow(frame_disp)
plt.show()

In [None]:
'''process frame'''
# Run the predictor on the frame as read from the video; no colour conversion
# is applied to it first.
# NOTE(review): presumably the predictor expects BGR input here — confirm
# against cfg.INPUT.FORMAT.
outputs = predictor(frame)

In [None]:
'''inspect outputs'''
print(outputs.keys())

# All predictions for the frame hang off the 'instances' entry.
instances = outputs['instances']
print(type(instances))

# Fields printed in order: class ids, scores, boxes.
# - instance class ids are mapped to thing_classes!
# - pred_masks is left out on purpose (add 'pred_masks' to the tuple to print it).
for field_name in ('pred_classes', 'scores', 'pred_boxes'):
    print(getattr(instances, field_name))

In [None]:
'''visualize output'''
# The Visualizer is given the frame with its channel axis reversed.
frame_reversed = frame[:, :, ::-1]
dataset_metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
visualizer = Visualizer(frame_reversed, dataset_metadata, scale=1.2)

# Draw the predictions (moved to the CPU first) on top of the frame.
instances_cpu = outputs["instances"].to("cpu")
v = visualizer.draw_instance_predictions(instances_cpu)

# The rendered image has its channels reversed again into `frame_out`, and is
# then converted BGR -> RGB for matplotlib.
rendered = v.get_image()
frame_out = rendered[:, :, ::-1]
frame_viz = cv2.cvtColor(frame_out, cv2.COLOR_BGR2RGB)
plt.imshow(frame_viz)
plt.show()

In [None]:
'''
assemble data structure
{
    frame_<id> : {
        instance_<id> : {
            class_id : <id>,
            class_name : <name>,
            # mask : <mask>,
            bbox : <coords>,
            score : <score>,
        },
        ...
    },
    ...
}
'''

metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
# List used to turn a predicted class id into a readable name; None when the
# dataset metadata does not define 'thing_classes'.
instance_classes = metadata.get('thing_classes', None)
# print(instance_classes)
outputs_cpu = outputs["instances"].to("cpu")
# One (class id, mask, box, score) tuple per detected instance.
outputs_zipped = zip(outputs_cpu.pred_classes, outputs_cpu.pred_masks, outputs_cpu.pred_boxes, outputs_cpu.scores)

# Per-frame mapping: 'instance_<id>' -> ordered record of that instance.
frame_dict = OrderedDict()
for instance_id, (class_id, mask, bbox, score) in enumerate(outputs_zipped):
    # Key spelling follows the schema documented above ('instance_<id>').
    instance_path = 'instance_%02d' % instance_id
    # Convert once to a plain Python int; it is both stored and used as the
    # index into the class-name list.
    class_index = class_id.item()

    instance_entry = OrderedDict()
    instance_entry['class_id'] = class_index
    # Without a class-name list there is nothing to look up; store None
    # instead of failing on the whole frame.
    instance_entry['class_name'] = instance_classes[class_index] if instance_classes is not None else None
    # Masks are skipped on purpose; uncomment to store them as well.
    # instance_entry['mask'] = mask.numpy()
    instance_entry['bbox'] = bbox.numpy()
    instance_entry['score'] = score.item()
    frame_dict[instance_path] = instance_entry

pprint(frame_dict)

In [None]:
'''save frame annotation as npz'''
# The archive is written into tmp_dir; the file name is hard-coded to
# 'frame_0000.npz'.
frame_label_file = os.path.join(tmp_dir, 'frame_0000.npz')
# Each key of frame_dict becomes one named entry in the archive.
# NOTE(review): the values are OrderedDicts, which numpy presumably stores as
# pickled object arrays, so reading them back would need
# np.load(..., allow_pickle=True) — confirm.
np.savez(frame_label_file, **frame_dict)