# Convert CVAT annotations to YOLO format

In [29]:
import xmltodict
import json
from collections import namedtuple
from tqdm import tqdm
from tqdm.notebook import tqdm as ntqdm
from os.path import join, isdir, isfile
from os import makedirs
from shutil import copy2
import cv2
from utils import Segment, Bbox, ImageAnnot
from pathlib import Path
from random import shuffle

## Utility functions

In [34]:
def _json_object_hook(d):
    """Turn one decoded JSON object into a namedtuple called ``X``.

    The dict keys become the field names, so every key must be a valid
    Python identifier; ``namedtuple`` raises ``ValueError`` otherwise.
    """
    return namedtuple('X', d.keys())(*d.values())


def json2obj(data):
    """Load the JSON file at path ``data`` as nested namedtuples.

    Every JSON object is converted through ``_json_object_hook``; JSON arrays
    stay plain lists and scalars stay as decoded.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(data) as json_file:
        return json.load(json_file, object_hook=_json_object_hook)

def xml2object(xml: str, output_path: str):
    """Convert an XML file to JSON on disk and load it as nested namedtuples.

    Args:
        xml: Path of the XML file to read.
        output_path: Path of the JSON file to write (overwritten if present).

    Returns:
        The object produced by ``json2obj(output_path)``.
    """
    with open(xml) as xml_file:
        data_dict = xmltodict.parse(xml_file.read())

    # Every "@" and "#" is removed from the serialized JSON so that keys can be
    # used as namedtuple field names by json2obj.
    # NOTE(review): this strips the characters from values as well as keys
    # (e.g. a label or attribute text containing "#") -- confirm acceptable.
    json_data = json.dumps(data_dict).replace("@", "").replace("#", "")

    # Write the cleaned JSON once; the earlier write/read-back/rewrite cycle
    # produced the same file but left an unclosed read handle behind.
    with open(output_path, "w") as json_file:
        json_file.write(json_data)
    return json2obj(output_path)


def get_labels(data: namedtuple):
    """Return the ``name`` of each entry found under the task's labels.

    Reads ``data.annotations.meta.task.labels[0]`` and iterates over it.
    """
    # presumably labels[0] is the list of label entries -- verify against the
    # converted annotation structure.
    label_entries = data.annotations.meta.task.labels[0]
    names = []
    for entry in label_entries:
        names.append(entry.name)
    return names

def get_images_with_annots(data: namedtuple):
    """Return the image entries that have a ``polygon`` or a ``box`` field.

    Entries are taken from ``data.annotations.image`` and kept in their
    original order.
    """
    annotated = []
    for image in data.annotations.image:
        fields = image._asdict()
        if 'polygon' in fields or 'box' in fields:
            annotated.append(image)
    return annotated

def filter_some_annots(imgs_list: list):
    """Drop images whose single box is one of the excluded label/attribute pairs.

    An image is removed when ``img.box`` has label ``'set'`` with attribute
    text ``'fore-arms'``, or label ``'receive'`` with attribute text
    ``'setting-dig'``. Every other image is kept, including any image where
    ``img.box.label`` / ``img.box.attribute.text`` cannot be read (for
    example when ``box`` is missing or is a list of boxes).

    Args:
        imgs_list: Image annotation objects to filter.

    Returns:
        A new list with the remaining images in their original order.
    """
    excluded = (('set', 'fore-arms'), ('receive', 'setting-dig'))
    kept = []
    for img in ntqdm(imgs_list):
        try:
            pair = (img.box.label, img.box.attribute.text)
        except AttributeError:
            # Only a missing attribute means "not a single box with an
            # attribute"; a bare except here also hid KeyboardInterrupt and
            # genuine programming errors.
            kept.append(img)
            continue
        if pair not in excluded:
            kept.append(img)
    return kept

def get_video_name(data: namedtuple):
    """Return the task name stored at ``data.annotations.meta.task.name``."""
    task = data.annotations.meta.task
    return task.name

## Prepare input/output files and paths

In [34]:
# Locate the annotation file, convert it, and open the video named by the task.
base_path = '../input/videos/train/'
input_xml = Path(base_path) / "annotations/6_classes/3.xml"
# Check the annotation file before parsing it: checking afterwards can never
# fire usefully because opening a missing file has already raised by then.
assert input_xml.is_file(), f"annotation file not found: {input_xml}"
data = xml2object(input_xml, 'data.json')

labels = get_labels(data)
img_annots = get_images_with_annots(data)
img_annots = filter_some_annots(img_annots)
# The task name is used as the video file name relative to base_path.
video_file = get_video_name(data)
video_path = Path(base_path) / video_file
assert video_path.is_file(), f"video file not found: {video_path}"

cap = cv2.VideoCapture(video_path.as_posix())
assert cap.isOpened(), "video file not opened...."

  0%|          | 0/647 [00:00<?, ?it/s]

## Loop over annotations and create images/labels

In [35]:
# Map each label name to its position in `labels`; that position is used as
# the class index when annotations are converted below.
labels2ix = dict(zip(labels, range(len(labels))))
labels2ix

{'ball': 0, 'block': 1, 'receive': 2, 'set': 3, 'spike': 4, 'serve': 5}

In [36]:
# Extracted frames, split into train/val.
train_img_dir = "data/images/train"
val_img_dir = "data/images/val"

# Bounding-box label files, split into train/val.
train_label_dir = "data/detection/labels/train"
val_label_dir = "data/detection/labels/val"

# Polygon label files, split into train/val.
seg_train_label_dir = "data/segment/labels/train"
seg_val_label_dir = "data/segment/labels/val"


# Create every output folder up front; already-existing folders are fine.
output_dirs = (
    train_img_dir,
    val_img_dir,
    train_label_dir,
    val_label_dir,
    seg_train_label_dir,
    seg_val_label_dir,
)
for out_dir in output_dirs:
    makedirs(out_dir, exist_ok=True)


# Shuffle in place (unseeded, so the order differs between runs) and compute
# the size of the 90% training portion.
shuffle(img_annots)
train_size = int(0.9 * len(img_annots))

In [37]:
# For every annotated image: grab the matching video frame, convert its
# polygons and boxes to YOLO text, and write the frame plus one detection
# label file and one segmentation label file into the train or val folders.
for idx, img_annot in enumerate(ntqdm(img_annots)):
    img_annot = img_annot._asdict()

    # Seek to the annotated frame and decode it.
    # presumably the image `id` is the frame index in the video -- verify.
    frame_id = int(img_annot['id'])
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id)
    ok, frame = cap.read()
    if not ok or frame is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(f"could not read frame {frame_id} from {video_path}")
    img_h, img_w = frame.shape[:2]

    # Polygons: a single annotation arrives as one object, several as a list.
    polygons = img_annot.get('polygon', [])
    if not isinstance(polygons, list):
        polygons = [polygons]

    segment_lines = []
    ball_bboxes = []
    for p in polygons:
        # Points are "x,y" pairs separated by ";" -- flatten to a list of ints.
        points = [int(float(item)) for item in p.points.replace(';', ' ').replace(',', ' ').split(' ')]
        ball_segment = Segment(points, labels2ix[p.label])
        ball_bbox = ball_segment.get_bbox()
        segment_lines.append(ball_segment.segment_to_yolo(img_w, img_h))
        ball_bboxes.append(ball_bbox)
    # One line per polygon, no trailing newline.
    txt_segment = '\n'.join(segment_lines)

    # Boxes: same single-object-or-list normalization as for polygons.
    raw_boxes = img_annot.get('box', [])
    if not isinstance(raw_boxes, list):
        raw_boxes = [raw_boxes]

    boxes = [
        Bbox((int(float(box.xtl)), int(float(box.ytl)), int(float(box.xbr)), int(float(box.ybr))), labels2ix[box.label])
        for box in raw_boxes
    ]
    # Boxes derived from the polygons are written to the detection file too.
    boxes.extend(ball_bboxes)

    # Sort in place by descending label; the previous sorted(...) call threw
    # its result away, so the boxes were never actually reordered.
    boxes.sort(key=lambda x: x.label, reverse=True)

    # One line per box, no trailing newline.
    txt_boxes = '\n'.join(box.to_yolo(img_w, img_h) for box in boxes)

    # The first `train_size` (shuffled) items go to train, the rest to val.
    file_stem = f"{video_path.stem}-frame_{frame_id}"

    if idx < train_size:
        img_path = f'{train_img_dir}/{file_stem}.png'
        seg_path = f'{seg_train_label_dir}/{file_stem}.txt'
        det_path = f'{train_label_dir}/{file_stem}.txt'
    else:
        img_path = f'{val_img_dir}/{file_stem}.png'
        seg_path = f'{seg_val_label_dir}/{file_stem}.txt'
        det_path = f'{val_label_dir}/{file_stem}.txt'

    cv2.imwrite(img_path, frame)
    with open(det_path, 'w') as det_file:
        det_file.write(txt_boxes)

    with open(seg_path, 'w') as seg_file:
        seg_file.write(txt_segment)

  0%|          | 0/642 [00:00<?, ?it/s]