In [28]:
import glob
import shutil
import json
from pathlib import Path

from collections import Counter

import pandas as pd
import xml.etree.ElementTree as ET

# Labelmg to COCO format

Labelmg by default creates files in the **Pascal VOC** format. Most of the latest pipelines are
expecting the labels in COCO format.

1. Pascal VOC format -> coordinates are represented as `(left_top, right_bottom)`
2. Labelmg tool produces Pascal voc format.
3. COCO expects all the file names should be in number format
4. COCO files

In [1]:
train_dir = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/train/"
test_dir = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/test/"
template_test = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/templates_test_data_coco/"

In [214]:
tree = ET.parse(f"{template_test}/1.xml")
root = tree.getroot()
fn_child = root.find("filename")

In [216]:
fn_child.text = "1.jpg"

In [229]:
# print(ET.tostring(root).decode('utf8'))

In [240]:

def renamefn_to_intfn(data_dir, start=1000):
    """
    @param data_dir: Pascal VOC format generated by labelmg.
    @param start: File name start point.
    """
    get_fn = lambda x: ".".join(x.split(".")[:-1])

    pp = Path(data_dir)
    for fn in glob.glob(f"{data_dir}/*.xml"):
        p = Path(fn)
        root = ET.parse(fn).getroot()
        fn_child = root.find("filename")
        path_child = root.find("path")
        img_fn = fn_child.text
        
        if not get_fn(p.name).isdigit():
            bname = ".".join(p.name.split(".")[:-1])
            png = Path(pp / f"{img_fn}")
            assert png.exists()
            
            imgfn_split = img_fn.split(".")
            img, img_ext = ".".join(imgfn_split[:-1]), imgfn_split[-1]
            
            p.rename(pp / f"{start}.xml")
            png.rename(pp / f"{start}.{img_ext}")
            
            # Update the filename reference in new xml 
            fn_child.text = f"{start}.{img_ext}"
            path_child.text = f"{pp/str(start)}.{img_ext}"
            
            with open(pp/f"{start}.xml", 'w') as f:
                f.write(ET.tostring(root).decode("utf8"))
            
            start += 1

In [241]:
renamefn_to_intfn(template_test)

## Coco Category Check

Ensure the Dataset has correct labels and category ID mapping across train/val/test datasets.

In [37]:
train_ann_file = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/train_coco_updated.json"
val_ann_file = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/test_coco_updated.json"
test_ann_file = "/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/templates_test_data_coco_updated.json"

def check_category_id(ann_file):
    ann = json.loads(open(ann_file).read())
    cat_map = {i["id"] : i["name"] for i in ann["categories"]}
    print({(k, cat_map[k]): v for k, v in Counter([i["category_id"] for i in ann["annotations"]]).items()})
    return ann["categories"]

check_category_id(train_ann_file)

{(3, 'checkbox'): 54, (1, 'textbox'): 937, (4, 'actionset'): 67, (5, 'image'): 258, (2, 'radiobutton'): 108, (6, 'rating'): 3}


[{'supercategory': 'none', 'id': 1, 'name': 'textbox'},
 {'supercategory': 'none', 'id': 2, 'name': 'radiobutton'},
 {'supercategory': 'none', 'id': 3, 'name': 'checkbox'},
 {'supercategory': 'none', 'id': 4, 'name': 'actionset'},
 {'supercategory': 'none', 'id': 5, 'name': 'image'},
 {'supercategory': 'none', 'id': 6, 'name': 'rating'}]

In [38]:
check_category_id(val_ann_file)

{(1, 'textbox'): 92, (2, 'radiobutton'): 11, (5, 'image'): 20, (4, 'actionset'): 1}


[{'supercategory': 'none', 'id': 1, 'name': 'textbox'},
 {'supercategory': 'none', 'id': 2, 'name': 'radiobutton'},
 {'supercategory': 'none', 'id': 3, 'name': 'checkbox'},
 {'supercategory': 'none', 'id': 4, 'name': 'actionset'},
 {'supercategory': 'none', 'id': 5, 'name': 'image'},
 {'supercategory': 'none', 'id': 6, 'name': 'rating'}]

In [39]:
check_category_id(test_ann_file)

{(5, 'image'): 305, (1, 'textbox'): 709, (4, 'actionset'): 31, (2, 'radiobutton'): 8}


[{'supercategory': 'none', 'id': 1, 'name': 'textbox'},
 {'supercategory': 'none', 'id': 2, 'name': 'radiobutton'},
 {'supercategory': 'none', 'id': 3, 'name': 'checkbox'},
 {'supercategory': 'none', 'id': 4, 'name': 'actionset'},
 {'supercategory': 'none', 'id': 5, 'name': 'image'},
 {'supercategory': 'none', 'id': 6, 'name': 'rating'}]

In [24]:
ann.keys()

dict_keys(['images', 'type', 'annotations', 'categories'])

In [29]:
Counter([i["category_id"] for i in ann["annotations"]])

Counter({3: 54, 1: 937, 4: 67, 5: 366, 6: 3})

# Label statistics

In [78]:
train_df = pd.read_csv("/home/haridas/projects/mystique/data/train_and_test-2020-Jun-05-coco/train_label.csv")

In [82]:
train_df.groupby("filename").count().describe()

Unnamed: 0,width,height,class,xmin,ymin,xmax,ymax
count,105.0,105.0,105.0,105.0,105.0,105.0,105.0
mean,13.590476,13.590476,13.590476,13.590476,13.590476,13.590476,13.590476
std,8.50757,8.50757,8.50757,8.50757,8.50757,8.50757,8.50757
min,2.0,2.0,2.0,2.0,2.0,2.0,2.0
25%,7.0,7.0,7.0,7.0,7.0,7.0,7.0
50%,12.0,12.0,12.0,12.0,12.0,12.0,12.0
75%,18.0,18.0,18.0,18.0,18.0,18.0,18.0
max,55.0,55.0,55.0,55.0,55.0,55.0,55.0
