In [None]:
# default_exp json_tree

In [None]:
# export

import logging
from dataclasses import dataclass, Field
from typing import *
from pathlib import Path
import json
import shutil

from cococleaner.utils import sort_dict
from cococleaner.coco import *

# Configure the root logger to emit DEBUG and above, and use the root logger
# (getLogger() without a name) for every message in this notebook.
# NOTE(review): this cell is marked `# export`, so presumably this also runs when
# the exported module is imported and changes the importing application's
# logging configuration -- confirm whether a named logger would be preferable.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()

In [None]:
# export

def load_json_tree(tree_dir: Union[str, Path], *, kind: str = "object_detection") -> CocoDataset:
    """Load a dataset from a "json tree" directory.

    The following layout is read from ``tree_dir``:

    * for every name returned by ``dataset_class.get_collective_elements()``,
      a sub-directory ``<name>/`` whose ``*.json`` files each hold one element;
    * for every name returned by ``dataset_class.get_non_collective_elements()``,
      a single file ``<name>.json``.

    A missing sub-directory is loaded as an empty list and a missing file as an
    empty dict; both cases are only reported at DEBUG level.

    Args:
        tree_dir: Root directory of the json tree.
        kind: Passed to ``get_dataset_class`` to select the dataset class.

    Returns:
        The result of ``dataset_class.from_dict`` applied to the assembled dict.

    Raises:
        ValueError: If ``tree_dir`` is not an existing directory.
    """
    dataset_class = get_dataset_class(kind)

    tree_dir = Path(tree_dir)
    logger.info(f"Loading json tree from dir: {tree_dir}")
    if not tree_dir.is_dir():
        raise ValueError(f"Source json tree dir not found: {tree_dir}")

    D = {}
    for el_name in dataset_class.get_collective_elements():
        el_dir = tree_dir / el_name
        if el_dir.is_dir():
            # glob() yields entries in arbitrary, filesystem-dependent order;
            # sorting makes the order of loaded elements reproducible.
            el_list = [
                json.loads(f.read_text(encoding='utf-8'))
                for f in sorted(el_dir.glob('*.json'))
            ]
        else:
            logger.debug(f'Chunks dir not found: {el_dir}')
            el_list = []
        logger.debug(f'Loaded {len(el_list)} json chunks from {el_dir}')
        D[el_name] = el_list

    for el_name in dataset_class.get_non_collective_elements():
        el_file = tree_dir / f'{el_name}.json'
        if el_file.is_file():
            el = json.loads(el_file.read_text(encoding='utf-8'))
            # Report the file that was actually read (not leftovers of the loop above).
            logger.debug(f'Loaded single-file json chunk {el_file}')
        else:
            logger.debug(f'Chunks file not found: {el_file}')
            el = {}
        D[el_name] = el

    return dataset_class.from_dict(D)

In [None]:
# hide
# Smoke test: load the example json tree, show one sample of every element
# kind and check that each sample was parsed into the expected class.
PATH = '../examples/coco_chunk/json_tree/'
d = load_json_tree(PATH)

for shown in (d.info, d.annotations[0], d.images[0], d.categories[0]):
    display(shown)

for value, expected_type in (
    (d.info, CocoInfo),
    (d.annotations[0], CocoObjectDetectionAnnotation),
    (d.images[0], CocoImage),
    (d.categories[0], CocoObjectDetectionCategory),
):
    assert isinstance(value, expected_type)

INFO:root:Loading json tree from dir: ../examples/coco_chunk/json_tree
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/images
DEBUG:root:Loaded 8 json chunks from ../examples/coco_chunk/json_tree/licenses
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/annotations
DEBUG:root:Loaded 3 json chunks from ../examples/coco_chunk/json_tree/categories


CocoInfo(year=2017, version='1.0', description='COCO 2017 Dataset', contributor='COCO Consortium', url='http://cocodataset.org', date_created='2017/09/01')

CocoObjectDetectionAnnotation(id=124710, image_id=140006, category_id=2, bbox=(496.52, 125.94, 143.48, 113.54), supercategory=None, area=9387.706200000002, iscrowd=0)

CocoImage(id=117891, coco_url='http://images.cocodataset.org/train2017/000000117891.jpg', width=640, height=427, license=4, file_name='000000117891.jpg', flickr_url='http://farm1.staticflickr.com/121/256249222_f4c8375c29_z.jpg', date_captured='2013-11-15 14:46:14')

CocoObjectDetectionCategory(id=2, name='bicycle', supercategory='vehicle')

In [None]:
# export

def dump_json_tree(
    coco: CocoDataset,
    target_dir: Union[str, Path],
    *,
    kind: str = 'object_detection',
    skip_nulls: bool = False,
    overwrite: bool = False,
    indent: Optional[int] = 4,
) -> None:
    """Write a dataset to disk as a "json tree" directory.

    The following layout is created under ``target_dir``:

    * for every non-empty collective element kind, a sub-directory
      ``<kind>/`` with one ``<id>.json`` file per element (named after the
      element's ``"id"`` key);
    * for every non-collective element kind, a single file ``<kind>.json``.

    Every element is passed through ``sort_dict`` before being serialised.

    Args:
        coco: Dataset to dump.
        target_dir: Directory to create; missing parents are created as well.
        kind: Passed to ``get_dataset_class`` to select the dataset class.
        skip_nulls: Serialise with ``dataset_class.to_dict_skip_nulls`` instead
            of ``dataset_class.to_dict``.
        overwrite: If ``target_dir`` already exists, delete it completely and
            write it anew instead of raising.
        indent: ``indent`` argument forwarded to ``json.dumps``.

    Raises:
        ValueError: If ``target_dir`` already exists and ``overwrite`` is false.
    """
    dataset_class = get_dataset_class(kind)
    to_dict_function = dataset_class.to_dict_skip_nulls if skip_nulls else dataset_class.to_dict

    target_dir = Path(target_dir)
    # Serialise before touching the filesystem, so a failing conversion cannot
    # leave us with an already deleted target directory.
    raw = to_dict_function(coco)
    logger.info(f"Dumping json tree to dir: {target_dir}")

    if target_dir.is_dir():
        if not overwrite:
            raise ValueError(f"Destination json tree dir already exists: {target_dir}")
        logger.warning(f'Destination dir exists and will be overwritten: {target_dir}')
        logger.info(f'Deleting old target tree directory {target_dir}')
        shutil.rmtree(str(target_dir))

    target_dir.mkdir(parents=True)

    for el_kind in dataset_class.get_collective_elements():
        el_dir = target_dir / el_kind
        elements = raw.get(el_kind)
        if not elements:
            logger.debug(f'Skipping empty category {el_dir}')
            continue
        el_dir.mkdir()
        for el in elements:
            el_file = el_dir / f'{el["id"]}.json'
            el_file.write_text(json.dumps(sort_dict(el), indent=indent), encoding='utf-8')
        logger.debug(f'Written {len(elements)} elements to {el_dir}')

    for el_kind in dataset_class.get_non_collective_elements():
        # A non-collective element is a single file; no `<kind>/` directory is
        # created for it (load_json_tree only reads `<kind>.json`).
        el_file = target_dir / f'{el_kind}.json'
        if el_kind not in raw:
            # Presumably only possible when the serialiser dropped the key
            # (e.g. with skip_nulls); load_json_tree tolerates the missing file.
            logger.debug(f'Skipping missing element {el_file}')
            continue
        el_file.write_text(json.dumps(sort_dict(raw[el_kind]), indent=indent), encoding='utf-8')
        logger.debug(f'Written single element to {el_file}')

In [None]:
# hide
import tempfile
from cococleaner.json_file import load_json_file

SRC = '../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json'
# tempfile.mktemp() is deprecated and race-prone (it only invents a name).
# mkdtemp() really creates a private directory; the tree is dumped into a
# not-yet-existing child of it, because dump_json_tree refuses to write into
# an existing directory unless overwrite=True.
DST = str(Path(tempfile.mkdtemp()) / 'json_tree')
d = load_json_file(SRC)

dump_json_tree(d, DST)

! ls {DST}

# IMG holds the path of the first dumped annotation file (reused by later cells).
IMG = ! echo {DST}/annotations/$(ls {DST}/annotations | head -n1)
IMG = IMG[0]
!cat {IMG} | jq

INFO:root:Loading json file from file: ../examples/coco-dataset/json.file/instances_train2017_chunk3x2.json
INFO:root:Loaded: images=6, annotations=6, categories=3
INFO:root:Dumping json tree to dir: /tmp/tmp1f6r2u0y
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/images
DEBUG:root:Written 8 elements to /tmp/tmp1f6r2u0y/licenses
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/annotations
DEBUG:root:Written 3 elements to /tmp/tmp1f6r2u0y/categories
DEBUG:root:Written single element to /tmp/tmp1f6r2u0y/info


annotations  categories  images  info  info.json  licenses
[1;39m{
  [0m[34;1m"area"[0m[1;39m: [0m[0;39m9387.706200000002[0m[1;39m,
  [0m[34;1m"bbox"[0m[1;39m: [0m[1;39m[
    [0;39m496.52[0m[1;39m,
    [0;39m125.94[0m[1;39m,
    [0;39m143.48[0m[1;39m,
    [0;39m113.54[0m[1;39m
  [1;39m][0m[1;39m,
  [0m[34;1m"category_id"[0m[1;39m: [0m[0;39m2[0m[1;39m,
  [0m[34;1m"id"[0m[1;39m: [0m[0;39m124710[0m[1;39m,
  [0m[34;1m"image_id"[0m[1;39m: [0m[0;39m140006[0m[1;39m,
  [0m[34;1m"iscrowd"[0m[1;39m: [0m[0;39m0[0m[1;39m,
  [0m[34;1m"supercategory"[0m[1;39m: [0m[1;30mnull[0m[1;39m
[1;39m}[0m


In [None]:
# hide
# Delete one dumped annotation file, then dump again with overwrite=True;
# the final `cat` shows that the deleted file has been written again.
! rm {IMG}
dump_json_tree(d, DST, overwrite=True)
!cat {IMG} | jq

INFO:root:Dumping json tree to dir: /tmp/tmp1f6r2u0y
INFO:root:Deleting old target tree directory /tmp/tmp1f6r2u0y
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/images
DEBUG:root:Written 8 elements to /tmp/tmp1f6r2u0y/licenses
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/annotations
DEBUG:root:Written 3 elements to /tmp/tmp1f6r2u0y/categories
DEBUG:root:Written single element to /tmp/tmp1f6r2u0y/info


[1;39m{
  [0m[34;1m"area"[0m[1;39m: [0m[0;39m9387.706200000002[0m[1;39m,
  [0m[34;1m"bbox"[0m[1;39m: [0m[1;39m[
    [0;39m496.52[0m[1;39m,
    [0;39m125.94[0m[1;39m,
    [0;39m143.48[0m[1;39m,
    [0;39m113.54[0m[1;39m
  [1;39m][0m[1;39m,
  [0m[34;1m"category_id"[0m[1;39m: [0m[0;39m2[0m[1;39m,
  [0m[34;1m"id"[0m[1;39m: [0m[0;39m124710[0m[1;39m,
  [0m[34;1m"image_id"[0m[1;39m: [0m[0;39m140006[0m[1;39m,
  [0m[34;1m"iscrowd"[0m[1;39m: [0m[0;39m0[0m[1;39m,
  [0m[34;1m"supercategory"[0m[1;39m: [0m[1;30mnull[0m[1;39m
[1;39m}[0m


In [None]:
# hide

# Test that overwrite=True removes stray files that are not part of the tree:
# plant a dummy file inside the dumped tree, dump again, and check that the
# dummy file no longer exists afterwards.
! echo 123 > {DST}/images/dummy.json
dump_json_tree(d, DST, overwrite=True)
! [ ! -f {DST}/images/dummy.json ] && echo "{DST}/images/dummy.json not exists :)"

INFO:root:Dumping json tree to dir: /tmp/tmp1f6r2u0y
INFO:root:Deleting old target tree directory /tmp/tmp1f6r2u0y
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/images
DEBUG:root:Written 8 elements to /tmp/tmp1f6r2u0y/licenses
DEBUG:root:Written 6 elements to /tmp/tmp1f6r2u0y/annotations
DEBUG:root:Written 3 elements to /tmp/tmp1f6r2u0y/categories
DEBUG:root:Written single element to /tmp/tmp1f6r2u0y/info


/tmp/tmp1f6r2u0y/images/dummy.json not exists :)
