# CLI options

In [None]:
# default_exp cli

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export

import argparse
import logging
from pathlib import Path
import random

from cocorepr.utils import log_elapsed_time
from cocorepr.coco import merge_datasets, cut_annotations_per_category, remove_invalid_elements
from cocorepr.json_file import *
from cocorepr.json_tree import *
from cocorepr.crop_tree import *

In [None]:
# export
# Configure root logging once for the CLI: INFO threshold, records rendered as
# "<LEVEL>: <message>".
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# No name is passed, so `logger` is the root logger itself.
logger = logging.getLogger()

In [None]:
# export

def get_parser():
    """Build the argument parser of the command-line tool.

    The options are declared as an ordered table of ``(flag, add_argument kwargs)``
    pairs and registered in that order, so the generated ``--help`` output lists
    them in the same sequence as the table.

    Returns:
        argparse.ArgumentParser: parser exposing the input (`--in_*`), output
        (`--out_*`) and processing options of the tool.
    """
    # Spellings of `--indent` that mean "no indentation" (i.e. `None`).
    none_spellings = ('none', 'null', '~')

    option_specs = [
        ("--in_json_file", dict(
            type=Path, nargs="*", default=[],
            help=(
                "Path to one or multiple json files storing COCO dataset "
                "in `json_file` representation (all json-based datasets will be merged)."
            ),
        )),
        ("--in_json_tree", dict(
            type=Path, nargs="*", default=[],
            help=(
                "Path to one or multiple directories storing COCO dataset "
                "in `json_tree` representation (all json-based datasets will be merged)."
            ),
        )),
        ("--in_crop_tree", dict(
            type=Path, nargs="*", default=[],
            help=(
                "Path to one or multiple directories storing COCO dataset "
                "in `crop_tree` representation (all crop-based datasets will be merged and will "
                "overwrite the json-based datasets)."
            ),
        )),
        ("--out_path", dict(
            type=Path,
            help="Path to the output dataset (file or directory: depends on `--out_format`)",
        )),
        ("--out_format", dict(
            choices=['json_file', 'json_tree', 'crop_tree'],
        )),
        ("--seed", dict(
            type=int, default=42,
            help="Random seed.",
        )),
        ("--max_crops_per_class", dict(
            type=int, default=None,
            help=(
                "If set, the tool will randomly select up to this number of "
                "crops (annotations) per each class (category) and drop the others."
            ),
        )),
        ("--drop_invalid_elements", dict(
            action='store_true',
            help="If set, drops broken elements (for example, negative IDs or broken bboxes).",
        )),
        ("--dump_crop_tree_num_processes", dict(
            type=int, default=1,
        )),
        ("--overwrite", dict(
            action='store_true',
            help="If set, will delete the output file/directory before dumping the result dataset.",
        )),
        ("--indent", dict(
            default=4,
            # Kept as a lambda: argparse puts the callable's name into its error messages.
            type=lambda x: None if str(x).lower() in none_spellings else int(x),
            help="Indentation in the output json files.",
        )),
        ("--debug", dict(
            action='store_true',
        )),
    ]

    cli = argparse.ArgumentParser(
        description="Tool for converting datasets in COCO format between different representations",
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli

In [None]:
# export

@log_elapsed_time(lambda t: logger.info(f'Total elapsed: {t.elapsed}'))
def main(args=None):
    """Entry point of the command-line tool.

    Steps, in order:

    1. Load every `--in_json_tree` and `--in_json_file` dataset and merge them
       into one base dataset.
    2. If `--in_crop_tree` directories are given, load each of them against the
       base dataset, merge them, and use the result instead of the base dataset.
    3. Optionally drop invalid elements and cut the annotations per category.
    4. If an output was requested, dump the dataset in `--out_format` to `--out_path`.

    Args:
        args: Parsed options exposing the attributes defined by `get_parser()`.
            When omitted, the options are parsed from the command line.

    Raises:
        ValueError: If only one of `--out_path` / `--out_format` is given, if no
            json-based input dataset was specified, or if `out_format` is not
            one of the supported representations.
    """
    args = args or get_parser().parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    logger.info(f'Arguments: {args}')

    in_json_tree_list = args.in_json_tree
    in_json_file_list = args.in_json_file
    in_crop_tree_list = args.in_crop_tree

    seed = args.seed
    max_crops_per_class = args.max_crops_per_class
    drop_invalid_elements = args.drop_invalid_elements

    out_path = args.out_path
    out_format = args.out_format
    dump_crop_tree_num_processes = args.dump_crop_tree_num_processes
    overwrite = args.overwrite
    indent = args.indent

    # The two output options only make sense together.
    if bool(out_path) != bool(out_format):
        raise ValueError('Option --out_format requires --out_path and vice versa')

    random.seed(seed)

    # Base dataset: all json-based inputs merged together (trees first, then files).
    coco = None
    coco_count = 0
    for in_json_tree in in_json_tree_list:
        coco = merge_datasets(coco, load_json_tree(in_json_tree))
        coco_count += 1
    for in_json_file in in_json_file_list:
        coco = merge_datasets(coco, load_json_file(in_json_file))
        coco_count += 1

    if coco is None:
        raise ValueError('Not found base dataset, please specify either of: '
                         '--in_json_tree / --in_json_file (multiple arguments allowed)')
    if coco_count > 1:
        logger.info(f'Total loaded json dataset: {coco.to_full_str()}')

    # Crop trees are loaded against the base dataset; when present, their merge
    # replaces the base dataset entirely.
    coco_crop = None
    coco_crop_count = 0
    for in_crop_tree in in_crop_tree_list:
        coco_crop = merge_datasets(coco_crop, load_crop_tree(in_crop_tree, coco))
        coco_crop_count += 1
    if coco_crop is not None:
        if coco_crop_count > 1:
            logger.info(f'Total loaded crop-tree dataset: {coco_crop.to_full_str()}')
        logger.info('Using coco_crop dataset only.')
        coco = coco_crop

    if drop_invalid_elements:
        coco = remove_invalid_elements(coco)
        logger.info(f'After removing invalid elements: {coco.to_full_str()}')

    # Truthiness check: both `None` (unset) and 0 disable the cut-off.
    if max_crops_per_class:
        logger.info(f'Cutting off crops up to {max_crops_per_class} per class, random seed={seed}')
        coco = cut_annotations_per_category(coco, max_crops_per_class)
        logger.info(f'After cutting off: {coco.to_full_str()}')

    logger.info(f'[.] Result dataset: {coco.to_full_str()}')

    if out_format is None:
        # Nothing to dump: do not claim that "None" was dumped to "None".
        logger.info('[+] Success: no output requested '
                    '(specify --out_path and --out_format to dump the dataset)')
        return

    dump_kwargs = dict(skip_nulls=True, overwrite=overwrite, indent=indent)
    if out_format == 'json_file':
        dump_fun = dump_json_file
    elif out_format == 'json_tree':
        dump_fun = dump_json_tree
    elif out_format == 'crop_tree':
        dump_fun = dump_crop_tree
        # Only the crop-tree dumper receives the process count.
        dump_kwargs['num_processes'] = dump_crop_tree_num_processes
    else:
        raise ValueError(out_format)
    dump_fun(coco, out_path, **dump_kwargs)

    # For directory outputs, list the top-level entries that were produced.
    details = ''
    if out_path.is_dir():
        details = f': {[p.name for p in out_path.iterdir()]}'

    logger.info(f'[+] Success: {out_format} dumped to {out_path}' + details)

In [None]:
# To test below, you need to do:
# $ cd <project root>
# $ make build
# $ pip install -e .
# reload current notebook kernel

In [None]:
# json_file -> json_tree

! rm -rf /tmp/cococo/json_tree
! cocorepr \
    --in_json_file ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json \
    --out_path /tmp/cococo/json_tree \
    --out_format json_tree \
    --overwrite

INFO: Arguments: Namespace(debug=False, in_crop_tree=[], in_json_file=[PosixPath('../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json')], in_json_tree=[], indent=4, max_crops_per_class=None, out_format='json_tree', out_path=PosixPath('/tmp/cococo/json_tree'), overwrite=True, seed=42)
INFO: Loading json_file from: ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json
INFO:   json file loaded: elapsed 0:00:00.000281
INFO:   dataset constructed: elapsed 0:00:00.003357
INFO: Loaded json_file: elapsed 0:00:00.003739: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO: Dumping json_tree to dir: /tmp/cococo/json_tree
INFO: Dataset written to /tmp/cococo/json_tree: elapsed 0:00:00.001547
INFO: [+] Success: json_tree dumped to /tmp/cococo/json_tree: ['info.json', 'info', 'categories', 'annotations', 'licenses', 'images']


In [None]:
# json_file -> json_tree (same but with --max_crops_per_class)

! rm -rf /tmp/cococo/json_tree
! cocorepr \
    --in_json_file ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json \
    --out_path /tmp/cococo/json_tree \
    --out_format json_tree \
    --overwrite \
    --max_crops_per_class=1

INFO: Arguments: Namespace(debug=False, in_crop_tree=[], in_json_file=[PosixPath('../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json')], in_json_tree=[], indent=4, max_crops_per_class=1, out_format='json_tree', out_path=PosixPath('/tmp/cococo/json_tree'), overwrite=True, seed=42)
INFO: Loading json_file from: ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json
INFO: Loaded from json_file: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO: Cutting off crops up to 1 per class, random seed=42
INFO: After cutting off: CocoObjectDetectionDataset(images=3, licenses=8, annotations=3, categories=3)
INFO: Dumping json_tree to dir: /tmp/cococo/json_tree
INFO: [+] Success: json_tree dumped to /tmp/cococo/json_tree: ['info.json', 'info', 'categories', 'annotations', 'licenses', 'images']


In [None]:
# json_tree -> json_file

! rm -rf /tmp/cococo/json_file
! cocorepr \
    --in_json_tree ../examples/coco_chunk/json_tree \
    --out_path /tmp/cococo/json_file/annotations.json \
    --out_format json_file \
    --indent=None

INFO: Arguments: Namespace(debug=False, in_crop_tree=[], in_json_file=[], in_json_tree=[PosixPath('../examples/coco_chunk/json_tree')], indent=None, max_crops_per_class=None, out_format='json_file', out_path=PosixPath('/tmp/cococo/json_file/annotations.json'), overwrite=False, seed=42)
INFO: Loading json_tree from dir: ../examples/coco_chunk/json_tree
INFO: Loaded from json_tree: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO: Writing dataset CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3) to json-file: /tmp/cococo/json_file/annotations.json
INFO: [+] Success: json_file dumped to /tmp/cococo/json_file/annotations.json


In [None]:
# json_file -> crop_tree

! rm -rf /tmp/cococo/crop_tree
! cocorepr \
    --in_json_file ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json \
    --out_path /tmp/cococo/crop_tree \
    --out_format crop_tree \
    --overwrite

INFO: Arguments: Namespace(debug=False, in_crop_tree=[], in_json_file=[PosixPath('../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json')], in_json_tree=[], indent=4, max_crops_per_class=None, out_format='crop_tree', out_path=PosixPath('/tmp/cococo/crop_tree'), overwrite=True, seed=42)
INFO: Loading json_file from: ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json
INFO: Loaded from json_file: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO: Dumping crop_tree to dir: /tmp/cococo/crop_tree
Processing images: 100%|██████████████████████████| 6/6 [00:04<00:00,  1.49it/s]
INFO: [+] Success: crop_tree dumped to /tmp/cococo/crop_tree: ['crops', 'images']


In [None]:
cats = !ls /tmp/cococo/crop_tree/crops
cat = cats[0]
cat

'bicycle--2'

In [None]:
# modify crop_tree

In [None]:
crops = !ls /tmp/cococo/crop_tree/crops/{cat}
deleted_crop = crops[0]
deleted_crop

'124710.png'

In [None]:
! ls /tmp/cococo/crop_tree/crops/{cat}
! rm /tmp/cococo/crop_tree/crops/{cat}/{deleted_crop}
! ls /tmp/cococo/crop_tree/crops/{cat}

124710.png  124713.png
124713.png


In [None]:
# json_file + crop_tree (modified) -> json_tree

! rm -rf /tmp/cococo/json_tree_2
! cocorepr \
    --in_json_file ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json \
    --in_crop_tree /tmp/cococo/crop_tree \
    --out_path /tmp/cococo/json_tree_2 \
    --out_format json_tree

# NOTE(review): `deleted_crop` carries the crop's image extension (e.g. '.png');
# confirm the file naming inside json_tree's `annotations` dir so this check is not vacuous.
! [ ! -f /tmp/cococo/json_tree_2/annotations/{deleted_crop} ] && echo "[+] File successfully not exists"

INFO: Arguments: Namespace(debug=False, in_crop_tree=PosixPath('/tmp/cococo/crop_tree'), in_json_files=[PosixPath('../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json')], in_json_trees=[], indent=4, out_format='json_tree', out_path=PosixPath('/tmp/cococo/json_tree_2'), overwrite=False)
INFO: Loading json file from file: ../examples/coco_chunk/json_file/instances_train2017_chunk3x2.json
INFO: Loaded: images=6, annotations=6, categories=3
INFO: Loading blob list from dir: /tmp/cococo/crop_tree
INFO: Loaded crop tree: len(annotations)=5 len(images)=5 len(categories)=3
INFO: Dumping json tree to dir: /tmp/cococo/json_tree_2
Traceback (most recent call last):
  File "/home/ay/.pyenv/versions/3.7.6/bin/cocorepr", line 33, in <module>
    sys.exit(load_entry_point('cocorepr', 'console_scripts', 'cocorepr')())
  File "/plain/github/nm/cocorepr/cocorepr/main.py", line 81, in main
    dump_fun(coco, out_path, skip_nulls=True, overwrite=overwrite, indent=indent)
  File "/plain/gith

In [None]:
# json_tree + crop_tree -> crop_tree

! rm -rf /tmp/cococo/crop_tree_2
! cocorepr \
    --in_json_tree /tmp/cococo/json_tree_2 \
    --in_crop_tree /tmp/cococo/crop_tree \
    --out_path /tmp/cococo/crop_tree_2 \
    --out_format crop_tree
! ls /tmp/cococo/crop_tree_2 | grep crops

INFO: Arguments: Namespace(debug=False, in_crop_tree=PosixPath('/tmp/cococo/crop_tree'), in_json_files=[], in_json_trees=[PosixPath('/tmp/cococo/json_tree_2')], indent=4, out_format='crop_tree', out_path=PosixPath('/tmp/cococo/crop_tree_2'), overwrite=False)
INFO: Loading json tree from dir: /tmp/cococo/json_tree_2
INFO: Loading blob list from dir: /tmp/cococo/crop_tree
INFO: Loaded crop tree: len(annotations)=5 len(images)=5 len(categories)=3
INFO: Dumping crop tree to dir: /tmp/cococo/crop_tree_2
Processing images: 100%|██████████████████████████| 5/5 [00:04<00:00,  1.20it/s]
INFO: [+] Success: crop_tree dumped to /tmp/cococo/crop_tree_2: ['crops', 'images']
crops


In [None]:
# json_tree + crop_tree -> json_file

! rm -rf /tmp/cococo/json_file_2
! cocorepr \
    --in_json_tree /tmp/cococo/json_tree_2 \
    --in_crop_tree /tmp/cococo/crop_tree \
    --out_path /tmp/cococo/json_file_2/annotations.json \
    --out_format json_file

! ls /tmp/cococo/json_file_2/annotations.json

INFO: Arguments: Namespace(debug=False, in_crop_tree=PosixPath('/tmp/cococo/crop_tree'), in_json_files=[], in_json_trees=[PosixPath('/tmp/cococo/json_tree_2')], indent=4, out_format='json_file', out_path=PosixPath('/tmp/cococo/json_file_2/annotations.json'), overwrite=False)
INFO: Loading json tree from dir: /tmp/cococo/json_tree_2
INFO: Loading blob list from dir: /tmp/cococo/crop_tree
INFO: Loaded crop tree: len(annotations)=5 len(images)=5 len(categories)=3
INFO: Dumping json file to file: /tmp/cococo/json_file_2/annotations.json
INFO: Writing dataset to json file: /tmp/cococo/json_file_2/annotations.json
INFO: [+] Success: json_file dumped to /tmp/cococo/json_file_2/annotations.json
/tmp/cococo/json_file_2/annotations.json


In [None]:
# json_tree + json_file -> json_file

! rm -rf /tmp/cococo/json_file_3/
! cocorepr \
    --in_json_tree /tmp/cococo/json_tree_2 \
    --in_json_file /tmp/cococo/json_file/annotations.json \
    --out_path /tmp/cococo/json_file_3/annotations.json \
    --out_format json_file \
    --overwrite


INFO: Arguments: Namespace(debug=False, in_crop_tree=None, in_json_files=[PosixPath('/tmp/cococo/json_file/annotations.json')], in_json_trees=[PosixPath('/tmp/cococo/json_tree_2')], indent=4, out_format='json_file', out_path=PosixPath('/tmp/cococo/json_file_3/annotations.json'), overwrite=True)
INFO: Loading json tree from dir: /tmp/cococo/json_tree_2
INFO: Loading json file from file: /tmp/cococo/json_file/annotations.json
INFO: Loaded: images=6, annotations=6, categories=3
INFO: Dumping json file to file: /tmp/cococo/json_file_3/annotations.json
INFO: Writing dataset to json file: /tmp/cococo/json_file_3/annotations.json
INFO: [+] Success: json_file dumped to /tmp/cococo/json_file_3/annotations.json
