# Crop tree representation
> Methods for working with the `crop_tree` representation: loading it from a directory and dumping it to one.

In [None]:
# default_exp crop_tree

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export

import json
import shutil
import logging
from collections import defaultdict
from dataclasses import dataclass, Field
from typing import *
from pathlib import Path

from cocorepr.utils import *
from cocorepr.coco import *

In [None]:
# Logger used by the functions in this notebook; `getLogger()` without a name
# returns the root logger.
logger = logging.getLogger()

In [None]:
# hide
# Configure the root logger so that DEBUG-level messages show up in the cell outputs.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# export

def load_crop_tree(
    source_dir: Union[str, Path],
    base_coco: CocoDataset,
    *,
    kind: str = "object_detection",
) -> CocoDataset:
    """Load the set of crops stored in `{source_dir}/crops` and use it
    to filter the images, annotations and categories of `base_coco`.

    Expected layout: `{source_dir}/crops/<anything>--<category_id>/<annotation_id>.png`.
    The category id is the integer after the last `--` in the directory name,
    the annotation id is the integer stem of each PNG file.

    Args:
        source_dir: Directory that contains the `crops` sub-directory.
        base_coco: Dataset the crop ids are resolved against.
        kind: Dataset kind, passed to `get_dataset_class` to pick the class
            of the returned dataset.

    Returns:
        A dataset built from `base_coco.to_dict()` in which `images`,
        `annotations` and `categories` are replaced by the entries that have
        at least one crop file; all other fields are carried over unchanged.

    Raises:
        ValueError: If `source_dir` or `{source_dir}/crops` is not a directory,
            or a directory / file name does not end with an integer id.
        KeyError: If a crop refers to a category, annotation or image id
            that is absent from `base_coco`.
    """
    dataset_class = get_dataset_class(kind)

    source_dir = Path(source_dir)
    logger.info(f"Loading crop_tree from dir: {source_dir}")
    if not source_dir.is_dir():
        raise ValueError(f"Source dir not found: {source_dir}")

    crops_dir = source_dir / 'crops'
    # `is_dir` (not `exists`): a regular file named `crops` cannot be iterated.
    if not crops_dir.is_dir():
        raise ValueError(f'Source crops dir not found: {crops_dir}')

    catid2cat = {cat.id: cat for cat in base_coco.categories}
    imgid2img = {img.id: img for img in base_coco.images}
    annid2ann = {ann.id: ann for ann in base_coco.annotations}

    # Dicts keyed by id de-duplicate entries while keeping first-seen order.
    res_cats = {}
    res_imgs = {}
    res_anns = {}

    for ann_dir in crops_dir.iterdir():
        if not ann_dir.is_dir():
            # `dump_crop_tree` may leave `crops_failed.ndjson` next to the
            # category directories; such plain files are not categories.
            logger.debug(f'Skipping non-directory entry: {ann_dir}')
            continue

        cat_id = int(ann_dir.name.split('--')[-1])
        cat = catid2cat[cat_id]

        # Explicit counter: stays 0 (instead of being undefined or stale)
        # when the category directory contains no PNG files.
        num_crops = 0
        for ann_file in ann_dir.glob('*.png'):
            ann = annid2ann[int(ann_file.stem)]
            img = imgid2img[ann.image_id]

            res_cats[cat.id] = cat
            res_imgs[img.id] = img
            res_anns[ann.id] = ann
            num_crops += 1
        logger.debug(f'Loaded {num_crops} crops from: {ann_dir}')

    D = {
        **base_coco.to_dict(),
        'images': list(res_imgs.values()),
        'annotations': list(res_anns.values()),
        'categories': list(res_cats.values()),
    }
    coco = dataset_class.from_dict(D)
    logger.info(f"Loaded from crop_tree: {coco.to_full_str()}")

    return coco

In [None]:
# hide
import tempfile
from cocorepr.json_tree import load_json_tree, dump_json_tree
from pathlib import Path

SRC_COCO = '../examples/coco_chunk/json_tree/'
SRC_BLOB = '../examples/coco_chunk/crop_tree/'

# Expected annotation ids are the stems of the PNG crops inside each category
# directory (the loader globs `*.png`); stray files are ignored.
expected_crop_ids = sorted(
    int(p.stem)
    for cat_p in (Path(SRC_BLOB) / 'crops').iterdir()
    if cat_p.is_dir()
    for p in cat_p.glob('*.png')
)
display(expected_crop_ids)

# Loading only reads from disk, so no temporary destination is needed here.
d = load_crop_tree(SRC_BLOB, load_json_tree(SRC_COCO))

actual_crop_ids = sorted(ann.id for ann in d.annotations)
display(actual_crop_ids)
assert actual_crop_ids == expected_crop_ids, actual_crop_ids

[124710, 124713, 131774, 131812, 183020, 183030]

INFO:root:Loading json_tree from dir: ../examples/coco_chunk/json_tree
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/images
DEBUG:root:Loaded 8 json chunks from ../examples/coco_chunk/json_tree/licenses
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/annotations
DEBUG:root:Loaded 3 json chunks from ../examples/coco_chunk/json_tree/categories
DEBUG:root:Loaded single-file json chunk ../examples/coco_chunk/json_tree/info.json
INFO:root:Loaded from json_tree: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO:root:Loading crop_tree from dir: ../examples/coco_chunk/crop_tree
DEBUG:root:Loaded 2 crops from: ../examples/coco_chunk/crop_tree/crops/bicycle--2
DEBUG:root:Loaded 2 crops from: ../examples/coco_chunk/crop_tree/crops/person--1
DEBUG:root:Loaded 2 crops from: ../examples/coco_chunk/crop_tree/crops/car--3
INFO:root:Loaded from crop_tree: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)

[124710, 124713, 131774, 131812, 183020, 183030]

In [None]:
# export

def dump_crop_tree(
    coco: CocoDataset,
    target_dir: Union[str, Path],
    *,
    kind: str = 'object_detection',
    skip_nulls: bool = False,
    overwrite: bool = False,
    indent: Optional[int] = 4,
) -> None:
    """Dump `coco` to `target_dir` as a tree of per-annotation image crops.

    Resulting layout:
        `{target_dir}/images/<img.file_name>`: image location passed to `read_image`;
        `{target_dir}/crops/<cat.get_dir_name()>/<ann.id>.png`: one crop per annotation;
        `{target_dir}/crops/crops_failed.ndjson`: one JSON line per annotation
        whose crop could not be written (created only on failure).

    Args:
        coco: Dataset whose annotations are cropped.
        target_dir: Destination directory; it is (re)created by this function.
        kind: Dataset kind, passed to `get_dataset_class`.
        skip_nulls: Accepted for signature parity; currently has no effect here.
        overwrite: If true, an existing `target_dir` is deleted first;
            otherwise an existing `target_dir` is an error.
        indent: Accepted for signature parity; currently has no effect here.

    Raises:
        ValueError: If `target_dir` already exists and `overwrite` is false.
        AssertionError: If an image has an empty `file_name`.
        KeyError: If an annotation refers to an image or category id that is
            absent from `coco`.
    """
    try:
        from tqdm.auto import tqdm
    except ImportError:
        logger.warning("Could not import tqdm, please run 'pip install tqdm'")

        def tqdm(it, *args, **kwargs):
            # Fallback without a progress bar: hand the iterable back as is.
            return it

    # The returned class is not needed here; the call is kept so `kind` is
    # still resolved up front (presumably rejecting unknown kinds - TODO confirm).
    get_dataset_class(kind)

    target_dir = Path(target_dir)
    logger.info(f"Dumping crop_tree to dir: {target_dir}")

    if target_dir.is_dir():
        if not overwrite:
            raise ValueError(f"Destination crop tree dir already exists: {target_dir}")
        logger.warning(f'Destination exists and will be overwritten: {target_dir}')
        logger.info(f'Deleting old target directory {target_dir}')
        shutil.rmtree(str(target_dir))

    target_dir.mkdir(parents=True)

    catid2cat = {cat.id: cat for cat in coco.categories}
    imgid2img = {img.id: img for img in coco.images}

    # Group annotations by image so that every image is read only once.
    imgid2anns = defaultdict(list)
    for ann in coco.annotations:
        imgid2anns[ann.image_id].append(ann)

    images_dir = target_dir / 'images'
    images_dir.mkdir(exist_ok=True)

    crops_dir = target_dir / 'crops'
    crops_dir.mkdir(exist_ok=True)

    anns_failed = []
    anns_failed_file = crops_dir / 'crops_failed.ndjson'
    for imgid, anns in tqdm(imgid2anns.items(), desc='Processing images'):
        img = imgid2img[imgid]
        assert img.file_name, f'Empty file name for img: {img}'
        image_file = images_dir / img.file_name
        # NOTE(review): `read_image` presumably fetches `download_url` when
        # `image_file` is missing - confirm against its implementation.
        image = read_image(image_file, download_url=img.coco_url)

        for ann in anns:
            cat = catid2cat[ann.category_id]
            cat_dir = crops_dir / cat.get_dir_name()
            cat_dir.mkdir(exist_ok=True)

            ann_file = cat_dir / f'{ann.id}.png'
            box = cut_bbox(image, ann.bbox)
            try:
                write_image(box, ann_file)
            except ValueError as e:
                # Best effort: record the failed annotation and keep going.
                logger.error(e)
                anns_failed.append(ann)
                with anns_failed_file.open('a') as f:
                    f.write(json.dumps(ann.to_dict()) + '\n')

    if anns_failed:
        logger.warning(f'Failed to process {len(anns_failed)} crops, see file {anns_failed_file}')

In [None]:
# hide
import tempfile
from pathlib import Path
from cocorepr.json_tree import load_json_tree

SRC = '../examples/coco_chunk/json_tree/'
# `tempfile.mktemp` is deprecated and race-prone, so create a private temporary
# directory instead and dump into a not-yet-existing child of it
# (`dump_crop_tree` refuses an existing target unless `overwrite=True`).
DST = str(Path(tempfile.mkdtemp()) / 'crop_tree')
d = load_json_tree(SRC)

dump_crop_tree(d, DST)

# Entries created under `crops` must be exactly the category directory names.
actual = {p.name for p in (Path(DST) / 'crops').iterdir()}
expected = {c.get_dir_name() for c in d.categories}
assert expected == actual, (expected, actual)

INFO:root:Loading json_tree from dir: ../examples/coco_chunk/json_tree
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/images
DEBUG:root:Loaded 8 json chunks from ../examples/coco_chunk/json_tree/licenses
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/annotations
DEBUG:root:Loaded 3 json chunks from ../examples/coco_chunk/json_tree/categories
DEBUG:root:Loaded single-file json chunk ../examples/coco_chunk/json_tree/info.json
INFO:root:Loaded from json_tree: CocoObjectDetectionDataset(images=6, licenses=8, annotations=6, categories=3)
INFO:root:Dumping crop_tree to dir: /tmp/tmpcu9m9qgq
