# Crop tree representation
> Methods to work with `crop_tree` representation: load/dump from/to a directory.

In [None]:
# default_exp crop_tree

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# export

import json
import shutil
import logging
from collections import defaultdict
from dataclasses import dataclass, Field
from typing import *
from pathlib import Path
from multiprocessing import Pool, Lock

from cocorepr.utils import sort_dict, measure_time, read_image, write_image, cut_bbox
from cocorepr.coco import *

In [None]:
# Root logger (`getLogger()` called without a name); every function below logs through it.
# NOTE(review): this cell has no `# export` marker although the exported
# functions below reference `logger` - confirm the generated module defines it.
logger = logging.getLogger()

In [None]:
# hide
# Notebook-only setup: make DEBUG-level records visible in the cell outputs.
logging.basicConfig(level=logging.DEBUG)

In [None]:
# export

def load_crop_tree(
    source_dir: Union[str, Path],
    base_coco: CocoDataset,
    *,
    kind: str = "object_detection",
) -> CocoDataset:
    """Load the set of crops stored under `{source_dir}/crops` and use it
    to filter the contents of `base_coco`.

    The directory is read as `crops/<name>--<category_id>/<annotation_id>.png`:
    the category id is the integer after the last `--` in the sub-directory
    name and the annotation id is the integer stem of each `*.png` file.
    Only annotations that have a crop file are kept, together with their
    images and with the categories of the directories the crops were found in.
    Every other field of `base_coco` is carried over unchanged.

    Args:
        source_dir: Root directory of the crop tree (must contain `crops/`).
        base_coco: Dataset the crops are looked up in.
        kind: Dataset kind, passed to `get_dataset_class`.

    Returns:
        A new dataset of the class selected by `kind`.

    Raises:
        ValueError: If `source_dir` or `source_dir/crops` is not a directory,
            or if a directory suffix / file stem is not an integer.
        KeyError: If a category id or annotation id found on disk is not
            present in `base_coco`.
    """
    dataset_class = get_dataset_class(kind)

    source_dir = Path(source_dir)
    logger.info(f"Loading crop_tree from dir: {source_dir}")
    if not source_dir.is_dir():
        raise ValueError(f"Source dir not found: {source_dir}")

    crops_dir = source_dir / 'crops'
    if not crops_dir.is_dir():
        raise ValueError(f'Source crops dir not found: {crops_dir}')

    catid2cat = {cat.id: cat for cat in base_coco.categories}
    imgid2img = {img.id: img for img in base_coco.images}
    annid2ann = {ann.id: ann for ann in base_coco.annotations}

    res_cats = {}
    res_imgs = {}
    res_anns = {}

    # Explicit counters: the loop variables of `enumerate` stay unbound (or
    # keep a stale value) when a directory turns out to be empty.
    num_dirs = 0
    with measure_time() as timer1:
        for ann_dir in crops_dir.iterdir():
            if not ann_dir.is_dir():
                # `dump_crop_tree` stores `crops_failed.ndjson` next to the
                # category directories; plain files are not crop directories.
                logger.debug(f'- skipping non-directory entry {ann_dir}')
                continue
            num_dirs += 1

            cat_id = int(ann_dir.name.split('--')[-1])
            cat = catid2cat[cat_id]

            num_crops = 0
            with measure_time() as timer2:
                for ann_file in ann_dir.glob('*.png'):
                    num_crops += 1
                    ann = annid2ann[int(ann_file.stem)]
                    img = imgid2img[ann.image_id]

                    res_cats[cat.id] = cat
                    res_imgs[img.id] = img
                    res_anns[ann.id] = ann
            logger.debug(f'- loaded {num_crops} crops from {ann_dir}: elapsed {timer2.elapsed}')
    # Logged after the `with` block: while still inside it, `elapsed` is None.
    logger.info(f'Loaded from {num_dirs} crop directories: elapsed {timer1.elapsed}')

    with measure_time() as timer:
        D = {
            **base_coco.to_dict(),
            'images': list(res_imgs.values()),
            'annotations': list(res_anns.values()),
            'categories': list(res_cats.values()),
        }
    logger.info(f'Dataset dict constructed: elapsed {timer.elapsed}')

    with measure_time() as timer:
        coco = dataset_class.from_dict(D)
    logger.info(f'Dataset object constructed: elapsed {timer.elapsed}: {coco.to_full_str()}')

    return coco

In [None]:
# hide
# Notebook test: the annotation ids returned by `load_crop_tree` must equal
# the ids of the crop files present in the example crop tree.
import tempfile
from cocorepr.json_tree import load_json_tree, dump_json_tree
from pathlib import Path

SRC_COCO = '../examples/coco_chunk/json_tree/'
SRC_BLOB = '../examples/coco_chunk/crop_tree/'

# Expected ids: the integer stem of every file in every crops/<category> directory.
expected_crop_ids = sorted([int(p.stem)
                            for cat_p in (Path(SRC_BLOB)/'crops').iterdir()
                            for p in cat_p.iterdir()])
display(expected_crop_ids)

# NOTE(review): `DST` is not used in this cell.
DST = tempfile.mktemp()
d = load_crop_tree(SRC_BLOB, load_json_tree(SRC_COCO))

actual_crop_ids = sorted([ann.id for ann in d.annotations])
display(actual_crop_ids)
assert actual_crop_ids == expected_crop_ids, actual_crop_ids

[124710, 124713, 131774, 131812, 183020, 183030]

INFO:root:Loading json_tree from dir: ../examples/coco_chunk/json_tree
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/annotations
DEBUG:root:Loaded 3 json chunks from ../examples/coco_chunk/json_tree/categories
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/images
DEBUG:root:Loaded 8 json chunks from ../examples/coco_chunk/json_tree/licenses
DEBUG:root:Loaded single-file json chunk ../examples/coco_chunk/json_tree/info.json
INFO:root:  json files loaded: elapsed 0:00:00.006344
INFO:root:  dataset constructed: elapsed 0:00:00.004539
INFO:root:Loaded from json_tree: CocoObjectDetectionDataset(annotations=6, categories=3, images=6, licenses=8)
INFO:root:Loading crop_tree from dir: ../examples/coco_chunk/crop_tree
INFO:root:  loaded 2 crops from ../examples/coco_chunk/crop_tree/crops/bicycle--2: elapsed 0:00:00.000303
INFO:root:  loaded 2 crops from ../examples/coco_chunk/crop_tree/crops/person--1: elapsed 0:00:00.000053
INFO:root:  loaded 2 cr

[124710, 124713, 131774, 131812, 183020, 183030]

In [None]:
# # export
# def _cut_to_chunks(L: List[Any], n) -> List[List[Any]]:
#     assert n > 0
#     return [
#         L[i: i+n] + [None]*(n - len(L[i: i+n]))
#         for i in range(0, len(L), n)
#     ]

In [None]:
# res = _cut_to_chunks([1,2,3,4,5,6,7], 4)
# assert res == [[1, 2, 3, 4], [5, 6, 7, None]]

# res = _cut_to_chunks([1,2,3,4,5,6,7], 3)
# assert res == [[1, 2, 3], [4, 5, 6], [7, None, None]], res

# res = _cut_to_chunks([1,2,3,4,5,6,7], 2)
# assert res == [[1, 2], [3, 4], [5, 6], [7, None]], res

# res = _cut_to_chunks([1,2,3,4,5,6,7], 1)
# assert res == [[1], [2], [3], [4], [5], [6], [7]], res

# res = _cut_to_chunks([], 2)
# assert res == []

In [None]:
# export

def _delete_extra_files(coco, target_dir, images_dir, crops_dir, catid2cat):
    logger.info(f'Collecting extra files to clean in root {target_dir}')
    to_remove = []

    a = {images_dir}
    b = {images_dir/img.get_file_name() for img in coco.images}
    c = {crops_dir}
    d = {crops_dir/catid2cat[cat.id].get_dir_name() for cat in coco.categories}
    e = {crops_dir/catid2cat[ann.category_id].get_dir_name()/ann.get_file_name() for ann in coco.annotations}
    all_files = a | b | c | d | e

    for p in target_dir.glob('**/*'):
        if p not in all_files:
            to_remove.append(p)
    to_remove = sorted(to_remove)
    removed_str = '\n'.join(map(str, to_remove))
    if removed_str:
        removed_str = '\n' + removed_str
    logger.info(f'Removing {len(to_remove)} files and dirs:{removed_str}')
    # reversed so that files get deleted before their dirs
    for p in reversed(to_remove):
        try:
            if p.is_file():
                p.unlink()
            else:
                shutil.rmtree(str(p))
        except BaseException as e:
            fod = 'file' if p.is_file() else 'dir'
            logger.warning(f'Could not delete {fod} {p} (ignoring!): {e}')
            continue
    logger.info(f'[.] Removed {len(to_remove)} files and dirs.')

    
def _process_image(img, anns, images_dir, crops_dir, catid2cat, anns_failed_file):
    file_name = img.get_file_name()
    image_file = images_dir / file_name
    image = None

    for ann in anns:
        cat = catid2cat[ann.category_id]
        cat_dir = crops_dir / cat.get_dir_name()
        cat_dir.mkdir(exist_ok=True)

        ann_file = cat_dir / ann.get_file_name()
        if ann_file.is_file():
            continue
        
        if image is None:
            image = read_image(image_file, download_url=img.coco_url)
        box = cut_bbox(image, ann.bbox)
        try:
            write_image(box, ann_file)
        except ValueError as e:
            logger.error(e)
            with anns_failed_file.open('a') as f:
                f.write(json.dumps(ann.to_dict(), ensure_ascii=False) + '\n')


def _process_image_list(args):
    if not args:
        return
    _process_image(*args)


def dump_crop_tree(
    coco: CocoDataset,
    target_dir: Union[str, Path],
    *,
    kind: str = 'object_detection',
    skip_nulls: bool = True,
    overwrite: bool = False,
    indent: Optional[int] = 4,
    num_processes: int = 1,
) -> None:
    """Write the annotations of `coco` as a tree of crop images.

    Creates `target_dir/images` and `target_dir/crops`, then processes the
    annotations image by image in a pool of `num_processes` worker processes
    (see `_process_image`). Annotations whose crop could not be written are
    collected by the workers in `crops/crops_failed.ndjson`; their number is
    reported with a warning at the end.

    With `overwrite=True` an existing `target_dir` is not wiped: only the files
    and directories that do not belong to `coco` are removed, so crops that
    already exist are kept and not written again.

    Args:
        coco: Dataset to dump.
        target_dir: Destination directory.
        kind: Dataset kind, passed to `get_dataset_class`.
        skip_nulls: Currently unused by this function; kept in the signature.
        overwrite: Allow writing into an existing `target_dir`.
        indent: Currently unused by this function; kept in the signature.
        num_processes: Size of the worker pool.

    Raises:
        ValueError: If `target_dir` already exists and `overwrite` is false.
    """
    try:
        from tqdm.auto import tqdm
    except ImportError:
        logger.warning("Could not import tqdm, please run 'pip install tqdm'")
        def tqdm(it, *args, **kwargs):
            yield from it

    # The returned class is not needed here; the call is kept so that `kind`
    # is still handed to `get_dataset_class` (presumably it rejects unknown
    # kinds - TODO confirm).
    get_dataset_class(kind)

    target_dir = Path(target_dir)
    logger.info(f"Dumping crop_tree to dir: {target_dir}")

    if target_dir.is_dir():
        if not overwrite:
            raise ValueError(f"Destination crop tree dir already exists: {target_dir}")
        logger.warning(f'Destination exists and will be overwritten: {target_dir}')

    target_dir.mkdir(parents=True, exist_ok=True)
    catid2cat = {cat.id: cat for cat in coco.categories}

    imgid2img = {img.id: img for img in coco.images}
    imgid2anns = defaultdict(list)
    for ann in coco.annotations:
        imgid2anns[ann.image_id].append(ann)

    images_dir = target_dir / 'images'
    images_dir.mkdir(exist_ok=True)

    crops_dir = target_dir / 'crops'
    crops_dir.mkdir(exist_ok=True)

    anns_failed_file = crops_dir / 'crops_failed.ndjson'

    if overwrite:
        # Prune leftovers of a previous dump; the failure log of that dump is
        # not part of the dataset, so it is removed here as well.
        _delete_extra_files(coco, target_dir, images_dir, crops_dir, catid2cat)

    with measure_time() as timer:
        tasks = [
            (imgid2img[imgid], anns, images_dir, crops_dir, catid2cat, anns_failed_file)
            for (imgid, anns) in imgid2anns.items()
        ]
        with Pool(num_processes) as pool:
            # `list(...)` drains the lazy `imap` iterator so all tasks finish
            # before the pool is torn down.
            list(tqdm(pool.imap(_process_image_list, tasks), total=len(tasks), desc='Processing images'))

    logger.info(f'Crops written to {crops_dir}: elapsed {timer.elapsed}')

    # Failures happen inside the worker processes and are only visible through
    # the file they append to, so the records are counted from that file.
    # Binary mode keeps the count independent of the text encoding.
    num_failed = 0
    if anns_failed_file.is_file():
        with anns_failed_file.open('rb') as f:
            num_failed = sum(1 for line in f if line.strip())
    if num_failed:
        logger.warning(f'Failed to process {num_failed} crops, see file {anns_failed_file}')

In [None]:
# hide
# Notebook test: dump the example dataset as a crop tree and check that the
# directories created under `crops/` are exactly the category directory names.
import tempfile
from cocorepr.json_tree import load_json_tree

SRC = '../examples/coco_chunk/json_tree/'
DST = tempfile.mktemp()
d = load_json_tree(SRC)

dump_crop_tree(d, DST, num_processes=2)

# IPython shell capture: list the entries of the crops directory.
actual = ! ls {DST}/crops
actual = set(actual)
expected = set(c.get_dir_name() for c in d.categories)
assert expected == actual, (expected, actual)

# NOTE(review): `expected` is not interpolated here, so this lists a path
# literally named "expected" - confirm whether this line is still needed.
some_crops = !ls expected

INFO:root:Loading json_tree from dir: ../examples/coco_chunk/json_tree
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/annotations
DEBUG:root:Loaded 3 json chunks from ../examples/coco_chunk/json_tree/categories
DEBUG:root:Loaded 6 json chunks from ../examples/coco_chunk/json_tree/images
DEBUG:root:Loaded 8 json chunks from ../examples/coco_chunk/json_tree/licenses
DEBUG:root:Loaded single-file json chunk ../examples/coco_chunk/json_tree/info.json
INFO:root:- json files loaded: elapsed 0:00:00.007591
INFO:root:- dataset constructed: elapsed 0:00:00.004008
INFO:root:Loaded from json_tree: CocoObjectDetectionDataset(annotations=6, categories=3, images=6, licenses=8)
INFO:root:Dumping crop_tree to dir: /tmp/tmp9vk7qmar
INFO:root:Crops written to /tmp/tmp9vk7qmar/crops: elapsed 0:00:03.220410


In [None]:
# Notebook test: delete one crop file from the dumped tree, reload the tree
# and check that the deleted crop is gone from the resulting dataset.
crop_files = ! ls {DST}/crops
some_crops = ! ls {DST}/crops/{crop_files[0]}
! echo {some_crops}
deleted_crop = some_crops[0]
! rm {DST}/crops/{crop_files[0]}/{deleted_crop}
! ls {DST}/crops/{crop_files[0]}

d2 = load_crop_tree(DST, d)
# NOTE(review): `deleted_crop` is a file name (e.g. "<id>.png"); verify that
# `to_json()` would contain that string for a present annotation, otherwise
# this assertion passes regardless of the filtering.
assert deleted_crop not in d2.to_json(), (deleted_crop, d2.to_json())

[124710.png, 124713.png]
124713.png


INFO:root:Loading crop_tree from dir: /tmp/tmp9vk7qmar
INFO:root:  loaded 2 crops from /tmp/tmp9vk7qmar/crops/car--3: elapsed 0:00:00.000067
INFO:root:  loaded 2 crops from /tmp/tmp9vk7qmar/crops/person--1: elapsed 0:00:00.000055
INFO:root:  loaded 1 crops from /tmp/tmp9vk7qmar/crops/bicycle--2: elapsed 0:00:00.000102
INFO:root:Loaded from 3 crop directories: elapsed None
INFO:root:Dataset dict constructed: elapsed 0:00:00.001974
INFO:root:Dataset object constructed: elapsed 0:00:00.001158: CocoObjectDetectionDataset(annotations=5, categories=3, images=5, licenses=8)
