In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every bare expression in a cell, not only the last one
# (the default is 'last_expr'). Cells that end with several bare expressions,
# such as the two len(...) calls after the public/private split, rely on this
# to show all of their results.
InteractiveShell.ast_node_interactivity = 'all'

%load_ext autoreload
%autoreload 2

import sys
# Extend the module search path so the data_management imports further down can be
# resolved (presumably that package lives in this repo - confirm).
# NOTE(review): this is a machine-specific absolute path; change it to your own checkout.
sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data')  # append this repo to PYTHONPATH

import json
import os
from collections import Counter, defaultdict
from random import sample
import math

from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# Importing the Island Conservation 20190529 drop 

This dataset is different in that the class labels are at the bounding-box level. Since we only query the `class` attribute on sequences and images when gathering training data, we copy the species label from each bounding box to the `class` attribute at the image level so that future queries stay intuitive. The sequences are dummy sequences, because the original dataset contains no sequence information.

The timestamps are not read from the image metadata.

In [8]:
# Values written to the 'dataset' field. Sequences containing humans are stored
# under the separate private name.
dataset_name = 'islandconservation_200529'
dataset_private_name = 'islandconservation_200529_private'

# Bounding-box labels that keep a dedicated category; any label not listed here
# is looked up with a default of 'animal' when the boxes are relabelled.
label_map = {'human': 'person', 'vehicle': 'vehicle', 'empty': 'empty'}

# Inputs: each is a path to a CCT json, or an already loaded json object.
# Both point to the same file for this dataset.
path_to_image_cct = '.../data/CameraTraps/CCT_JSONs/island_conservation.json'
path_to_bbox_cct = '.../data/CameraTraps/CCT_JSONs/island_conservation.json'
assert path_to_image_cct is not None or path_to_bbox_cct is not None

# Outputs: one MegaDB json for the public set and one for the private (human) set.
path_to_output_public = '.../CameraTrap/Databases/megadb_2020/to_ingest/island_conservation_200529_megadb.json'
path_to_output_private = '.../CameraTrap/Databases/megadb_2020/to_ingest/island_conservation_200529_private_megadb.json'

In [3]:
# Combine the image-level and bbox-level CCT JSONs into a single collection of entries.
# Both paths point to the same file for this dataset, which is consistent with the log
# below reporting 0 images added or amended from the bbox DB.
embedded = make_cct_embedded(image_db=path_to_image_cct, bbox_db=path_to_bbox_cct)

Loading image DB...
Number of items from the image DB: 127410
Number of images with more than 1 species: 5927 (4.65% of image DB)
Loading bbox DB...
Number of images added from bbox DB entries:  0
Number of images amended:  0
Number of items in total:  127410
Number of images with more than one bounding box: 5927 (4.651911152970724% of all entries)


In [4]:
# Turn the embedded entries into MegaDB sequence items tagged with dataset_name.
# The log below reports one sequence per image (127410), with seq_id values
# prefixed "dummy_".
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to islandconservation_200529. Please make sure this is correct!
Making a deep copy of docs...


 20%|██        | 25994/127410 [00:00<00:00, 129953.86it/s]

Putting 127410 images into sequences...


100%|██████████| 127410/127410 [00:00<00:00, 134468.37it/s]


Number of sequences: 127410
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'id', 'class', 'file', 'bbox'}

img_level_properties
{'image_id', 'file'}

image-level properties that really should be sequence-level
{'id', 'class', 'bbox'}

Finished processing sequences.
Example sequence items:

{"dataset": "islandconservation_200529", "seq_id": "dummy_cdfb0f599923412f8c265ca691237d7b", "images": [{"file": "dominicanrepublic/camara02/cam0226junio2015/dominicanrepublic_cam0226junio2015_20131026_063520_sunp0022.jpg"}], "class": ["cow"], "id": "dominicanrepublic_camara02_cam0226junio2015_dominicanrepublic_cam0226junio2015_20131026_063520_sunp0022", "bbox": [{"category": "cow", "bbox": [0, 0.141, 0.397, 0.427]}]}

[{"dataset": "islandconservation_200529", "seq_id": "dummy_d3fc351fbe004817b3ca3c49c802523f", "images": [{"file": "dominicanrepublic/camara04/cam0425abril2015/dominicanrepublic_cam0425abril2015_20130809_045413_sunp17

In [5]:
# Snippet originally written by Vardhan.
# Every sequence here is a single-image "dummy" sequence, so the labels are pushed
# down to that image, which is where future queries will look for them.

for entry in sequences:
    # 'id' is removed from the sequence and is not copied anywhere else
    del entry['id']

    # keep the species of each box under 'class'; 'category' becomes the coarse
    # label from label_map, defaulting to 'animal'
    for box in entry['bbox']:
        species = box['category']
        box['class'] = species
        box['category'] = label_map.get(species, 'animal')

    # the only image of the sequence takes over the boxes and the class list
    image = entry['images'][0]
    image['bbox'] = entry.pop('bbox')
    image['class'] = entry.pop('class')

In [9]:
# The human images live in a different (private) container from the rest, so the
# sequences are split into two lists based on the class list of their single image.
sequences_human = [
    seq for seq in sequences if 'human' in seq['images'][0]['class']
]
sequences_non_human = [
    seq for seq in sequences if 'human' not in seq['images'][0]['class']
]

len(sequences_non_human)
len(sequences_human)

122602

4808

There are exactly 4808 images in the 'human' folder in the private container, which matches the number of human sequences above, so the split looks correct.

In [10]:
# The human sequences belong to the private dataset, so overwrite their
# 'dataset' field with the private dataset name.

for private_seq in sequences_human:
    private_seq['dataset'] = dataset_private_name

In [12]:
# Validate the public (non-human) sequence items with the MegaDB schema checker
# before they are written out.
sequences_schema_check.sequences_schema_check(sequences_non_human)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [13]:
# Validate the private (human) sequence items with the MegaDB schema checker
# before they are written out.
sequences_schema_check.sequences_schema_check(sequences_human)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [14]:
# Save the public (non-human) sequence items to the public output path.
write_json(path_to_output_public, sequences_non_human)

In [15]:
# Save the private (human) sequence items to the private output path.
write_json(path_to_output_private, sequences_human)

## Add width and height back in for the non-human set
We reuse the width and height already recorded for each image in the original CCT JSON, which avoids downloading every image just to read its dimensions.

In [None]:
from data_management.cct_json_utils import IndexedJsonDb

In [None]:
# Load the original CCT JSON into an indexed wrapper; its image_id_to_image
# mapping is used below to look up the per-image records.
cct_json_db = IndexedJsonDb(path_to_image_cct)

In [None]:
# Look-up table from an image's file_name to its full CCT image record
# (the record is where 'width' and 'height' are read from later).
file_name_to_dims = {
    record['file_name']: record
    for record in cct_json_db.image_id_to_image.values()
}

In [None]:
def round_to_int(f):
    """Round a number to the nearest integer and return it as a built-in int.

    Rounding is done by the built-in round(), so floats that are exactly
    halfway between two integers go to the even one (2.5 -> 2, 3.5 -> 4).
    """
    nearest = round(f)
    return int(nearest)

In [None]:
# Attach the pixel dimensions to every image and convert each bounding box from
# relative [x, y, w, h] coordinates to absolute pixel values.
#
# NOTE: this mutates sequences_non_human in place and is NOT idempotent. Running
# the cell a second time would scale the already absolute coordinates again, so
# re-create sequences_non_human before re-running it.
for seq in sequences_non_human:
    for im in seq['images']:
        # the CCT image record is looked up by file name; a missing file raises KeyError
        item = file_name_to_dims[im['file']]
        image_width = im['width'] = item['width']
        image_height = im['height'] = item['height']

        for b in im['bbox']:
            rel_x, rel_y, rel_w, rel_h = b['bbox'][:4]

            # keep the top-left corner inside the image
            x = min(max(round_to_int(rel_x * image_width), 0), image_width)
            y = min(max(round_to_int(rel_y * image_height), 0), image_height)

            # Clamp the extent against the space left after the corner rather than
            # against the full image dimension; otherwise rounding can push
            # x + box_w (or y + box_h) past the image edge. Never go below zero.
            box_w = max(min(round_to_int(rel_w * image_width), image_width - x), 0)
            box_h = max(min(round_to_int(rel_h * image_height), image_height - y), 0)

            b['bbox'] = [x, y, box_w, box_h]

In [None]:
# Save the non-human sequence items, now carrying width/height and absolute pixel
# bounding boxes, as the input file for the megadb_to_cct conversion command below.
# NOTE(review): this is a machine-specific absolute path; adjust before re-running.
write_json('/Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_abs.json', sequences_non_human)

Command for converting from these MegaDB entries back to CCT ("one source of truth"):

```
python data_management/megadb/converters/megadb_to_cct.py "islandconservation_200529" /Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_abs.json /Users/siyuyang/Source/temp_data/CameraTrap/megadb_from_cct/island_conservation_200529/megadb_to_cct_files/island_conservation_200529_megadb_to_cct.json
```

```
...
Final CCT DB has 154379 image entries, and 213562 annotation entries.
```