In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several values from a single cell.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Make the CameraTraps repo importable (the `data_management` imports below need it).
# The checkout location can be overridden with the CAMERATRAPS_REPO environment
# variable; when it is not set, the original local path is used.
repo_root = os.environ.get('CAMERATRAPS_REPO', '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')

# Guard the append so re-running this cell does not pile up duplicate entries.
if repo_root not in sys.path:
    sys.path.append(repo_root)  # append this repo to PYTHONPATH

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

from tqdm import tqdm
from unidecode import unidecode 

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# Ingesting channel_islands_tnc and channel_islands_tnc_private

In [4]:
# One CCT label file holds both the animal and the human entries, so the
# image DB path and the bbox DB path refer to the same JSON.
path_to_bbox_cct = path_to_image_cct = '/Users/siyuyang/Data/CameraTraps/CCT_JSONs/channel_islands_camera_traps.json'

## Separate out human and non-human images

Some images have "empty" among their species labels but are in fact not empty and have other species labels as well. All "empty" bbox annotations also carry a set of coordinates that covers the entire image - these bounding boxes need to be deleted.

In [16]:
# Load the CCT label file into memory. The encoding is given explicitly so the
# read does not depend on the platform's default locale encoding, consistent
# with the explicit UTF-8 used when this notebook writes its JSON outputs.
with open(path_to_image_cct, encoding='utf-8') as f:
    cct_json = json.load(f)

In [17]:
# Number of annotation entries in the label file, before any bounding boxes are removed.
len(cct_json['annotations'])

264321

In [18]:
# Strip the bounding box from every annotation with "category_id": 0 (empty).
# First gather the affected annotations, then remove their 'bbox' key in place.
boxed_empty_annos = [
    a for a in cct_json['annotations']
    if a['category_id'] == 0 and 'bbox' in a
]
for a in boxed_empty_annos:
    del a['bbox']

# How many bounding boxes were removed by this run of the cell.
num_deleted = len(boxed_empty_annos)
num_deleted

114894

In [19]:
%%time

# The same (already cleaned) CCT JSON is passed as both the image DB and the
# bbox DB, because this dataset has no separate bbox label file.
embedded = make_cct_embedded(image_db=cct_json, bbox_db=cct_json)

Loading image DB...
Number of items from the image DB: 245529
Number of images with more than 1 species: 14808 (6.03% of image DB)
Loading bbox DB...
Number of images added from bbox DB entries:  0
Number of images amended:  0
Number of items in total:  245529
Number of images with more than one bounding box: 14808 (6.031059467517076% of all entries)
CPU times: user 19 s, sys: 152 ms, total: 19.1 s
Wall time: 19.3 s


Because we did not have a separate bbox CCT json with the bbox coarse categories, we map the species (e.g. rodent, fox) onto the four bbox categories ('animal', 'person', 'vehicle', 'group'). Only 'animal' and 'person' are assigned for this dataset.

Also remove the 'id' field from the image entries, and for each image reduce 'species' (which becomes 'class' later) to a list of unique species.

In [20]:
# Normalise every embedded image entry in place:
#   * drop the CCT image 'id' field,
#   * reduce 'species' to a list of unique labels,
#   * map each box's species-level category to a coarse bbox category.
# The cell is written to be safe to re-run: running it twice leaves the data
# exactly as a single run would.
for entry in embedded:
    # pop() instead of del: on a second run the key is already gone, and del
    # would raise KeyError.
    entry.pop('id', None)

    entry['annotations']['species'] = list(set(entry['annotations']['species']))

    for box in entry['annotations']['bbox']:
        category = box['category']
        # "empty" annotations had their boxes removed earlier, so none may remain.
        assert category != 'empty'

        # 'person' is accepted alongside 'human' so that boxes already mapped
        # by a previous run are not relabelled as 'animal'.
        if category in ('human', 'person'):
            box['category'] = 'person'
        else:
            box['category'] = 'animal' # there were only 5 animal categories: fox, skunk, rodent, bird, other
    

In [24]:
# Spot-check three randomly chosen entries. No random seed is set in the cells
# shown here, so the selection is expected to differ from run to run.
sample(embedded, 3)

[{'file_name': 'loc-h500hh06211646/008/694.jpg',
  'seq_id': 'b741c13f-8ca0-4705-b4d0-9bd6cff03eae',
  'seq_num_frames': 20,
  'frame_num': 4,
  'original_relative_path': '2016_02_Set/RECONYX_N/100RECNX/IMG_8545.JPG',
  'location': 'h500hh06211646',
  'temperature': '20 c',
  'annotations': {'species': ['rodent'],
   'bbox': [{'category': 'animal', 'bbox_rel': [0.623, 0.546, 0.0698, 0.115]},
    {'category': 'animal', 'bbox_rel': [0.859, 0.252, 0.0898, 0.186]}]}},
 {'file_name': 'loc-h600hi07237925/008/662.jpg',
  'seq_id': '4b7d1d52-1d77-4556-a0b9-2db6f4cfa13b',
  'seq_num_frames': 3,
  'frame_num': 2,
  'original_relative_path': 'Santa_Rosa/IMG_1537.JPG',
  'location': 'h600hi07237925',
  'temperature': '10 c',
  'annotations': {'species': ['empty'], 'bbox': []}},
 {'file_name': 'loc-h500hh07215885/000/668.jpg',
  'seq_id': '899d0680-a20c-48d0-b8c9-0ffa8b58454c',
  'seq_num_frames': 2,
  'frame_num': 0,
  'original_relative_path': '2014_11B_RRSet3/NOV21_2014_RR_16_SD3/2014/2014-12-25

In [25]:
# Partition the entries: any image whose species labels include 'human' goes
# to the private set, every other image goes to the public set.
embedded_human, embedded_others = [], []

for entry in embedded:
    has_human = 'human' in entry['annotations']['species']
    bucket = embedded_human if has_human else embedded_others
    bucket.append(entry)

len(embedded_human) # 5071 in channel-island-private container
len(embedded_others)

5071

240458

## channel_islands_tnc

In [26]:
# Output location and dataset name for the public (non-human) split.
path_to_output = (
    '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc.json'
)
dataset_name = 'channel_islands_tnc'

In [27]:
%%time

# Group the non-human image entries into sequence items for the public dataset.
sequences = process_sequences(embedded_others, dataset_name)

The dataset_name is set to channel_islands_tnc. Please make sure this is correct!
Making a deep copy of docs...


 39%|███▉      | 93890/240458 [00:00<00:00, 469391.13it/s]

Putting 240458 images into sequences...


100%|██████████| 240458/240458 [00:00<00:00, 443589.69it/s]


Number of sequences: 50309
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'class', 'file', 'location', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}

img_level_properties
{'class', 'file', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{"dataset": "channel_islands_tnc", "seq_id": "836f6487-50fd-42f5-8dcc-336fc538b7a8", "location": "h500ee07133376", "images": [{"frame_num": 0, "original_relative_path": "2011_09_Set/Station%201/2011/2011-09-13/IMG_0001.JPG", "temperature": "21 c", "file": "loc-h500ee07133376/000/000.jpg", "class": ["empty"], "bbox": []}, {"frame_num": 1, "original_relative_path": "2011_09_Set/Station%201/2011/2011-09-13/IMG_0002.JPG", "temperature": "21 c", "file": "loc-h500ee07133376/000/001.jpg", "class": ["empty"], "bbox": []}, {"frame_num"

In [28]:
%%time

# Validate the sequence items against the schema before they are written to disk.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 27.2 s, sys: 96.8 ms, total: 27.3 s
Wall time: 27.4 s


In [29]:
# Serialise the sequence items to the output file as UTF-8, leaving non-ASCII
# characters unescaped.
with open(path_to_output, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False)

## channel_islands_tnc_private

In [39]:
# Output location and dataset name for the private (human-containing) split.
path_to_output = (
    '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc_private.json'
)
dataset_name = 'channel_islands_tnc_private'

In [40]:
%%time

# Group the human-containing image entries into sequence items for the private dataset.
# NOTE: this rebinds `sequences`, replacing the channel_islands_tnc result built earlier.
sequences = process_sequences(embedded_human, dataset_name)

100%|██████████| 5071/5071 [00:00<00:00, 332073.62it/s]

The dataset_name is set to channel_islands_tnc_private. Please make sure this is correct!
Making a deep copy of docs...
Putting 5071 images into sequences...
Number of sequences: 768
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'class', 'file', 'location', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}

img_level_properties
{'class', 'file', 'original_relative_path', 'temperature', 'frame_num', 'bbox'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{"dataset": "channel_islands_tnc_private", "seq_id": "836f6487-50fd-42f5-8dcc-336fc538b7a8", "location": "h500ee07133376", "images": [{"frame_num": 3, "original_relative_path": "2011_09_Set/Station%201/2011/2011-09-13/IMG_0004.JPG", "temperature": "21 c", "file": "loc-h500ee07133376/000/003.jpg", "class": ["human"], "bbox": [{"category": "person", "bbox": [0, 0.608, 0.999, 




In [41]:
%%time

# Validate the private dataset's sequence items against the schema before writing them out.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 796 ms, sys: 5.59 ms, total: 802 ms
Wall time: 804 ms


In [42]:
# Serialise the private dataset's sequence items to the output file as UTF-8,
# leaving non-ASCII characters unescaped.
with open(path_to_output, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False)