In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo importable. The membership check keeps re-runs of
# this cell from piling duplicate entries onto sys.path.
CAMERATRAPS_REPO = '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps'
if CAMERATRAPS_REPO not in sys.path:
    sys.path.append(CAMERATRAPS_REPO)

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

In [4]:
# Directories holding the bounding-box annotation files for batch 10 and batch 11.
_anno_root = '/Users/siyuyang/Source/temp_data/CameraTrap/annotations/201910'

anno_batch_10 = _anno_root + '/batch10'
anno_batch_11 = _anno_root + '/MsWLB_imerit_batch_11_18sep2019'

For some (especially the first few) datasets, the sections "Make into MegaDB format" and "Add bounding boxes" are run after loading the database and spot checking.

## awc_190430

In [4]:
# Load the embedded awc_190430 database (a list of per-image entries).
# The encoding is pinned because open() otherwise uses the locale default,
# while JSON text is expected to be UTF-8.
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/awc_190430_20190816_embedded.json', encoding='utf-8') as f:
    awc_embedded = json.load(f)

In [5]:
len(awc_embedded)

159273

In [6]:
awc_embedded[100]

{'width': 2048,
 'file_name': 'WetTropics/2018/Spurgeon Bettong/QWT-S-TSCAM087 (WF269-269)/IMG_0805.JPG',
 'image_id': '102328',
 'dataset': 'awc_190430',
 'height': 1536,
 'location': 'QWT-S-TSCAM087 (WF269-269)',
 'annotations': {'species': ['empty']}}

In [8]:
# Tally the number of images per camera location.
awc_locs = Counter(entry['location'] for entry in awc_embedded)

len(awc_locs)

193

In [None]:
# some species arrays had two "empty" labels

In [9]:
# Collapse duplicate species labels (some arrays listed "empty" twice).
# dict.fromkeys keeps the first-seen order; list(set(...)) produced an order
# that can differ between interpreter runs because str hashing is randomized.
for e in awc_embedded:
    if 'annotations' in e:
        e['annotations']['species'] = list(dict.fromkeys(e['annotations']['species']))

## bellevue_190602

In [26]:
# Load the embedded bellevue_190602 database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/bellevue_190602_20190812_embedded.json', encoding='utf-8') as f:
    bellevue_embedded = json.load(f)

In [27]:
len(bellevue_embedded)

1848

In [28]:
bellevue_embedded[100]

{'image_id': 'd3695634-a9a9-11e9-a392-5cf370671a19',
 'file_name': '2019.01.20/newcam/empty/DSCF0057.JPG',
 'annotations': {'species': ['empty']},
 'dataset': 'bellevue_190602'}

## idfg

In [5]:
# Load the embedded idfg database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/idfg_20190807_embedded.json', encoding='utf-8') as f:
    idfg_embedded = json.load(f)

In [6]:
len(idfg_embedded)

693870

In [7]:
sample(idfg_embedded, 1)

[{'image_id': 'Beaverhead_elk~AM206~Trip 1~100RECNX~2016-03-04 15-45-00 T',
  'file_name': 'Beaverhead_elk/AM206/Trip 1/100RECNX/2016-03-04 15-45-00 T.JPG',
  'dataset': 'idfg',
  'annotations': {'species': ['empty']},
  'datetime': '04-Mar-2016 15:45:00',
  'location': 'Beaverhead_elk+AM206'}]

In [12]:
# Split out the entries that lack labels: those with no 'annotations' key at
# all, and those that have annotations but no 'species' inside them.
no_annotation, no_species = [], []
for entry in idfg_embedded:
    if 'annotations' not in entry:
        no_annotation.append(entry)
    elif 'species' not in entry['annotations']:
        no_species.append(entry)

In [14]:
len(no_annotation)

6224

In [15]:
no_annotation[100]

{'height': 1536,
 'file_name': 'CrowCreek_moose/805/100RECNX/IMG_4593.JPG',
 'dataset': 'idfg',
 'width': 2048,
 'image_id': 'CrowCreek_moose~805~100RECNX~IMG_4593',
 'location': 'CrowCreek_moose+805'}

In [13]:
len(no_species)

8776

In [17]:
no_species[100]

{'height': 1536,
 'file_name': 'Focal_lion/53/101RECNX/IMG_0510.JPG',
 'dataset': 'idfg',
 'annotations': {'bbox': [{'category': 'animal',
    'bbox_rel': [0.296, 0.601, 0.208, 0.376],
    'bbox_abs': [607.5616438356172,
     923.1780821917747,
     426.08219178082305,
     578.6301369862963]}]},
 'width': 2048,
 'image_id': 'Focal_lion~53~101RECNX~IMG_0510',
 'location': 'Focal_lion+53'}

In [18]:
# Give every idfg entry a species label, inferring one from the bounding boxes
# when the original label is missing.
_bbox_category_to_species = {'animal': 'unidentified', 'person': 'human'}

for e in idfg_embedded:
    annotations = e.setdefault('annotations', {})
    if 'species' in annotations:
        continue

    if 'bbox' not in annotations:
        # Nothing to infer from. This also covers entries that have an
        # annotations dict with neither species nor bbox, which previously
        # fell through and were left without any species key.
        species = '__label_unavailable'
    elif len(annotations['bbox']) == 0:
        # Annotated, but no boxes were drawn.
        species = 'empty'
    else:
        # Only the first box's category is consulted.
        first_category = annotations['bbox'][0]['category']
        species = _bbox_category_to_species.get(first_category, '__label_unavailable')

    annotations['species'] = [species]

In [19]:
# Images from Trip 2 of a location for which vehicle annotations were received
# but which, it seems, are not in the CCT; listed here so they can be added manually.
_missing_prefix = 'Beaverhead_elk~AM25~Trip 2~100RECNX~'

idfg_missing = [_missing_prefix + suffix for suffix in (
    '2016-03-12 15-01-50 M 5_5.JPG',
    '2016-02-19 09-31-54 M 3_5.JPG',
    '2016-02-19 09-31-55 M 4_5.JPG',
    '2016-03-14 12-40-00 T.JPG',
    '2016-03-12 15-01-47 M 2_5.JPG',
    '2016-02-19 09-31-53 M 1_5.JPG',
    '2016-02-19 09-31-56 M 5_5.JPG',
    '2016-03-12 15-01-52 M 2_5.JPG',
    '2016-03-12 15-01-48 M 3_5.JPG',
    '2016-03-12 15-01-51 M 1_5.JPG',
    '2016-03-14 12-40-00 M 3_5.JPG',
    '2016-03-12 15-01-46 M 1_5.JPG',
    '2016-03-12 15-01-49 M 4_5.JPG',
    '2016-03-14 12-39-59 M 2_5.JPG',
    '2016-02-19 09-31-53 M 2_5.JPG',
    '2016-03-14 12-39-58 M 1_5.JPG',
)]

In [20]:
# Build one database entry per missing file name. The image_id is the name
# without its '.JPG' suffix; the file path is the name with '~' turned into '/'.
idfg_missing_items = [
    {
        'image_id': name.split('.JPG')[0],
        'file_name': name.replace('~', '/'),
        'dataset': 'idfg',
        'annotations': {'species': ['__label_unavailable']},
        'location': 'Beaverhead_elk+AM25',
    }
    for name in idfg_missing
]

In [21]:
len(idfg_embedded)
# Append only the items whose image_id is not already in the database, so that
# re-running this cell cannot add the same images a second time.
known_ids = {e.get('image_id') for e in idfg_embedded}
idfg_embedded.extend(item for item in idfg_missing_items if item['image_id'] not in known_ids)
len(idfg_embedded)

693870

693886

In [24]:
sample(idfg_embedded, 10)

[{'height': 1536,
  'image_id': 'Beaverhead_elk~AM196~Trip 1~100RECNX~2016-01-06 10-35-00 T',
  'file_name': 'Beaverhead_elk/AM196/Trip 1/100RECNX/2016-01-06 10-35-00 T.JPG',
  'width': 2048,
  'dataset': 'idfg',
  'annotations': {'species': ['deer'],
   'bbox': [{'category': 'animal',
     'bbox_rel': [0.0198, 0.602, 0.0305, 0.0491],
     'bbox_abs': [40.62119711261901,
      924.9134111796326,
      62.49414940402893,
      75.43936606629273]},
    {'category': 'animal',
     'bbox_rel': [0.53, 0.432, 0.0854, 0.055],
     'bbox_abs': [1087.0,
      664.5103759765555,
      174.9486083984384,
      84.48962402343781]}]},
  'datetime': '06-Jan-2016 10:35:00',
  'location': 'Beaverhead_elk+AM196'},
 {'image_id': 'Beaverhead_elk~AM37~Trip 1~100RECNX~2016-01-31 12-20-00 T',
  'file_name': 'Beaverhead_elk/AM37/Trip 1/100RECNX/2016-01-31 12-20-00 T.JPG',
  'dataset': 'idfg',
  'annotations': {'species': ['empty']},
  'datetime': '31-Jan-2016 12:20:00',
  'location': 'Beaverhead_elk+AM37'},


## nacti

In [5]:
# Load the embedded nacti database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/nacti_20190819_embedded.json', encoding='utf-8') as f:
    nacti_embedded = json.load(f)

In [6]:
len(nacti_embedded)

3382215

In [7]:
sample(nacti_embedded, 2)

[{'height': 1536,
  'width': 2048,
  'annotations': {'species': ['domestic cow']},
  'image_id': 'FL-07_06_29_2015_FL-07_0042512',
  'location': 'Archbold, FL',
  'study': 'FL',
  'dataset': 'nacti',
  'file_name': 'part1/sub147/FL-07_06_29_2015_FL-07_0042512.jpg'},
 {'height': 1536,
  'width': 2048,
  'annotations': {'species': ['red deer']},
  'image_id': '2014_Unit70_Ivan053_img0337',
  'location': 'San Juan Mntns, Colorado',
  'study': 'CPW',
  'dataset': 'nacti',
  'file_name': 'part0/sub015/2014_Unit70_Ivan053_img0337.jpg'}]

In [25]:
# Count how often each species label occurs across all nacti entries.
species_counter = Counter(
    label
    for entry in nacti_embedded
    for label in entry['annotations']['species']
)

In [8]:
# Tally the number of images per location value.
nacti_locs = Counter(entry['location'] for entry in nacti_embedded)

len(nacti_locs)

4

In [9]:
nacti_locs

Counter({'Archbold, FL': 1802622,
         'Lebec, California': 780549,
         '': 469356,
         'San Juan Mntns, Colorado': 329688})

In [30]:
# Entries with a blank location take the part of their image_id that precedes
# the first '-' as their location.
# Counts noted by the author for this step: Counter({'CA': 261600, 'FL': 207756})

for entry in nacti_embedded:
    if entry['location'] != '':
        continue
    entry['location'] = entry['image_id'].split('-')[0]

## rspb_gola_20190409

In [25]:
# Load the embedded rspb_gola database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/rspb_gola_20190409_embedded.json', encoding='utf-8') as f:
    gola_embedded = json.load(f)

In [26]:
len(gola_embedded)

62835

In [27]:
sample(gola_embedded, 1)

[{'location': 'P0440__C11',
  'file_name': 'P0440/C11/P0440__C11__2012-05-26__10-16-28(6).JPG',
  'datetime': '26/05/2012 10:16',
  'image_id': 'P0440__C11__2012-05-26__10-16-28(6)',
  'dataset': 'rspb_gola',
  'annotations': {'species': ['empty']},
  'height': 1536,
  'width': 2048}]

Add sequence info to the RSPB database using info in the file names. 

In [28]:
for e in gola_embedded:
    # image_id has the form '<sequence name>(<frame number>)': the text before
    # the first '(' is the sequence ID, the number inside the parentheses is
    # the frame number.
    pieces = e['image_id'].split('(')
    frame_num = int(pieces[1].split(')')[0])
    e.update(seq_id=pieces[0], frame_num=frame_num)

In [29]:
# Entries without a species label get the placeholder label.
for e in gola_embedded:
    e['annotations'].setdefault('species', ['__label_unavailable'])

## wcs_20190817

In [5]:
# Load the embedded wcs database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/wcs_20190817_embedded.json', encoding='utf-8') as f:
    wcs_embedded = json.load(f)

In [6]:
len(wcs_embedded)

1207490

In [7]:
sample(wcs_embedded, 1)

[{'datetime': '2008-02-19 13:09:05.000',
  'seq_id': 'ken-007-d0037-1040',
  'seq_num_frames': 33,
  'frame_num': 2,
  'match_level': 1,
  'file_name': 'animals/0200/0530.jpg',
  'width': 1920,
  'image_id': '0fb7c9ee-92d5-11e9-a2bf-000d3a74c7de',
  'location': '1256',
  'corrupt': False,
  'dataset': 'wcs',
  'annotations': {'sex': 'unknown',
   'species': ['empty'],
   'count': -1,
   'age': 'unknown'},
  'wcs_id': 'ken-007-d0037-i012594',
  'country_code': 'ken',
  'height': 1080}]

In [12]:
# Tally the number of images per location value.
wcs_locations = Counter(entry['location'] for entry in wcs_embedded)

In [15]:
wcs_locations['unknown']

2191

In [16]:
# Entries whose sequence ID is the literal placeholder 'unknown'.
seqs_unknown = [entry for entry in wcs_embedded if entry['seq_id'] == 'unknown']

In [17]:
len(seqs_unknown)

242053

In [20]:
seqs_unknown[-1]

{'datetime': '2013-07-12 04:07:43.000',
 'seq_id': 'unknown',
 'seq_num_frames': -1,
 'frame_num': -1,
 'match_level': 4,
 'file_name': 'animals/0560/0614.jpg',
 'width': 1920,
 'image_id': '9475a0e7-92d5-11e9-9bd6-000d3a74c7de',
 'location': '4342',
 'corrupt': False,
 'dataset': 'wcs',
 'annotations': {'sex': 'male',
  'species': ['leopardus pardalis'],
  'count': 1,
  'age': 'unknown'},
 'wcs_id': 'gtm-003-d0045-i069662',
 'country_code': 'gtm',
 'height': 1080}

Delete the sequence properties of these entries so that a dummy `seq_id` can be created for them.

In [21]:
# Remove the placeholder sequence fields from entries whose sequence is unknown.
# .get() keeps the cell re-runnable: once 'seq_id' has been deleted, a plain
# e['seq_id'] lookup on the same entry raises KeyError.
for e in wcs_embedded:
    if e.get('seq_id') == 'unknown':
        for key in ('seq_id', 'seq_num_frames', 'frame_num'):
            e.pop(key, None)

In [38]:
# Entries flagged as corrupt in the database.
corrupt_ims = [entry for entry in wcs_embedded if entry['corrupt']]

len(corrupt_ims)

78

In [42]:
# Entries whose file name does not end in a lowercase '.jpg'.
non_jpg = [entry for entry in wcs_embedded if not entry['file_name'].endswith('.jpg')]
len(non_jpg)

141

Added `.bmp`, `.avi`, `.tif` as allowed endings for `file` in the schema

Some `datetime` values are NaN - convert these to strings so the schema check can pass.

In [35]:
# Stringify any datetime value that is not already a str.
for e in wcs_embedded:
    stamp = e['datetime']
    if type(stamp) is str:
        continue
    e['datetime'] = str(stamp)

In [51]:
# Entries carrying the spreadsheet error marker '#ref!' as a species label.
weird_class_label = [
    entry for entry in wcs_embedded
    if '#ref!' in entry['annotations']['species']
]
len(weird_class_label)

38

These appear to be cars

In [52]:
# Replace the whole species list with ['vehicle'] wherever '#ref!' appears in it.
for e in wcs_embedded:
    labels = e['annotations']
    if '#ref!' not in labels['species']:
        continue
    labels['species'] = ['vehicle']

## wcs_private

In [60]:
# Load the embedded wcs_private database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/wcs_private_20190819_embedded.json', encoding='utf-8') as f:
    wcs_private_embedded = json.load(f)

In [61]:
len(wcs_private_embedded)

162501

In [63]:
# Remove the placeholder sequence fields from entries whose sequence is unknown.
# .get() keeps the cell re-runnable: once 'seq_id' has been deleted, a plain
# e['seq_id'] lookup on the same entry raises KeyError.
for e in wcs_private_embedded:
    if e.get('seq_id') == 'unknown':
        for key in ('seq_id', 'seq_num_frames', 'frame_num'):
            e.pop(key, None)

In [66]:
# Stringify any datetime value that is not already a str.
for e in wcs_private_embedded:
    stamp = e['datetime']
    if type(stamp) is str:
        continue
    e['datetime'] = str(stamp)

## wiitigers

In [5]:
# Load the embedded wiitigers database.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/wiitigers_20190815_embedded.json', encoding='utf-8') as f:
    wii_embedded = json.load(f)

In [6]:
len(wii_embedded)
sample(wii_embedded, 1)

1125938

[{'annotations': {'species': ['human']},
  'file_name': 'raw_camtrap/TATR DATA_2014,15,16,17/2018/block_1/73/A/tourist/I__01068.JPG',
  'width': 2576,
  'dataset': 'wiitigers',
  'height': 1984,
  'image_id': 'raw_camtrap~TATR DATA_2014,15,16,17~2018~block_1~73~A~tourist~I__01068'}]

In [15]:
# Manually fix the one image whose extension is misspelled '.JFG'.
# Only names that really end in '.JFG' are rewritten. The earlier condition
# (any name not ending in .JPG/.jpg) would also have appended '.JPG' to a file
# with some other extension.
for i in wii_embedded:
    if i['file_name'].endswith('.JFG'):
        i['file_name'] = i['file_name'][:-len('.JFG')] + '.JPG'

**The following datasets that were annotated in batch 10 did not have their embedded CCT compiled... Some do not yet have a database json.**

## trailguard_night_mara_190515

In [7]:
dataset_name = 'trailguard_night_mara_190515'
annotation_path = anno_batch_10

In [8]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  4%|▎         | 6560/177122 [00:00<00:05, 32792.29it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:05<00:00, 31801.70it/s]


54 boxes on 63 images were in the annotation file(s). 215715 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 1
person: 53


In [13]:
# checked that this label should be person, not animal
# Manual override: relabel the first box of this one image so that no 'animal'
# box remains where a person was annotated.

image_filename_to_bboxes['tgv3_01003.jpg'][0]['category'] = 'person'

In [18]:
# One single-image dummy sequence per annotated file. An image with at least
# one box is labelled 'person'; an image with no boxes is labelled 'empty'.
sequences = [
    {
        'dataset': 'trailguard_night_mara_190515',
        'seq_id': 'dummy_' + image_filename.split('.jpg')[0],
        'images': [
            {
                'file': image_filename,  # flat layout, so the name contains no '/' separator
                'class': ['empty' if len(bbox) == 0 else 'person'],
            }
        ],
    }
    for image_filename, bbox in image_filename_to_bboxes.items()
]

In [20]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: trailguard_night_mara_190515. The bboxes should also be for this set.
63 images updated; 0 images had their bbox overwritten; 0 images not updated


In [22]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [23]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/trailguard_night_mara_190515_w_batch_10_boxes.json', sequences)

## islandconservation_190705

In [5]:
dataset_name = 'islandconservation_190705'
annotation_path = anno_batch_10

In [6]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  2%|▏         | 2901/177122 [00:00<00:12, 14410.80it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:08<00:00, 21980.72it/s]

10806 boxes on 30000 images were in the annotation file(s). 204963 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 8695
person: 2102
vehicle: 9





In [8]:
9 + 2102 + 8695  # about two-thirds were empty

10806

In [37]:
# One single-image dummy sequence per annotated file, numbered in iteration order.
sequences = []
count = 0
for image_filename, bbox in image_filename_to_bboxes.items():
    # The image-level class comes from the first box only. 'animal' boxes carry
    # no species, so they are recorded as 'unidentified'.
    # NOTE(review): images whose boxes span several categories keep just the
    # first one - confirm this is intended.
    if len(bbox) == 0:
        im_cat = 'empty'
    elif bbox[0]['category'] == 'animal':
        im_cat = 'unidentified'
    else:
        im_cat = bbox[0]['category']

    image_entry = {
        'file': image_filename.replace('~', '/'),
        'class': [im_cat],
    }
    sequences.append({
        'dataset': 'islandconservation_190705',
        'seq_id': 'dummy_{}_{}'.format(dataset_name, count),
        'images': [image_entry],
    })
    count += 1

In [38]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: islandconservation_190705. The bboxes should also be for this set.
30000 images updated; 0 images had their bbox overwritten; 0 images not updated


In [39]:
sample(sequences, 1)

[{'dataset': 'islandconservation_190705',
  'seq_id': 'dummy_islandconservation_190705_1268',
  'images': [{'file': 'Ngeruktabel Camera Study/Cam07a/CAM07A083118/Folder 1/RCNX0022.JPG',
    'class': ['person'],
    'bbox': [{'category': 'person',
      'bbox': [0.08307, 0.02226, 0.915, 0.791]}]}]}]

In [40]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [41]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/islandconservation_190705_w_batch_10_boxes.json', sequences)

## parkscanada_20190715

In [5]:
dataset_name = 'parkscanada_20190715'
annotation_path = anno_batch_10

In [6]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  4%|▍         | 6775/177122 [00:00<00:05, 33866.19it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:05<00:00, 30678.99it/s]


22 boxes on 22 images were in the annotation file(s). 215747 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
person: 5
vehicle: 17


In [8]:
# One single-image dummy sequence per annotated file, numbered in iteration order.
sequences = []
count = 0
for image_filename, bbox in image_filename_to_bboxes.items():
    # Class is derived from the first box only; 'animal' becomes 'unidentified'.
    if len(bbox) == 0:
        im_cat = 'empty'
    elif bbox[0]['category'] == 'animal':
        im_cat = 'unidentified'
    else:
        im_cat = bbox[0]['category']

    image_entry = {
        'file': image_filename.replace('~', '/'),
        'class': [im_cat],
    }
    sequences.append({
        'dataset': 'parkscanada_20190715',
        'seq_id': 'dummy_parkscanada_20190715_cars_{}'.format(count),
        'images': [image_entry],
    })
    count += 1

In [10]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: parkscanada_20190715. The bboxes should also be for this set.
22 images updated; 0 images had their bbox overwritten; 0 images not updated


In [11]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [12]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/parkscanada_20190715_w_batch_10_boxes.json', sequences)

## bnf_20190624and0815

The BNF data is too messy to put into sequences or locations.

In [5]:
# Load the BNF database in CCT format.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/BNF/BNF_20190624and0815_20190826.json', encoding='utf-8') as f:
    bnf_cct = json.load(f)

In [7]:
bnf_embedded = make_cct_embedded(bnf_cct)  # did not have any bboxes

Loading image DB...
Number of items from the image DB: 25000
Number of images with more than 1 species: 0 (0.0% of image DB)
No bbox DB provided.


In [18]:
len(bnf_embedded)  # only the ones sampled to get bboxes were labeled

25000

In [19]:
for e in bnf_embedded:
    # Rename id -> image_id. The membership check lets the cell be re-run
    # without a KeyError on entries that were already renamed.
    if 'id' in e:
        e['image_id'] = e.pop('id')

    # Add the placeholder label where none exists, without discarding any
    # other annotation fields the entry may already have.
    e.setdefault('annotations', {}).setdefault('species', ['__label_unavailable'])

In [21]:
sample(bnf_embedded, 2)

[{'file_name': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2011/Camera trap locations 2011/KM 4 x Railway/Km4 Railway 19-11-11/Cam 2011-5/CDY_0071.JPG',
  'image_id': 15551,
  'annotations': {'species': ['__label_unavailable']}},
 {'file_name': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2010/Best of 2010/Best of Agustus/Km 3 Macaque 12-07-2010 (3).JPG',
  'annotations': {'species': ['macaque']},
  'image_id': 14204}]

## sulross_kitfox

In [5]:
dataset_name = 'sulross_kitfox'
annotation_path = anno_batch_10

In [6]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  4%|▍         | 7041/177122 [00:00<00:04, 35191.99it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:07<00:00, 22213.96it/s]

72 boxes on 39 images were in the annotation file(s). 215697 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 5
person: 29
vehicle: 38





In [8]:
# One single-image dummy sequence per annotated file, numbered in iteration order.
sequences = []
count = 0
for image_filename, bbox in image_filename_to_bboxes.items():
    # Class is derived from the first box only; 'animal' becomes 'unidentified'.
    # NOTE(review): images with boxes of several categories keep just the
    # first one - confirm this is intended.
    if len(bbox) == 0:
        im_cat = 'empty'
    elif bbox[0]['category'] == 'animal':
        im_cat = 'unidentified'
    else:
        im_cat = bbox[0]['category']

    image_entry = {
        'file': image_filename.replace('~', '/'),
        'class': [im_cat],
    }
    sequences.append({
        'dataset': 'sulross_kitfox',
        'seq_id': 'dummy_sulross_kitfox_cars_{}'.format(count),
        'images': [image_entry],
    })
    count += 1

In [10]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: sulross_kitfox. The bboxes should also be for this set.
39 images updated; 0 images had their bbox overwritten; 0 images not updated


In [11]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [12]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/sulross_kitfox_w_batch_10_boxes.json', sequences)

## nacti_private

In [5]:
dataset_name = 'nacti_private'
annotation_path = anno_batch_10

In [6]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  3%|▎         | 6149/177122 [00:00<00:05, 30737.74it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:05<00:00, 29838.96it/s]

1083 boxes on 1000 images were in the annotation file(s). 214686 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 63
group: 1
person: 842
vehicle: 177





In [9]:
list(image_filename_to_bboxes.items())[100]

('FL-11_06_01_2015_FL-11_0000039.jpg',
 [{'category': 'person', 'bbox': [0.3995, 0.02225, 0.1825, 0.8259]}])

In [10]:
# One single-image dummy sequence per annotated file, numbered in iteration order.
sequences = []
count = 0
for image_filename, bbox in image_filename_to_bboxes.items():
    # Class is derived from the first box only; 'animal' becomes 'unidentified'.
    # Any other category string is used as the class unchanged.
    if len(bbox) == 0:
        im_cat = 'empty'
    elif bbox[0]['category'] == 'animal':
        im_cat = 'unidentified'
    else:
        im_cat = bbox[0]['category']

    image_entry = {
        'file': image_filename.replace('~', '/'),
        'class': [im_cat],
    }
    sequences.append({
        'dataset': 'nacti_private',
        'seq_id': 'dummy_nacti_private_{}'.format(count),
        'images': [image_entry],
    })
    count += 1

In [11]:
sample(sequences, 2)

[{'dataset': 'nacti_private',
  'seq_id': 'dummy_nacti_private_258',
  'images': [{'file': '2015_Unit097_Ivan076_img1325.jpg',
    'class': ['person']}]},
 {'dataset': 'nacti_private',
  'seq_id': 'dummy_nacti_private_753',
  'images': [{'file': 'CA-45_10_07_2015_CA-45_0011459.jpg',
    'class': ['person']}]}]

In [12]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: nacti_private. The bboxes should also be for this set.
1000 images updated; 0 images had their bbox overwritten; 0 images not updated


In [13]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [14]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/nacti_private_w_batch_10_boxes.json', sequences)

## caltech

In [5]:
# Caltech image DB (the copy with the unidentified_animal problem fixed) and
# the matching bbox DB, both stored in the same directory.
_caltech_db_dir = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/databases_caltech_190918'

im_db_path = _caltech_db_dir + '/caltech_images_20190919.json'
bbox_db_path = _caltech_db_dir + '/caltech_bboxes_20190904.json'

In [6]:
# Load the Caltech image DB and bbox DB.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open(im_db_path, encoding='utf-8') as f:
    caltech_im_db = json.load(f)
with open(bbox_db_path, encoding='utf-8') as f:
    caltech_bbox_db = json.load(f)

In [13]:
# Drop the train_val / test directory prefix from the file_name field in the
# image DB - these folders are not in the unzipped version of the images.
# Keeping the last path component (rather than index 1 of the split) leaves a
# name that is already bare untouched, so the cell can be re-run without an
# IndexError.

for i in caltech_im_db['images']:
    i['file_name'] = i['file_name'].rsplit('/', 1)[-1]

In [15]:
caltech_embedded = make_cct_embedded(image_db=caltech_im_db, bbox_db=caltech_bbox_db)

Loading image DB...
Number of items from the image DB: 243100
Number of images with more than 1 species: 1569 (0.65% of image DB)
Loading bbox DB...
Number of images added from bbox DB entries:  1080
Number of images amended:  0
Number of items in total:  244180
Number of images with more than one bounding box: 2571 (1.0529117863870916% of all entries)


In [18]:
# In the unzipped version both train_val and test are stored in one folder...
# Count species labels, and collect the entries that have no species at all.
caltech_species = Counter()
unlabeled = []

for e in caltech_embedded:
    if 'annotations' not in e or 'species' not in e['annotations']:
        unlabeled.append(e)
        continue
    caltech_species.update(e['annotations']['species'])
len(unlabeled)

1080

In [19]:
unlabeled[100]

{'date_captured': '2013-02-15 09:32:53',
 'file_name': '59e441b7-23d2-11e8-a6a3-ec086b02610b.jpg',
 'seq_id': '70136b4c-5567-11e8-8d07-dca9047ef277',
 'seq_num_frames': 3,
 'rights_holder': 'Justin Brown',
 'location': '61',
 'frame_num': 1,
 'id': '59e441b7-23d2-11e8-a6a3-ec086b02610b',
 'annotations': {'bbox': [{'category': 'animal',
    'bbox': [0.686, 0.478, 0.0444, 0.0596]}]}}

In [17]:
caltech_species

Counter({'empty': 125745,
         'deer': 12196,
         'cat': 5297,
         'opossum': 16698,
         'car': 4717,
         'rabbit': 12519,
         'dog': 4512,
         'raccoon': 11390,
         'rodent': 4279,
         'coyote': 16601,
         'skunk': 1898,
         'fox': 2574,
         'bird': 10003,
         'squirrel': 4450,
         'bobcat': 8098,
         'lizard': 309,
         'badger': 50,
         'mountain_lion': 145,
         'pig': 2,
         'insect': 6,
         'bat': 3,
         'cow': 3626})

In [22]:
new_species = Counter()
for e in caltech_embedded:
    annotations = e['annotations']

    # Entries without a species label get the placeholder; then recount.
    labels = annotations.setdefault('species', ['__label_unavailable'])
    new_species.update(labels)

    # The boxes in this copy are already in relative coordinates, so the same
    # list is also exposed under 'bbox_rel'.
    for box in annotations.get('bbox', []):
        box['bbox_rel'] = box['bbox']

In [96]:
new_species

Counter({'empty': 125745,
         'deer': 12196,
         'cat': 5297,
         'opossum': 16698,
         'car': 4717,
         'rabbit': 12519,
         'dog': 4512,
         'raccoon': 11390,
         'rodent': 4279,
         'coyote': 16601,
         'skunk': 1898,
         'fox': 2574,
         'bird': 10003,
         'squirrel': 4450,
         'bobcat': 8098,
         'lizard': 309,
         'badger': 50,
         'mountain_lion': 145,
         'pig': 2,
         'insect': 6,
         'bat': 3,
         'cow': 3626,
         '__label_unavailable': 1080})

Some location fields are str - these were the ones added from the bbox DB.

Change `date_captured` to `datetime` which is more standard.

In [34]:
for e in caltech_embedded:
    # Some locations are strings; store every location as an int.
    e['location'] = int(e['location'])
    # Rename date_captured -> datetime. The membership check keeps the cell
    # re-runnable: after the first pass 'date_captured' no longer exists.
    if 'date_captured' in e:
        e['datetime'] = e.pop('date_captured')

In [35]:
sample(caltech_embedded, 2)

[{'seq_num_frames': 3,
  'seq_id': '6efebedc-5567-11e8-a4be-dca9047ef277',
  'height': 1494,
  'width': 2048,
  'location': 38,
  'rights_holder': 'Justin Brown',
  'file_name': '59cb3880-23d2-11e8-a6a3-ec086b02610b.jpg',
  'id': '59cb3880-23d2-11e8-a6a3-ec086b02610b',
  'frame_num': 3,
  'annotations': {'species': ['rabbit']},
  'datetime': '2011-06-23 09:18:17'},
 {'seq_num_frames': 3,
  'seq_id': '701de9e3-5567-11e8-8874-dca9047ef277',
  'height': 584,
  'width': 800,
  'location': 136,
  'rights_holder': 'Justin Brown',
  'file_name': '593a4f37-23d2-11e8-a6a3-ec086b02610b.jpg',
  'id': '593a4f37-23d2-11e8-a6a3-ec086b02610b',
  'frame_num': 3,
  'annotations': {'species': ['bird']},
  'datetime': '2014-02-10 08:03:47'}]

In [40]:
dataset_name = 'caltech'
cal_sequences = process_sequences(caltech_embedded, dataset_name)

The dataset_name is set to caltech. Please make sure this is correct!
Making a deep copy of docs...
Putting 244180 images into sequences...
Number of sequences: 181110
Checking the location field...

all_img_properties
{'datetime', 'id', 'rights_holder', 'bbox', 'file', 'class', 'location', 'frame_num'}

img_level_properties
{'datetime', 'id', 'bbox', 'file', 'class', 'frame_num'}

image-level properties that really should be sequence-level
{'rights_holder', 'location'}

Finished processing sequences.
Example sequence items:

{'seq_id': '6f2160eb-5567-11e8-990e-dca9047ef277', 'dataset': 'caltech', 'images': [{'id': '5968c0f9-23d2-11e8-a6a3-ec086b02610b', 'frame_num': 1, 'datetime': '2013-10-04 13:31:53', 'file': '5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg', 'class': ['empty']}], 'rights_holder': 'Erin Boydston', 'location': 26}

[{'seq_id': '6f88cc91-5567-11e8-9568-dca9047ef277', 'dataset': 'caltech', 'images': [{'id': '5a16417d-23d2-11e8-a6a3-ec086b02610b', 'frame_num': 1, 'datetime': '

Some `frame_num` values are not unique within a sequence.

In [42]:
# Collect the multi-image sequences in which two or more images share a frame_num.
problem_sequences = []

for seq in cal_sequences:
    # Sequences with no images, or with a single image, cannot have a clash.
    if 'images' not in seq or len(seq['images']) <= 1:
        continue

    frame_nums = []
    for im in seq['images']:
        # every image of a multi-image sequence needs a frame_num
        assert 'frame_num' in im, 'sequence {} has more than one image but not all images have frame_num'.format(seq['seq_id'])
        frame_nums.append(im['frame_num'])

    # Fewer distinct frame numbers than images means at least one duplicate.
    if len(set(frame_nums)) != len(seq['images']):
        problem_sequences.append(seq)

In [43]:
len(problem_sequences)

44

In [44]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/databases_caltech_190918/caltech_problem_seqs.json', problem_sequences)

Manually corrected using the timestamp. Some had the same timestamp - chose frame_num randomly in that case.

In [91]:
# Load the manually corrected sequences.
# Decode explicitly as UTF-8 instead of relying on the locale default of open().
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/databases_caltech_190918/caltech_corrected_seqs.json', encoding='utf-8') as f:
    corrected_seqs = json.load(f)

In [92]:
# Map each corrected sequence ID to its corrected list of images.
corrected = {s['seq_id']: s['images'] for s in corrected_seqs}

In [94]:
# Swap in the corrected images for every sequence that has a correction.
for seq in cal_sequences:
    seq_id = seq['seq_id']
    if seq_id not in corrected:
        continue
    seq['images'] = corrected[seq_id]

In [98]:
# Remove duplicate class labels from each image.
# dict.fromkeys keeps the first-seen order; list(set(...)) produced an order
# that can differ between interpreter runs because str hashing is randomized.
for seq in cal_sequences:
    for im in seq['images']:
        im['class'] = list(dict.fromkeys(im['class']))  # some dups

In [97]:
cal_sequences[0]

OrderedDict([('dataset', 'caltech'),
             ('seq_id', '6f2160eb-5567-11e8-990e-dca9047ef277'),
             ('location', 26),
             ('images',
              [{'id': '5968c0f9-23d2-11e8-a6a3-ec086b02610b',
                'frame_num': 1,
                'datetime': '2013-10-04 13:31:53',
                'file': '5968c0f9-23d2-11e8-a6a3-ec086b02610b.jpg',
                'class': ['empty']}]),
             ('rights_holder', 'Erin Boydston')])

In [99]:
sequences_schema_check.sequences_schema_check(cal_sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [101]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  1%|          | 1261/177122 [00:00<00:15, 11680.87it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:09<00:00, 19244.48it/s]

2773 boxes on 2615 images were in the annotation file(s). 212996 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 2
person: 200
vehicle: 2571





In [102]:
cal_sequences, images_updated = add_annotations_to_sequences(cal_sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: caltech. The bboxes should also be for this set.
2615 images updated; 2615 images had their bbox overwritten; 241565 images not updated


In [103]:
write_json('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/caltech_w_batch_10_boxes.json', cal_sequences)

## snapshotserengeti_private

The images sent to get annotated all had the prefix `snapshotserengeti`.

Need a separate `dataset_name` for the private set.

In [110]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/SnapshotSerengeti_v2_0.json') as f:
    all_ss = json.load(f)

In [113]:
len(all_ss['images'])
len(all_ss['annotations'])
len(all_ss['categories'])

6679039

6755090

61

In [115]:
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/databases_201908/snapshotserengeti/SnapshotSerengetiBboxes_20190903.json') as f:
    all_ss_bbox = json.load(f)

In [116]:
ss_embedded = make_cct_embedded(image_db=all_ss, bbox_db=all_ss_bbox)

Loading image DB...
Number of items from the image DB: 6679039
Number of images with more than 1 species: 75347 (1.13% of image DB)
Loading bbox DB...
Number of images added from bbox DB entries:  0
Number of images amended:  82938
Number of items in total:  6679039
Number of images with more than one bounding box: 24905 (0.37288298511207973% of all entries)


In [117]:
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/snapshotserengeti_embedded.json', 'w') as f:
    json.dump(ss_embedded, f)

Restarts here... `ss_embedded` is 3GB; `all_ss` is 5GB.

In [5]:
# Reload the embedded Snapshot Serengeti entries from the JSON file after the restart.
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/cosmos/snapshotserengeti_embedded.json') as f:
    ss_embedded = json.load(f)

In [16]:
sample(ss_embedded, 1)

[{'id': 'S3/H03/H03_R11/S3_H03_R11_IMAG4457',
  'file_name': 'S3/H03/H03_R11/S3_H03_R11_IMAG4457.JPG',
  'frame_num': 3,
  'seq_id': 'SER_S3#H03#11#1525',
  'width': 2048,
  'height': 1536,
  'corrupt': False,
  'location': 'H03',
  'seq_num_frames': 3,
  'datetime': '2011-12-27 10:31:05',
  'annotations': {'species': ['empty'],
   'season': 'S3',
   'subject_id': 'ASG0009o9z',
   'count': nan,
   'standing': nan,
   'resting': nan,
   'moving': nan,
   'interacting': nan,
   'young_present': nan}}]

In [12]:
# combine with S11 metadata
with open('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/SnapshotSerengetiS11.json') as f:
    s11_cct = json.load(f)

In [15]:
s11_embedded = make_cct_embedded(image_db=s11_cct)

Loading image DB...
Number of items from the image DB: 499401
Number of images with more than 1 species: 6727 (1.35% of image DB)
No bbox DB provided.


In [17]:
sample(s11_embedded, 1)

[{'id': 'SER_S11/G03/G03_R1/SER_S11_G03_R1_IMAG0750',
  'file_name': 'SER_S11/G03/G03_R1/SER_S11_G03_R1_IMAG0750.JPG',
  'frame_num': 3,
  'seq_id': 'SER_S11#G03#1#334',
  'width': 2560,
  'height': 1920,
  'corrupt': False,
  'location': 'G03',
  'seq_num_frames': 3,
  'datetime': '2015-08-13 11:10:20',
  'annotations': {'species': ['zebra'],
   'season': 'SER_S11',
   'subject_id': 21951579,
   'count': '2',
   'standing': 0.63,
   'resting': 0.05,
   'moving': 0.11,
   'interacting': 0.0,
   'young_present': 0.0}}]

In [18]:
# Append the S11 entries to the rest of the Snapshot Serengeti entries.
# NOTE: extends the list in place, so re-running this cell adds the S11 entries again.
ss_embedded.extend(s11_embedded)

In [11]:
# this DB already has the coordinates in relative coord,
# so bbox_rel is set to the existing bbox value on every box
for entry in ss_embedded:
    annotations = entry['annotations']
    if 'bbox' not in annotations:
        continue
    for box in annotations['bbox']:
        box['bbox_rel'] = box['bbox']

In [19]:
# how many images are there in each season?
im_per_season = defaultdict(int)

for e in ss_embedded:
    season = e['id'].split('/')[0]
    im_per_season[season] += 1

In [20]:
im_per_season

defaultdict(int,
            {'S10': 685481,
             'S2': 573200,
             'S5': 827224,
             'S6': 462846,
             'S9': 982404,
             'S8': 980256,
             'S3': 392507,
             'S7': 832153,
             'S4': 531554,
             'S1': 411414,
             'SER_S11': 499401})

In [27]:
dataset_name = 'snapshotserengeti'
annotation_path = anno_batch_10

In [23]:
# Group the embedded image entries into sequence items.
# deepcopy_docs=False: presumably skips copying the multi-GB input list to save memory — TODO confirm against process_sequences
ss_sequences = process_sequences(ss_embedded, dataset_name, deepcopy_docs=False)

  0%|          | 21701/7178440 [00:00<01:05, 108481.70it/s]

The dataset_name is set to snapshotserengeti. Please make sure this is correct!
Putting 7178440 images into sequences...


100%|██████████| 7178440/7178440 [01:38<00:00, 73077.69it/s] 


Number of sequences: 2659222
Checking the location field...

all_img_properties
{'standing', 'bbox', 'class', 'subject_id', 'count', 'location', 'frame_num', 'moving', 'corrupt', 'id', 'young_present', 'datetime', 'file', 'resting', 'season', 'interacting'}

img_level_properties
{'file', 'id', 'frame_num', 'bbox'}

image-level properties that really should be sequence-level
{'standing', 'class', 'subject_id', 'count', 'location', 'moving', 'corrupt', 'young_present', 'datetime', 'resting', 'season', 'interacting'}

! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.
Finished processing sequences.
Example sequence items:

{'seq_id': 'SER_S10#C05#2#519', 'dataset': 'snapshotserengeti', 'images': [{'id': 'S10/C05/C05_R2/S10_C05_R2_IMAG1333', 'frame_num': 2, 'file': 'S10/C05/C05_R2/S10_C05_R2_IMAG1333.JPG'}, {'id': 'S10/C05/C05_R2/S10_C05_R2_IMAG1332', 'frame_num': 1, 'file': 'S10/C05/C05_R2/S10_C05_R2_IMAG1332.JPG'}, {'id': 'S10/C

In [24]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/ss_sequences.json', 'w') as f:
    json.dump(ss_sequences, f)  # this is 1.6GB

In [25]:
sequences_schema_check.sequences_schema_check(ss_sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [29]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  2%|▏         | 3191/177122 [00:00<00:05, 31903.64it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:07<00:00, 22770.98it/s]

4868 boxes on 3680 images were in the annotation file(s). 210901 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
animal: 840
group: 10
person: 2412
vehicle: 1606





In [30]:
ss_sequences, images_updated = add_annotations_to_sequences(ss_sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: snapshotserengeti. The bboxes should also be for this set.
3680 images updated; 533 images had their bbox overwritten; 7174760 images not updated


Now `ss_sequences` contains the bboxes from all past batches as well as batch 10. The sequences with humans need to be separated out: if any image in a sequence belongs to the private set, the whole sequence is moved to the private set.

In [33]:
# count the images, across all sequences, that have a 'bbox' field
num_im_w_box = sum(
    1
    for seq in ss_sequences
    for im in seq['images']
    if 'bbox' in im
)
num_im_w_box

81176

In [34]:
dataset_name_pri = 'snapshotserengeti_private'

In [36]:
# read the list of human images, one entry per line, with surrounding whitespace removed
with open('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/list_humans.txt') as f:
    list_human_ims = [line.strip() for line in f]

In [39]:
# keep only the entries ending in '.JPG'
list_human_ims = [path for path in list_human_ims if path.endswith('.JPG')]
len(list_human_ims)

47015

In [43]:
# drop everything up to and including the first '/' of each entry;
# the result is later compared against the 'file' field of the sequence images
# NOTE: not idempotent - running this cell again strips another path component
list_human_ims = [path.partition('/')[2] for path in list_human_ims]

In [48]:
human_ims = set(list_human_ims)
len(human_ims)

47015

In [51]:
# Split the sequences into a private and a public list: a sequence is private
# as soon as any one of its images is in the set of human images.
ss_pri_seqs = []
ss_pub_seqs = []

for seq in tqdm(ss_sequences):
    if any(im['file'] in human_ims for im in seq['images']):
        # private sequences are relabelled in place with the private dataset name
        seq['dataset'] = dataset_name_pri
        ss_pri_seqs.append(seq)
    else:
        ss_pub_seqs.append(seq)

100%|██████████| 2659222/2659222 [01:34<00:00, 28131.78it/s]


In [52]:
len(ss_pri_seqs)
len(ss_pub_seqs)

17106

2642116

In [54]:
write_json('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/snapshotserengeti_private_w_batch_10_boxes.json', ss_pri_seqs)

In [55]:
write_json('/Users/siyuyang/Source/temp_data/CameraTrap/batch_9_10_11/SS_1_11/snapshotserengeti_w_batch_10_boxes.json', ss_pub_seqs)

# Make into MegaDB format

In [25]:
docs = idfg_embedded

In [26]:
# Settings for converting the idfg dataset and adding its batch 10 bounding boxes.
dataset_name = 'idfg'
# left empty: its only use in this section is the commented-out write_json(partial_mega_db_path, sequences) call
partial_mega_db_path = ''
# directory containing the annotation files read by extract_annotations
annotation_path = anno_batch_10
# output path for the sequences after the bounding boxes have been added
mega_db_with_bbox = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/idfg_w_batch_10_boxes_fixed.json'

In [27]:
sequences = process_sequences(docs, dataset_name)

The dataset_name is set to idfg. Please make sure this is correct!
Making a deep copy of docs...


  2%|▏         | 15408/693886 [00:00<00:08, 77030.06it/s]

Putting 693886 images into sequences...


100%|██████████| 693886/693886 [00:08<00:00, 80554.75it/s]


Number of sequences: 693886
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'bbox', 'class', 'file', 'dataset', 'datetime', 'location', 'image_id'}

img_level_properties
{'bbox', 'file', 'image_id', 'datetime'}

image-level properties that really should be sequence-level
{'class', 'dataset', 'location'}

Finished processing sequences.
Example sequence items:

{'seq_id': 'dummy_169821ab557440368b9bc9e03ca2ff46', 'dataset': 'idfg', 'images': [{'image_id': 'ClearCreek_mustelids~Summer2015~FS-034~RCNX2002', 'datetime': '8/25/15 7:41:12', 'file': 'ClearCreek_mustelids/Summer2015/FS-034/RCNX2002.JPG'}], 'class': ['cattle'], 'location': 'ClearCreek_mustelids+FS-034'}

[{'seq_id': 'dummy_4bf420bdc7f64da0bc68502672051d98', 'dataset': 'idfg', 'images': [{'image_id': 'St.Joe_elk~AM164~Trip 1~100RECNX~RCNX5004', 'datetime': '18-Mar-2016 12:16:56', 'file': 'St.Joe_elk/AM164/Trip 1/100RECNX/RCNX5004.JPG'}], 'class': ['empty'], 'loc

In [28]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [35]:
# write_json(partial_mega_db_path, sequences)

# Add bounding boxes

In [29]:
image_filename_to_bboxes = extract_annotations(annotation_path, dataset_name)

  4%|▎         | 6361/177122 [00:00<00:05, 31793.85it/s]

19 files found in directory at annotation_path
Number of annotation entries found: 177122


100%|██████████| 177122/177122 [00:06<00:00, 29340.24it/s]

43 boxes on 29 images were in the annotation file(s). 215726 boxes skipped because they are not for the requested dataset

Category counts for the bboxes:
person: 15
vehicle: 28





In [30]:
sequences, images_updated = add_annotations_to_sequences(sequences, image_filename_to_bboxes)

Dataset to which the sequences belong to: idfg. The bboxes should also be for this set.
29 images updated; 0 images had their bbox overwritten; 693857 images not updated


In [32]:
sample(sequences, 2)

[OrderedDict([('dataset', 'idfg'),
              ('seq_id', 'dummy_0f3eef05b402408787513e1268d17d06'),
              ('location', 'St_Joe_elk+AM142'),
              ('images',
               [{'image_id': 'St.Joe_elk~AM142~Trip 1~100RECNX~RCNX0374',
                 'datetime': '15-Feb-2016 02:25:05',
                 'file': 'St.Joe_elk/AM142/Trip 1/100RECNX/RCNX0374.JPG'}]),
              ('class', ['empty'])]),
 OrderedDict([('dataset', 'idfg'),
              ('seq_id', 'dummy_00bc06d550144816a5378bc153cf5373'),
              ('location', 'ClearCreek_mustelids+FS-045'),
              ('images',
               [{'image_id': 'ClearCreek_mustelids~Winter2015-16~FS-045-P~RCNX0932',
                 'datetime': '11/26/15 16:28:56',
                 'file': 'ClearCreek_mustelids/Winter2015-16/FS-045-P/RCNX0932.JPG',
                 'bbox': [{'category': 'animal',
                   'bbox': [0.0587, 0.663, 0.197, 0.191]}]}]),
              ('class', ['deer'])])]

In [33]:
write_json(mega_db_with_bbox, sequences)