In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically so edits to the repo code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Root of the CameraTraps_data checkout; presumably what makes the
# `data_management` imports in the next cell resolvable.
# Set the CAMERATRAPS_DATA_REPO environment variable to point at your own
# checkout; the default is the original author's local path.
CAMERATRAPS_DATA_REPO = os.environ.get(
    'CAMERATRAPS_DATA_REPO',
    '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data',
)

# Guard against duplicate entries when this cell is re-run in the same kernel.
if CAMERATRAPS_DATA_REPO not in sys.path:
    sys.path.append(CAMERATRAPS_DATA_REPO)  # append this repo to PYTHONPATH

In [38]:
import json
import os
from collections import Counter, defaultdict
from random import sample, shuffle
import math

from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# Importing the UBC dataset (private) to MegaDB

First draft by Gramener. Siyu re-ran with verified data (~400k) on 2020 Sept 2. 

In [4]:
# Destination file for the processed sequences (passed to write_json later in this notebook).
path_to_output = '.../data/CameraTraps/MegaDB/ubc_fennell_megadb.json'

In [5]:
# Dataset identifier; passed to process_sequences and recorded in the splits table.
dataset_name = 'ubc_fennell'

In [6]:
# Each input is a path to a CCT json (or an already loaded json object);
# leave an input as None when it is not available.
path_to_bbox_cct = None
path_to_image_cct = '.../data/CameraTraps/CCT_JSONs/ubc.json'

# At least one of the two inputs has to be provided.
assert path_to_image_cct is not None or path_to_bbox_cct is not None

In [10]:
# Read the image-level CCT JSON into memory so it can be fixed up before conversion.
with open(path_to_image_cct) as cct_file:
    cct_json = json.loads(cct_file.read())

The file names in the CCT JSON contain backslashes; replace them with forward slashes.

In [14]:
# Rewrite every image's file_name in place so that it uses '/' instead of '\'.
for image in cct_json['images']:
    image['file_name'] = '/'.join(image['file_name'].split('\\'))

In [None]:
# Spot-check an arbitrary image entry (index 1000) after the file_name fix.
cct_json['images'][1000]

In [None]:
# Spot-check an arbitrary annotation entry (index 1000).
cct_json['annotations'][1000]

In [31]:
# Save a copy of the in-memory CCT JSON (file names already converted to forward slashes).
write_json('.../data/CameraTraps/CCT_JSONs/ubc_fennell_cct.json', cct_json)

In [16]:
# Pass in the updated, already loaded CCT JSON object rather than the file path.
# No bbox DB is supplied here (path_to_bbox_cct is None).
embedded = make_cct_embedded(image_db=cct_json)

Loading image DB...
Number of items from the image DB: 441483
Number of images with more than 1 species: 49178 (11.14% of image DB)
No bbox DB provided.


In [None]:
# Presumably groups the embedded entries into sequence items for this dataset;
# see data_management.megadb.converters.cct_to_megadb.process_sequences for the exact behavior.
sequences = process_sequences(embedded, dataset_name)

We have no sequence information, but it seems that the annotation was done at the sequence level. Since every sequence contains exactly one image, we move several properties (`species_count`, `datetime`, `class`) from the sequence back to the image level so that future queries make more sense.

In [22]:
# The property move in the next cell relies on every sequence holding exactly one image.
assert all(len(seq['images']) == 1 for seq in sequences)

In [24]:
# Move sequence-level properties onto the single image of each sequence.
# Both removals use dict.pop so the cell can be re-run in the same kernel:
# a plain `del seq['id']` raises KeyError on the second run.
for seq in sequences:
    # 'id' seems to be exactly the same as the file path to the only image, so drop it.
    seq.pop('id', None)

    image = seq['images'][0]
    for prop in ('species_count', 'datetime', 'class'):
        if prop in seq:
            # Transfer the value to the image and remove it from the sequence in one step.
            image[prop] = seq.pop(prop)

In [None]:
# Display three randomly sampled sequences as a JSON string for a visual check.
json.dumps(sample(sequences, 3))

In [29]:
# Validate the sequence items with the MegaDB schema checker before writing them out.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [30]:
# Write the checked sequences to path_to_output.
write_json(path_to_output, sequences)

## Train/val/test splits
Locations are split 70:15:15, as there are lots of vehicles in this dataset and we need more vehicles/humans in our val/test splits to better evaluate MegaDetector.

In [35]:
# The split is made by location, so every sequence must carry one.
assert all('location' in seq for seq in sequences)

In [36]:
# Distinct locations across all sequences; the count is shown as the cell output.
locations = {seq['location'] for seq in sequences}

len(locations)

197

In [39]:
# Assign whole locations to train/val/test in roughly 70:15:15.
#
# The split is made reproducible by
#   1. sorting the locations first, because the iteration order of a set of
#      strings can differ between kernel sessions (string hash randomization), and
#   2. shuffling with a dedicated, explicitly seeded generator.
# NOTE: any split file written earlier by the unseeded version of this cell
# will NOT be reproduced by this seed.
from random import Random

SPLIT_SEED = 0  # change only if a different split is intentionally wanted

li_locations = sorted(locations)
print(f'first loc before shuffling is {li_locations[0]}')
Random(SPLIT_SEED).shuffle(li_locations)
print(f'first loc after shuffling is {li_locations[0]}')

num_train = round(0.7 * len(li_locations))
num_val = round(0.15 * len(li_locations))

# The test split takes whatever remains after train and val.
locs_train = li_locations[:num_train]
locs_val = li_locations[num_train:num_train + num_val]
locs_test = li_locations[num_train + num_val:]

first loc before shuffling is South Chilcotins Wildlife Survey 2018_F7
first loc after shuffling is South Chilcotins Wildlife Survey 2018_L9


In [41]:
# Single-entry table: the location-level splits for this dataset.
splits_table = [
    dict(
        dataset=dataset_name,
        train=locs_train,
        val=locs_val,
        test=locs_test,
    )
]

In [43]:
# Persist the splits table as indented JSON.
splits_path = '.../CameraTrap/Databases/megadb_2020/ubc_fennell_splits.json'
with open(splits_path, 'w') as splits_file:
    json.dump(splits_table, splits_file, indent=4)