In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
from random import shuffle, sample
from collections import defaultdict

import sys
sys.path.append('../../../')  # CameraTraps repository base dir

from data_management.megadb.megadb_utils import MegadbUtils

In [3]:
splits_dir = '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_splits_table'

batch_12_dir = '/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_batch_12/after_anno'

In [104]:
megadb_utils = MegadbUtils()

In [105]:
splits_table = megadb_utils.get_splits_table()

In [106]:
len(splits_table)
sorted(splits_table.keys())

42

['alka_squirrels',
 'amapa_1819',
 'au_nt_gov_kerr',
 'auckland_doc_2019',
 'awc_190430',
 'awc_202103',
 'caltech',
 'channel_islands_tnc',
 'channel_islands_tnc_private',
 'fws_butler_redrock',
 'fws_hawaii_kauai_forest_birds_a24s',
 'fws_lehnen_mice',
 'idfg',
 'idfg_swwlf_2020',
 'islandconservation_midway_2020',
 'islandconservation_rodents_2020',
 'kays_emammal',
 'mcshea_emammal',
 'nacti',
 'peaceparks_201908_humans',
 'rspb_atkinson',
 'rspb_gola',
 'rspb_gola_2020',
 'saola',
 'saola_private',
 'snapshot_camdeboo',
 'snapshot_enonkishu',
 'snapshot_karoo',
 'snapshot_kgalagadi',
 'snapshot_kruger',
 'snapshot_mountain_zebra',
 'snapshot_safari_private',
 'snapshotserengeti',
 'snapshotserengeti_private',
 'ubc_fennell',
 'umn_gomez',
 'uw_gardner',
 'wcs',
 'wcs_private',
 'wellington_nz',
 'wpz_emammal_2018',
 'zsl_borneo']

In [107]:
datasets_table = megadb_utils.get_datasets_table()

In [108]:
len(datasets_table)

64

In [109]:
all_datasets = set(datasets_table.keys())
datasets_w_splits = set(splits_table.keys())

need_splitting = all_datasets - datasets_w_splits

In [110]:
need_splitting

{'bellevue_190602',
 'bnf_20190624and0815',
 'ena24',
 'fws_hawaii_kauai_forest_birds_training',
 'idfg_swwlf_2019',
 'islandconservation_190705',
 'islandconservation_200529',
 'islandconservation_200529_private',
 'nacti_private',
 'parkscanada_20190715',
 'parkscanada_garrow_201920_trains',
 'rspb_gola_1819',
 'sulross_2018',
 'sulross_2019_spring',
 'sulross_kitfox',
 'trailguard_night_mara_190515',
 'wii20190517',
 'wiitigers',
 'wps_190624',
 'wps_210101',
 'wpz_wolverine_labeled',
 'wpz_wolverine_unlabeled'}

## Helper functions

In [6]:
def count_boxes_at_locations(sequences):
    counter = defaultdict(int)
    for seq in sequences:
        location = seq['location'] if 'location' in seq else 'NO_LOCATION'
        num_boxes = 0
        for im in seq['images']:
            if 'bbox' in im:
                num_boxes += len(im['bbox'])
        counter[location] += num_boxes
    return counter

def get_proportions(counter, train, val, test):
    # proportion of each split in terms of number of bounding boxes
    assert sum((len(train), len(val), len(test))) == len(counter)
    
    total = sum(counter.values())
    train_percent = 100 * sum([counter[location] for location in train]) / total
    val_percent = 100 * sum([counter[location] for location in val]) / total
    test_percent = 100 * sum([counter[location] for location in test]) / total
    
    print(f'Total {total} boxes: {train_percent:.2f}% train, {val_percent:.2f}% val, {test_percent:.2f}% test')

def write_splits(dataset_name, train, val, test):
    splits_obj = {
        'dataset': dataset_name,
        'train': sorted(train),
        'val': sorted(val),
        'test': sorted(test)
    }

    with open(os.path.join(splits_dir, f'splits_{dataset_name}.json'), 'w', encoding='utf8') as f:
        json.dump(splits_obj, f, indent=1, ensure_ascii=False)

## alka_squirrels

In [30]:
with open(os.path.join(batch_12_dir, 'alka_squirrels.json')) as f:
    sequences = json.load(f)

In [None]:
location_boxes = count_boxes_at_locations(sequences)
location_boxes
len(location_boxes)

In [15]:
val = ['tagged_0', '2020_05_08_42C']
test = ['2020_05_07_30C']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

In [18]:
get_proportions(location_boxes, train, val, test)

Total 6394 boxes: 89.10% train, 5.15% val, 5.76% test


In [26]:
write_splits('alka_squirrels', train, val, test)

## amapa_1819

In [32]:
dataset_name = 'amapa_1819'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [33]:
location_boxes = count_boxes_at_locations(sequences)
location_boxes
len(location_boxes)

defaultdict(int,
            {'C2012P031': 292,
             'C2012P032': 629,
             'C2012P041_A': 139,
             'C2012P041_B': 27,
             'C2012P06': 147,
             'C2012P07': 59,
             'C2012P081': 277,
             'C2012P091': 153,
             'C2012P12': 201,
             'C2012P13': 124,
             'C2012P16': 498,
             'C2012P181': 627,
             'C2012P242': 151,
             'C2012P28': 529,
             'C2017P2': 87,
             'C2017P3': 482,
             'C2017P4': 204,
             'C2017P5': 155,
             'C2017P6': 22,
             'C2017P7_escondida': 88,
             'CBF27': 321,
             '2012P03': 262,
             '2012P03_c2': 456,
             '2012P04': 64,
             '2012P04B': 99,
             '2012P06': 97,
             '2012P07': 95,
             '2012P08': 330,
             '2012P09': 84,
             '2012P12': 116,
             '2012P13': 416,
             '2012P16': 148,
             '2012P18': 119

42

In [40]:
val = ['2012P06', 'C2017P3', 'BF18']
test = ['C2017P5', '2012P12']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

In [41]:
get_proportions(location_boxes, train, val, test)

Total 9844 boxes: 90.91% train, 6.34% val, 2.75% test


In [42]:
write_splits(dataset_name, train, val, test)

## au_nt_gov_kerr
Incomplete location info - many with location 'unknown'

In [43]:
dataset_name = 'au_nt_gov_kerr'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [None]:
location_boxes = count_boxes_at_locations(sequences)
location_boxes
len(location_boxes)

In [51]:
val = ['AR', 'BN', 'WAD', 'WN']
test = ['BP', 'MAN']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

get_proportions(location_boxes, train, val, test)

Total 19758 boxes: 95.14% train, 3.39% val, 1.47% test


In [52]:
write_splits(dataset_name, train, val, test)

## auckland_doc_2019

In [53]:
dataset_name = 'auckland_doc_2019'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [56]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

337

In [57]:
locations = list(location_boxes.keys())
locations[0]
shuffle(locations)
locations[0]

'vBqd'

'vkR4'

In [59]:
val = locations[:15]
test = locations[15:30]
train = locations[30:]

get_proportions(location_boxes, train, val, test)

Total 35529 boxes: 96.14% train, 2.38% val, 1.49% test


In [60]:
write_splits(dataset_name, train, val, test)

## awc_202103

In [61]:
dataset_name = 'awc_202103'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [62]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

157

In [73]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:8]
test = locations[8:16]
train = locations[16:]

assert 'unknown' in train

get_proportions(location_boxes, train, val, test)

Total 19467 boxes: 91.41% train, 3.79% val, 4.80% test


In [74]:
write_splits(dataset_name, train, val, test)

## channel_islands_tnc and channel_islands_tnc_private

Since they come from a common set of locations, they should share one way of splitting.

In [76]:
dataset_name = 'channel_islands_tnc'

with open('/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc.json') as f:
    sequences = json.load(f)

In [77]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

73

In [78]:
dataset_name1 = 'channel_islands_tnc_private'

with open('/Users/siyuyang/Library/CloudStorage/OneDrive-Microsoft/Projects/CameraTrap/Databases/megadb_mdv5/channel_islands_tnc_private.json') as f:
    sequences1 = json.load(f)

In [79]:
location_boxes1 = count_boxes_at_locations(sequences1)
len(location_boxes1)

59

In [92]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:10] # more generous on the val/test proportion as we need more person/vehicle images in val/test overall
test = locations[10:20]
train = locations[20:]

get_proportions(location_boxes, train, val, test)

Total 143346 boxes: 70.45% train, 13.30% val, 16.25% test


In [None]:
[s for s in test if s in location_boxes1]

In [97]:
write_splits(dataset_name, train, val, test)

In [98]:
train = [s for s in train if s in location_boxes1]
val = [s for s in val if s in location_boxes1]
test = [s for s in test if s in location_boxes1]

len(train) + len(val) + len(test)
write_splits(dataset_name1, train, val, test)

59

## ena24

No location info. 

## fws_butler_redrock

In [105]:
dataset_name = 'fws_butler_redrock'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [106]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

24

In [107]:
location_boxes

defaultdict(int,
            {'Ash1': 382,
             'Ash2': 27,
             'Ash3': 14,
             'Ashwind': 244,
             'G0': 51,
             'G10': 18,
             'G11': 8,
             'G11c': 502,
             'G2': 30,
             'G3': 65,
             'G4': 5,
             'G5': 64,
             'G6': 17,
             'G7box': 487,
             'G7c': 191,
             'G8': 279,
             'G9': 560,
             'Jacks': 450,
             'P1_2_3': 1048,
             'P1_4d': 1612,
             'P1_4f': 4240,
             'P1_5': 2266,
             'P3_4': 1469,
             'Solar': 29863})

In [110]:
val = ['Ash3', 'G3', 'P1_5']
test = ['P3_4']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

get_proportions(location_boxes, train, val, test)

Total 43892 boxes: 91.31% train, 5.34% val, 3.35% test


In [111]:
write_splits(dataset_name, train, val, test)

## fws_hawaii_kauai_forest_birds_a24s

In [112]:
dataset_name = 'fws_hawaii_kauai_forest_birds_a24s'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [113]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

102

In [115]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:15]
test = locations[15:30]
train = locations[30:]

get_proportions(location_boxes, train, val, test)

Total 5738 boxes: 75.34% train, 14.69% val, 9.97% test


In [117]:
write_splits(dataset_name, train, val, test)

## fws_lehnen_mice

In [118]:
dataset_name = 'fws_lehnen_mice'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [119]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

80

In [125]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:6]
test = locations[6:10]
train = locations[10:]

get_proportions(location_boxes, train, val, test)

Total 7931 boxes: 89.17% train, 6.14% val, 4.69% test


In [126]:
write_splits(dataset_name, train, val, test)

## idfg_swwlf_2020

In [128]:
dataset_name = 'idfg_swwlf_2020'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [129]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes) # few boxes/non-empty images at each location

517

In [137]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:25]
test = locations[25:50]
train = locations[50:]

get_proportions(location_boxes, train, val, test)

Total 18458 boxes: 87.88% train, 5.83% val, 6.29% test


In [138]:
write_splits(dataset_name, train, val, test)

## islandconservation_190705, islandconservation_200529 and islandconservation_200529_private
Do not have location info

## islandconservation_midway_2020

In [142]:
dataset_name = 'islandconservation_midway_2020'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [144]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

44

In [145]:
location_boxes

defaultdict(int,
            {'01W 8-5-17': 31,
             '02W 8-5-17': 60,
             '04W 8-5-17': 364,
             '05W 8-6-17': 410,
             '10E 8-7-17': 13,
             '5W Aug 2017': 314,
             'Bird 8-6-17': 33,
             'Lehua bait station 5 8-23 to 11-15 2017': 1819,
             'Lehua rat eats baby bird': 60,
             'Lehua2016': 10816,
             'Floreana_CAM1': 145,
             'Floreana_CAM10': 83,
             'Floreana_CAM11': 44,
             'Floreana_CAM12': 408,
             'Floreana_CAM13': 947,
             'Floreana_CAM14': 5756,
             'Floreana_CAM15': 1456,
             'Floreana_CAM16': 165,
             'Floreana_CAM2': 19739,
             'Floreana_CAM3': 573,
             'Floreana_CAM4': 93,
             'Floreana_CAM5': 306,
             'Floreana_CAM6': 51,
             'Floreana_CAM7': 191,
             'Floreana_CAM8': 91,
             'Floreana_CAM9': 3,
             'SantaCruz_CAM17': 361,
             'SantaC

In [149]:
val = ['04W 8-5-17', 'MidwayBoninNight_Sec13', 'SantaCruz_CAM31']
test = ['Floreana_CAM15', 'MidwayBoninNight_Sec37']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

get_proportions(location_boxes, train, val, test)

Total 81682 boxes: 95.11% train, 1.81% val, 3.08% test


In [150]:
write_splits(dataset_name, train, val, test)

## islandconservation_rodents_2020

In [153]:
dataset_name = 'islandconservation_rodents_2020'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [154]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

36

In [155]:
location_boxes

defaultdict(int,
            {'Aug2020/CAM003': 3,
             'Aug2020/CAM007': 1,
             'Aug2020/CAM010': 32,
             'Aug2020/CAM012 BF': 6,
             'Aug2020/CAM056': 39,
             'Aug2020/CAM057': 13,
             'Aug2020/CAM110': 41,
             'Aug2020/cam052': 5,
             'Aug2020/cam055': 3,
             'July2020/CAM054': 3,
             'July2020/CAM055': 3,
             'July2020/CAM056': 116,
             'July2020/CAM057': 6,
             'July2020/CAM058': 19,
             'Nov2020/TimelapseExport/CAM056': 12,
             'Nov2020/TimelapseExport/CAM059': 3,
             'Oct2020/cam001': 6,
             'Oct2020/cam004': 9,
             'Oct2020/cam012': 8,
             'Oct2020/cam051': 3,
             'Oct2020/cam052': 6,
             'Oct2020/cam056': 18,
             'Oct2020/cam057': 54,
             'Oct2020/cam060': 289,
             'Oct2020/cam101': 6,
             'Oct2020/cam106': 0,
             'Oct2020/cam108': 4,
             

In [157]:
val = ['Aug2020/CAM056', 'Oct2020/cam057', 'Sep2020/cam106']
test = ['Oct2020/cam012', 'Nov2020/TimelapseExport/CAM059', 'July2020/CAM058']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

get_proportions(location_boxes, train, val, test)

Total 820 boxes: 83.90% train, 12.44% val, 3.66% test


In [158]:
write_splits(dataset_name, train, val, test)

## rspb_atkinson
Only 9 locations

In [159]:
dataset_name = 'rspb_atkinson'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [160]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

9

In [161]:
location_boxes

defaultdict(int,
            {'Test': 3,
             'OP211/ONWP335': 14,
             'OP212/ONWP271': 52,
             'OP206/ONWP362': 51,
             'OP209/ONWP363': 186,
             'OP210/ONWP272': 9,
             'OP208/ONWP365': 3,
             'OP207/ONWP334': 58,
             'OP205/ONWP364': 83})

In [162]:
val = ['OP212/ONWP271']
test = ['OP207/ONWP334']
train = [loc for loc in location_boxes.keys() if loc not in val and loc not in test and loc != 'NO_LOCATION']

get_proportions(location_boxes, train, val, test)

Total 459 boxes: 76.03% train, 11.33% val, 12.64% test


In [163]:
write_splits(dataset_name, train, val, test)

## rspb_gola_2020

In [7]:
dataset_name = 'rspb_gola_2020'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [8]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

75

In [12]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:4]
test = locations[4:8]
train = locations[8:]

get_proportions(location_boxes, train, val, test)

Total 15129 boxes: 88.18% train, 9.65% val, 2.17% test


In [13]:
write_splits(dataset_name, train, val, test)

## saola and saola_private

In [31]:
dataset_name = 'saola'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [32]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

935

In [55]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:100]
test = locations[100:150]
train = locations[150:]

get_proportions(location_boxes, train, val, test)

Total 100647 boxes: 85.93% train, 9.72% val, 4.35% test


In [56]:
write_splits(dataset_name, train, val, test)

In [57]:
dataset_name1 = 'saola_private'

with open(os.path.join(batch_12_dir, f'{dataset_name1}.json')) as f:
    sequences1 = json.load(f)

location_boxes1 = count_boxes_at_locations(sequences1)
len(location_boxes1)

974

In [58]:
sum(location_boxes1.values()) # we only bbox labeled a subset

1021

In [59]:
train = [loc for loc in location_boxes1 if loc not in val and loc not in test]
len(train)

825

In [60]:
write_splits(dataset_name1, train, val, test)

In [61]:
boxes_val = 0
for loc in val:
    boxes_val += location_boxes1[loc]
boxes_val

128

## umn_gomez

In [62]:
dataset_name = 'umn_gomez'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [63]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

44

In [66]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:4]
test = locations[4:6]
train = locations[6:]

get_proportions(location_boxes, train, val, test)

Total 76679 boxes: 82.86% train, 12.11% val, 5.03% test


In [67]:
write_splits(dataset_name, train, val, test)

## uw_gardner

Mostly animals

In [68]:
dataset_name = 'uw_gardner'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [69]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

28

In [85]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:3]
test = locations[3:6]
train = locations[6:]

get_proportions(location_boxes, train, val, test)

Total 9176 boxes: 93.79% train, 3.98% val, 2.23% test


In [86]:
write_splits(dataset_name, train, val, test)

## wellington_nz
LILA dataset

In [87]:
dataset_name = 'wellington_nz'

with open(os.path.join(batch_12_dir, f'{dataset_name}.json')) as f:
    sequences = json.load(f)

In [88]:
location_boxes = count_boxes_at_locations(sequences)
len(location_boxes)

215

In [90]:
locations = list(location_boxes.keys())
shuffle(locations)

val = locations[:25]
test = locations[25:50]
train = locations[50:]

get_proportions(location_boxes, train, val, test)

Total 215101 boxes: 74.53% train, 12.41% val, 13.06% test


In [91]:
write_splits(dataset_name, train, val, test)