In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically before each cell runs, so edits to the
# local repos are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Extend the module search path with the local repository checkouts.
sys.path.extend([
    '/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps_data',  # append this repo to PYTHONPATH
    '/Users/siyuyang/Source/repos/GitHub_MSFT/ai4eutils',
])

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

from tqdm import tqdm
from unidecode import unidecode 

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

import sas_blob_utils

# Importing Season 1 of the Snapshot Safari projects

This notebook imports the first season of the 6 Snapshot Safari datasets into the MegaDB format. 

Each Snapshot Safari project has its own MegaDB dataset with public images, so that the species can be mapped separately in case they are named differently. Future seasons can be added to these datasets.

However, all private images are lumped into a single dataset, `snapshot_safari_private`, since they contain only a few species.

### List all images in the private container as they need to be a separate dataset

In [4]:
# SAS key to where the private images are stored; passed as `container_uri`
# to the blob listing call.
# NOTE(review): empty here — presumably blanked out so the key is not saved in
# the notebook; supply a value before running.
container_sas = ''

In [5]:
%%time

# Enumerate the blobs in the private container. The result is indexed like a
# list of blob path strings in the cells that follow.
list_private_images = sas_blob_utils.list_blobs_in_container(
    container_uri=container_sas
)

0it [00:00, ?it/s]

listing blobs...


2616it [00:02, 1073.69it/s]

Enumerated 2616 matching blobs out of 2616 total
CPU times: user 871 ms, sys: 46.3 ms, total: 917 ms
Wall time: 2.45 s





In [6]:
# Spot-check one entry to see the blob path layout.
list_private_images[1000]

'ENO_S1_private/ENO_S1/D04/D04_R1/ENO_S1_D04_R1_IMAG0030.JPG'

In [7]:
# Drop everything up to and including the first '/' of each blob path,
# i.e. the leading folder such as 'ENO_S1_private/'.
list_private_images = [blob_path.partition('/')[2] for blob_path in list_private_images]

In [8]:
# Same entry again, now with the leading folder removed.
list_private_images[1000]

'ENO_S1/D04/D04_R1/ENO_S1_D04_R1_IMAG0030.JPG'

In [9]:
# Convert to a set for the `in` membership checks performed per image entry
# during conversion.
list_private_images = set(list_private_images)

### Converting to MegaDB format

In [10]:
# Maps each MegaDB dataset name to the file name of its Season 1 CCT JSON;
# the file name is joined with `cct_dir` when the database is loaded.
name_to_cct = {
    'snapshot_karoo': 'SnapshotKaroo_S1_v1.0.json',
    'snapshot_enonkishu': 'SnapshotEnonkishu_S1_v1.0.json',
    'snapshot_camdeboo': 'SnapshotCamdeboo_S1_v1.0.json',
    'snapshot_mountain_zebra': 'SnapshotMountainZebra_S1_v1.0.json',
    'snapshot_kruger': 'SnapshotKruger_S1_v1.0.json',
    # NOTE(review): 'Kgalagai' is spelled differently from the dataset name
    # 'kgalagadi' — confirm it matches the actual file name on disk.
    'snapshot_kgalagadi': 'SnapshotKgalagai_S1_v1.0.json'
}

# Dataset name under which the private images of all projects are grouped.
private_set_name = 'snapshot_safari_private'

# NOTE(review): both paths appear abbreviated with a leading '...'; replace
# them with real locations before running.
cct_dir = '.../data/CameraTraps/CCT_JSONs'  # folder holding the CCT JSON files
output_dir = '.../AI4Earth/CameraTrap/Databases/megadb_2020/to_ingest'  # where the MegaDB JSONs are written

In [None]:
# dataset name -> sequences built from that dataset's public images
name_to_sequences = {}
# image entries from every dataset whose file is in the private container
private_embedded = []

for dataset_name, cct_fn in name_to_cct.items():
    embedded_all = make_cct_embedded(image_db=os.path.join(cct_dir, cct_fn))

    # Route each entry to the shared private list or to this dataset's public
    # list, in a single pass so the original ordering is kept.
    public_entries = []
    for entry in embedded_all:
        destination = private_embedded if entry['file_name'] in list_private_images else public_entries
        destination.append(entry)

    name_to_sequences[dataset_name] = process_sequences(public_entries, dataset_name)
    print('===========================================')

Loading image DB...
Number of items from the image DB: 38293
Number of images with more than 1 species: 27 (0.07% of image DB)
No bbox DB provided.
The dataset_name is set to snapshot_karoo. Please make sure this is correct!
Making a deep copy of docs...


100%|██████████| 38074/38074 [00:00<00:00, 281090.91it/s]

Putting 38074 images into sequences...
Number of sequences: 14806
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...






all_img_properties
{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

img_level_properties
{'file', 'frame_num', 'id'}

image-level properties that really should be sequence-level
{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.
Finished processing sequences.
Example sequence items:

OrderedDict([('dataset', 'snapshot_karoo'), ('seq_id', 'KAR_S1#A01#1#1'), ('location', 'A01'), ('images', [{'id': 'KAR_S1/A01/A01_R1/KAR_S1_A01_R1_IMAG00008', 'frame_num': 1, 'file': 'KAR_S1/A01/A01_R1/KAR_S1_A01_R1_IMAG00008.JPG'}]), ('class', ['empty']), ('datetime', '2017-10-04 01:01:44'), ('resting', None), ('standing', None), ('moving', None), ('young_present', None), ('count', No

100%|██████████| 28544/28544 [00:00<00:00, 289525.37it/s]

Putting 28544 images into sequences...
Number of sequences: 12969
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...






all_img_properties
{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

img_level_properties
{'file', 'frame_num', 'id'}

image-level properties that really should be sequence-level
{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.
Finished processing sequences.
Example sequence items:

OrderedDict([('dataset', 'snapshot_enonkishu'), ('seq_id', 'ENO_S1#B02#1#7'), ('location', 'B02'), ('images', [{'id': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0017', 'frame_num': 1, 'file': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0017.JPG'}, {'id': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0018', 'frame_num': 2, 'file': 'ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0018.JPG'}, {'id': 'ENO_S1/B02/B02_R1/ENO

100%|██████████| 30227/30227 [00:00<00:00, 280110.79it/s]

Putting 30227 images into sequences...
Number of sequences: 12024
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...






all_img_properties
{'resting', 'corrupt', 'location', 'id', 'standing', 'file', 'moving', 'frame_num', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

img_level_properties
{'file', 'frame_num', 'id'}

image-level properties that really should be sequence-level
{'resting', 'corrupt', 'standing', 'location', 'moving', 'young_present', 'count', 'datetime', 'season', 'subject_id', 'interacting', 'class'}

! Sequence-level property corrupt with value False should be a dataset-level property. Removed from sequences.
! Sequence-level property datetime with value  should be a dataset-level property. Removed from sequences.
Finished processing sequences.
Example sequence items:

OrderedDict([('dataset', 'snapshot_camdeboo'), ('seq_id', 'CDB_S1#A05#1#3'), ('location', 'A05'), ('images', [{'id': 'CDB_S1/A05/A05_R1/CDB_S1_A05_R1_IMAG0007', 'frame_num': 1, 'file': 'CDB_S1/A05/A05_R1/CDB_S1_A05_R1_IMAG0007.JPG'}]), ('class', ['empty']), ('resting', None), ('st

 76%|███████▋  | 55799/73034 [00:00<00:00, 278957.69it/s]

Putting 73034 images into sequences...


100%|██████████| 73034/73034 [00:00<00:00, 271561.50it/s]


Number of sequences: 71178
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...


In [None]:
# Number of image entries that were routed to the private set.
len(private_embedded)

In [None]:
# Run the MegaDB sequences schema check on every dataset collected so far.
for name, seqs in name_to_sequences.items():
    print(f'Dataset {name}')
    sequences_schema_check.sequences_schema_check(seqs)

In [None]:
# Private set: group the private image entries collected from all projects
# into sequences under the single private dataset name, then run the same
# schema check as for the public datasets.

private_sequences = process_sequences(private_embedded, private_set_name)
sequences_schema_check.sequences_schema_check(private_sequences)

In [None]:
# Register the private set next to the public datasets so it is written out
# by the same export loop.
name_to_sequences[private_set_name] = private_sequences

In [None]:
# Number of datasets to export, including the private set.
len(name_to_sequences)

In [None]:
# Write each dataset's sequences to '<dataset_name>_megadb.json' in output_dir.
for dataset_name in name_to_sequences:
    megadb_json_path = os.path.join(output_dir, f'{dataset_name}_megadb.json')
    with open(megadb_json_path, 'w') as out_file:
        json.dump(name_to_sequences[dataset_name], out_file, indent=1)

### Forgot to prepend the prefix to each dataset in the private dataset

so they can share a common prefix and be lumped into one dataset.

Also prepend the dataset indicator to the location field so the location values are unique in the `snapshot_safari_private` dataset.

In [27]:
# Load the private-set sequences from the file with the 'OLD' name prefix
# (presumably the earlier export, renamed — confirm).
# NOTE(review): hard-coded absolute path to one user's machine; other paths in
# this notebook are built from `output_dir`.
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/to_ingest/OLDsnapshot_safari_private_megadb.json') as f:
    private_sequences = json.load(f)

In [28]:
# Number of sequences loaded for the private set.
len(private_sequences)

1257

In [30]:
# Collection identifiers (the part of seq_id before the first '#', e.g. 'KAR_S1')
# encountered in the private set.
collection_names = set()

for seq in private_sequences:
    collection_name = seq['seq_id'].split('#')[0]
    collection_names.add(collection_name)

    # Prefix the location with the collection name so location values stay
    # unique once all collections share one dataset.
    # The sequences are modified in place, so the startswith guards keep this
    # cell safe to re-run: without them a second run would prepend the
    # prefixes again.
    location_prefix = f'{collection_name}_'
    if not str(seq['location']).startswith(location_prefix):
        seq['location'] = f'{location_prefix}{seq["location"]}'

    # Prefix each file path with '<collection>_private/', the top-level folder
    # layout seen in the private container's blob listing.
    folder_prefix = f'{collection_name}_private/'
    for im in seq['images']:
        if not im['file'].startswith(folder_prefix):
            im['file'] = f'{folder_prefix}{im["file"]}'

In [31]:
# Show which collections are present in the private set.
collection_names

{'CDB_S1', 'ENO_S1', 'KAR_S1', 'KGA_S1', 'KRU_S1', 'MTZ_S1'}

In [32]:
# Write the updated private sequences out under the final file name.
# NOTE(review): the path appears abbreviated with a leading '...'; replace it
# with a real location before running.
with open('.../megadb_2020/to_ingest/snapshot_safari_private_megadb.json', 'w') as f:
    json.dump(private_sequences, f, indent=1)

**Note:** Prepending the collection name together with its season number (e.g. "CDB_S1") to the location value is probably not the best idea, because locations are the same across seasons. We should update these.