In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically before code is executed, so that edits to
# the modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the repo importable from this notebook (presumably it provides the
# `data_management` package imported further down - confirm).
# Guarded so that re-running the cell does not add the same entry to sys.path again.
repo_root = '/home/mink/notebooks/CameraTraps'
if repo_root not in sys.path:
    sys.path.append(repo_root)  # append this repo to PYTHONPATH

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math
from copy import deepcopy
from shutil import copyfile
from multiprocessing.pool import ThreadPool

from tqdm import tqdm
from unidecode import unidecode 

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# saola_private

In [23]:
# Dataset identifier: passed to process_sequences() and used to name the output files.
dataset_name = 'saola_private'

# Source images are read from <container_root>/<path_prefix>/<image file path>.
container_root = '/mink_disk_0/camtraps/swg-camera-traps-private/'
path_prefix = 'private'

# The final and the temporary JSON outputs share one folder.
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

### Step 1a - If you have metadata in COCO Camera Traps (CCT) format already...


In [5]:
# Each entry is a path to a CCT json, or an already loaded json object.
# At least one of the two must be provided.
path_to_bbox_cct = None
path_to_image_cct = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/swg_camera_traps.json'  # set to None if not available
assert path_to_image_cct is not None or path_to_bbox_cct is not None

In [6]:
%%time

# Build the embedded CCT representation from the image DB; no bbox DB is supplied
# for this dataset (path_to_bbox_cct is None).
embedded = make_cct_embedded(image_db=path_to_image_cct, bbox_db=path_to_bbox_cct)

Loading image DB...
Number of items from the image DB: 2039657
Number of images with more than 1 species: 0 (0.0% of image DB)
No bbox DB provided.
CPU times: user 23 s, sys: 4.86 s, total: 27.9 s
Wall time: 34 s


In the following step, properties will be moved to the highest level that is still correct, i.e. if a property at the image-level always has the same value for all images in a sequence, it will be moved to be a sequence-level property.

If a sequence-level property has the same value throughout this dataset (often 'rights holder'), it will be removed from the `sequence` objects. A message about this will be printed, and you should add that property and its (constant) value to this dataset's entry in the `datasets` table.

In [7]:
%%time

# Group the embedded image entries into `sequence` objects for the dataset named
# by dataset_name (the helper prints its progress and a few example items).
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to saola_private. Please make sure this is correct!
Making a deep copy of docs...


  3%|▎         | 51510/2039657 [00:00<00:03, 514978.94it/s]

Putting 2039657 images into sequences...


100%|██████████| 2039657/2039657 [00:04<00:00, 416486.65it/s]


Number of sequences: 436617
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'file', 'location', 'class', 'frame_num', 'id', 'datetime'}

img_level_properties
{'file', 'class', 'frame_num', 'id', 'datetime'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{"dataset": "saola_private", "seq_id": "0a9bb757-8c2a-11eb-9411-000d3a74c7de", "location": "loc_0000", "images": [{"id": "c7a2830c-8c29-11eb-860b-000d3a74c7de", "datetime": "2017-08-15 11:48:11+00:00", "frame_num": 0, "file": "private/lao/loc_0000/2017/08/image_00000.jpg", "class": ["ignore"]}, {"id": "c7a2830d-8c29-11eb-854d-000d3a74c7de", "datetime": "2017-08-15 11:48:19+00:00", "frame_num": 1, "file": "private/lao/loc_0000/2017/08/image_00001.jpg", "class": ["ignore"]}]}

{"dataset": "saola_private", "seq_id": "ebd24e24-8c29-11eb-a71a-000d3a74c7de", "location": "loc_0065", "images": 

In [9]:
private_sequences = []  # actually public/private was decided at the image level

# Leading path component that marks an image as private.
private_marker = 'private/'

for seq in sequences:
    private_images = []
    for im in seq['images']:
        if not im['file'].startswith(private_marker):
            continue
        # Work on a copy so that `sequences` is left untouched and this cell gives the
        # same result when re-run. Only the leading marker is sliced off, so a path that
        # happens to contain 'private/' again further down is not truncated.
        im_private = im.copy()
        im_private['file'] = im['file'][len(private_marker):]
        private_images.append(im_private)

    # Keep only sequences that contain at least one private image.
    if private_images:
        seq_private = seq.copy()
        seq_private['images'] = private_images
        private_sequences.append(seq_private)

In [10]:
# Number of sequences that contain at least one private image.
len(private_sequences)

26842

In [11]:
# Distinct `location` values found among the private sequences.
locations = {seq['location'] for seq in private_sequences}
len(locations)

974

In [16]:
# Spot-check two randomly drawn private sequences.
sample(private_sequences, 2)

[OrderedDict([('dataset', 'saola_private'),
              ('seq_id', '23f70cae-8c2a-11eb-bc34-000d3a74c7de'),
              ('location', 'loc_0554'),
              ('images',
               [{'id': 'ccfd6070-8c29-11eb-ac3d-000d3a74c7de',
                 'datetime': '2020-08-21 10:47:18+00:00',
                 'frame_num': 0,
                 'file': 'lao/loc_0554/2020/08/image_00000.jpg',
                 'class': ['ignore']},
                {'id': 'ccfd6071-8c29-11eb-ac0d-000d3a74c7de',
                 'datetime': '2020-08-21 10:47:24+00:00',
                 'frame_num': 1,
                 'file': 'lao/loc_0554/2020/08/image_00001.jpg',
                 'class': ['ignore']},
                {'id': 'ccfd6072-8c29-11eb-a39f-000d3a74c7de',
                 'datetime': '2020-08-21 10:47:27+00:00',
                 'frame_num': 2,
                 'file': 'lao/loc_0554/2020/08/image_00002.jpg',
                 'class': ['ignore']}])]),
 OrderedDict([('dataset', 'saola_private'),
   

## Step 2 - Pass the schema check

In [12]:
%%time

# Run the megadb schema check on the private sequences before they are written out.
sequences_schema_check.sequences_schema_check(private_sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 14.4 s, sys: 14.5 ms, total: 14.4 s
Wall time: 14.4 s


In [13]:
# Save the private sequences to the temporary output file. ensure_ascii=False writes
# non-ASCII characters unescaped, which is why the file is opened as UTF-8.
with open(path_to_output_temp, mode='w', encoding='utf-8') as temp_json:
    json.dump(private_sequences, temp_json, ensure_ascii=False)

### Step 2b - Sample

In [14]:
# A sequence is kept only if none of its images carries one of these labels
# (the first entry of each image's `class` list is the one that is checked).
unwanted_labels = ['ignore', 'empty', 'problem', 'blurred']

private_sequences_good = [
    seq for seq in private_sequences
    if all(im['class'][0] not in unwanted_labels for im in seq['images'])
]
locations_good = {seq['location'] for seq in private_sequences_good}

len(private_sequences_good)
len(locations_good)

391

119

In [17]:
# Spot-check one randomly drawn sequence that passed the label filter.
sample(private_sequences_good, 1)

[OrderedDict([('dataset', 'saola_private'),
              ('seq_id', 'f1fb7173-8c29-11eb-96e8-000d3a74c7de'),
              ('location', 'loc_0410'),
              ('images',
               [{'id': 'cac5f4b6-8c29-11eb-ac59-000d3a74c7de',
                 'datetime': '2020-07-08 15:41:36+00:00',
                 'frame_num': 0,
                 'file': 'lao/loc_0410/2020/07/image_00012.jpg',
                 'class': ['human']},
                {'id': 'cac5f4b7-8c29-11eb-8a64-000d3a74c7de',
                 'datetime': '2020-07-08 15:41:36+00:00',
                 'frame_num': 1,
                 'file': 'lao/loc_0410/2020/07/image_00013.jpg',
                 'class': ['human']},
                {'id': 'cac5f4b8-8c29-11eb-aec3-000d3a74c7de',
                 'datetime': '2020-07-08 15:41:36+00:00',
                 'frame_num': 2,
                 'file': 'lao/loc_0410/2020/07/image_00014.jpg',
                 'class': ['human']},
                {'id': 'cac5f4b9-8c29-11eb-8aeb-000d3a

In [18]:
# Only the first few frames of every sequence are sampled.
max_images_per_seq = 3

sequences_short = []
locations_sampled = set()

for seq in private_sequences_good:
    # Truncate a shallow copy instead of the sequence itself: the same objects are
    # also held by private_sequences_good and private_sequences, and assigning to
    # seq['images'] directly would silently shorten them there as well.
    seq_short = seq.copy()
    seq_short['images'] = seq['images'][:max_images_per_seq]
    sequences_short.append(seq_short)
    locations_sampled.add(seq['location'])

# Total number of images left after truncation.
num_images = sum(len(seq['images']) for seq in sequences_short)
num_images

len(locations_sampled)

1039

119

### Step 2c - Download the sampled images

In [19]:
# One line per sampled image: the 'private/' prefix is put back in front of the
# file path and a newline is appended so the list can be written with writelines().
list_to_download = [
    'private/' + im['file'] + '\n'
    for seq in sequences_short
    for im in seq['images']
]
len(list_to_download)

1039

In [21]:
# Peek at one entry to confirm the 'private/' prefix and the trailing newline.
list_to_download[-10]

'private/vietnam/loc_0952/2018/08/image_00105.jpg\n'

In [22]:
# Write the download list, one path per line (every entry already ends in '\n').
# The encoding is pinned to UTF-8, as for the JSON output written earlier in this
# notebook, so the file does not depend on the platform's default encoding.
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/saola_private_files.txt', 'w', encoding='utf-8') as f:
    f.writelines(list_to_download)

### Step 2d - Copy to flat folder

In [24]:
# Flat destination folder; each image is renamed to
# <dataset_name>.seq<seq_id>.frame<frame_num>.jpg so the names stay unique there.
flat_folder = '/mink_disk_0/camtraps/imerit12g'

path_pairs = []

for seq in tqdm(sequences_short):
    seq_id = seq['seq_id']

    for im in seq['images']:
        frame = im['frame_num']
        flat_name = f'{dataset_name}.seq{seq_id}.frame{frame}.jpg'

        # (source, destination) pair for the copy step.
        path_pairs.append((
            os.path.join(container_root, path_prefix, im['file']),
            os.path.join(flat_folder, flat_name),
        ))

100%|██████████| 391/391 [00:00<00:00, 49078.94it/s]


In [25]:
# Sanity check: number of (source, destination) pairs, plus two random examples.
len(path_pairs)
sample(path_pairs, 2)

1039

[('/mink_disk_0/camtraps/swg-camera-traps-private/private/lao/loc_0453/2020/08/image_00070.jpg',
  '/mink_disk_0/camtraps/imerit12g/saola_private.seqfdb2b632-8c29-11eb-9b98-000d3a74c7de.frame0.jpg'),
 ('/mink_disk_0/camtraps/swg-camera-traps-private/private/lao/loc_0287/2019/07/image_00000.jpg',
  '/mink_disk_0/camtraps/imerit12g/saola_private.seq206607c2-8c2a-11eb-bdec-000d3a74c7de.frame0.jpg')]

In [27]:
%%time

def copy_file(src_path, dst_path):
    """Copy src_path to dst_path unless dst_path already exists.

    The data is first written to a sibling '<dst_path>.part' file and then renamed
    onto dst_path. A copy that is interrupted or fails therefore never leaves a
    truncated file at dst_path, which a later run would otherwise skip because the
    destination "already exists".

    Args:
        src_path: path of the file to copy.
        dst_path: path the copy should end up at.

    Returns:
        dst_path if the file was copied, None if dst_path already existed.

    Raises:
        OSError: if the source cannot be read or the destination cannot be written
            (e.g. FileNotFoundError when src_path does not exist).
    """
    if os.path.exists(dst_path):
        return None

    partial_path = os.fspath(dst_path) + '.part'
    try:
        copyfile(src_path, partial_path)
        # Atomic when both paths are on the same filesystem, which holds here
        # because the partial file sits next to the destination.
        os.replace(partial_path, dst_path)
    except BaseException:
        # Clean up the half-written file before propagating the error.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    return dst_path

# Copy all (source, destination) pairs with a pool of worker threads; each result is
# whatever copy_file returned for the corresponding pair.
num_copy_threads = 12

with ThreadPool(num_copy_threads) as copy_pool:
    dst_paths = copy_pool.starmap(copy_file, path_pairs)

CPU times: user 6.03 s, sys: 6.82 s, total: 12.9 s
Wall time: 5.33 s
