In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
# (the default setting is 'last_expr').
InteractiveShell.ast_node_interactivity = 'all'

# Load the autoreload extension in mode 2, so imported modules are reloaded before
# code is executed and edits to the repo's .py files do not need a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
import sys
# Put the CameraTraps repo on the module search path; presumably this is where the
# `data_management` package imported below lives — verify when running elsewhere.
# NOTE(review): absolute, machine-specific path.
sys.path.append('/home/mink/notebooks/CameraTraps')

In [64]:
import json
import math
import os
import random
from collections import Counter, defaultdict
from copy import deepcopy
from multiprocessing.pool import ThreadPool
from random import sample
from shutil import copyfile

from tqdm import tqdm
from unidecode import unidecode

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# saola



In [6]:
# Dataset identifier: passed to `process_sequences` and used in output file names.
dataset_name = 'saola'

# Root folder and sub-folder that image paths are joined onto when locating files on disk.
container_root = '/mink_disk_0/camtraps/swg-camera-traps/'
path_prefix = 'public'

# Final and temporary json files for the `sequence` objects share one folder.
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

### Step 1a - If you have metadata in COCO Camera Traps (CCT) format already...


In [9]:
# CCT inputs: each is either a path to a CCT json or an already-loaded json object.
# Set one to None when it is not available.
path_to_image_cct = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/swg_camera_traps.json'
path_to_bbox_cct = None

# At least one of the two databases has to be supplied.
assert path_to_image_cct is not None or path_to_bbox_cct is not None

In [10]:
%%time

# Load the CCT input(s) through the repo's `make_cct_embedded` helper; the result is
# handed to `process_sequences` in a later cell. No bbox database is supplied here.
embedded = make_cct_embedded(image_db=path_to_image_cct, bbox_db=path_to_bbox_cct)

Loading image DB...
Number of items from the image DB: 2039657
Number of images with more than 1 species: 0 (0.0% of image DB)
No bbox DB provided.
CPU times: user 27 s, sys: 7.01 s, total: 34 s
Wall time: 1min 15s


In the following step, properties will be moved to the highest level that is still correct, i.e. if a property at the image-level always has the same value for all images in a sequence, it will be moved to be a sequence-level property.

If a sequence-level property has the same value throughout this dataset (often 'rights holder'), it will be removed from the `sequence` objects. A message about this will be printed, and you should add that property and its (constant) value to this dataset's entry in the `datasets` table.

In [16]:
%%time

# Group the images into `sequence` objects for this dataset; the helper prints which
# image properties it treats as sequence-level and a few example items.
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to saola. Please make sure this is correct!
Making a deep copy of docs...


  2%|▏         | 46906/2039657 [00:00<00:04, 468968.76it/s]

Putting 2039657 images into sequences...


100%|██████████| 2039657/2039657 [00:05<00:00, 377104.84it/s]


Number of sequences: 436617
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'location', 'class', 'frame_num', 'id', 'file', 'datetime'}

img_level_properties
{'class', 'frame_num', 'id', 'file', 'datetime'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{"dataset": "saola", "seq_id": "0a9bb757-8c2a-11eb-9411-000d3a74c7de", "location": "loc_0000", "images": [{"id": "c7a2830c-8c29-11eb-860b-000d3a74c7de", "datetime": "2017-08-15 11:48:11+00:00", "frame_num": 0, "file": "private/lao/loc_0000/2017/08/image_00000.jpg", "class": ["ignore"]}, {"id": "c7a2830d-8c29-11eb-854d-000d3a74c7de", "datetime": "2017-08-15 11:48:19+00:00", "frame_num": 1, "file": "private/lao/loc_0000/2017/08/image_00001.jpg", "class": ["ignore"]}]}

{"dataset": "saola", "seq_id": "06f05625-8c2a-11eb-bb0b-000d3a74c7de", "location": "loc_0198", "images": [{"id": "ce466aa

In [17]:
# sample some sequences to make sure they are what you expect
sample(sequences, 2)

[OrderedDict([('dataset', 'saola'),
              ('seq_id', '1a2ad85d-8c2a-11eb-9418-000d3a74c7de'),
              ('location', 'loc_0762'),
              ('images',
               [{'id': 'd77a05b8-8c29-11eb-be04-000d3a74c7de',
                 'datetime': '2018-05-25 13:50:42+00:00',
                 'frame_num': 0,
                 'file': 'private/lao/loc_0762/2018/05/image_00004.jpg',
                 'class': ['ignore']},
                {'id': 'd77a05b9-8c29-11eb-addf-000d3a74c7de',
                 'datetime': '2018-05-25 13:50:50+00:00',
                 'frame_num': 1,
                 'file': 'private/lao/loc_0762/2018/05/image_00005.jpg',
                 'class': ['ignore']},
                {'id': 'd77a05ba-8c29-11eb-ba32-000d3a74c7de',
                 'datetime': '2018-05-25 13:50:58+00:00',
                 'frame_num': 2,
                 'file': 'private/lao/loc_0762/2018/05/image_00006.jpg',
                 'class': ['ignore']},
                {'id': 'd77a05bb-8c

In [19]:
# Public/private was actually decided at the image level (the leading `public/` of each
# image's `file`), so keep only the public images of every sequence and drop sequences
# that have none left.
PUBLIC_PREFIX = 'public/'

public_sequences = []

for seq in sequences:
    # Build new image dicts instead of editing `sequences` in place: stripping the prefix
    # in place would make a second run of this cell find no public images at all.
    public_images = [
        # Slice off only the leading prefix; splitting on 'public/' would truncate any
        # path that happens to contain 'public/' again further down.
        {**im, 'file': im['file'][len(PUBLIC_PREFIX):]}
        for im in seq['images']
        if im['file'].startswith(PUBLIC_PREFIX)
    ]
    if public_images:
        public_seq = seq.copy()  # shallow copy: same mapping type and key order as `seq`
        public_seq['images'] = public_images
        public_sequences.append(public_seq)

In [20]:
len(public_sequences)

410464

In [21]:
# Distinct values of the `location` field across the public sequences.
locations = {seq['location'] for seq in public_sequences}
len(locations)

935

## Step 2 - Pass the schema check

In [23]:
%%time

# Run the repo's schema checker on the public sequences; it prints a confirmation for
# the schema itself and for the extra requirements the schema does not capture.
sequences_schema_check.sequences_schema_check(public_sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 2min 38s, sys: 3.59 ms, total: 2min 38s
Wall time: 2min 38s


In [27]:
# Save the public sequences to the temporary json file. `ensure_ascii=False` writes
# non-ASCII characters as-is (the file is opened as UTF-8) instead of \u escapes.
with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:
    json.dump(public_sequences, out_file, ensure_ascii=False)

### Step 2b - Sample

In [26]:
# Labels whose presence on any image disqualifies the whole sequence from the sample.
EXCLUDED_CLASSES = {'ignore', 'empty', 'problem', 'blurred'}

public_sequences_good = []
locations_good = set()

for seq in public_sequences:
    # Inspect every label of every image, not just the first label of each image, and
    # stop at the first hit. An image with an empty label list no longer raises
    # IndexError; it simply does not trigger the exclusion.
    has_excluded_label = any(
        label in EXCLUDED_CLASSES
        for im in seq['images']
        for label in im['class']
    )
    if not has_excluded_label:
        public_sequences_good.append(seq)
        locations_good.add(seq['location'])

len(public_sequences_good)
len(locations_good)

222309

926

In [42]:
NUM_SEQUENCES_TO_SAMPLE = 50000
MAX_FRAMES_PER_SEQUENCE = 3
SAMPLE_SEED = 0  # fixed so that the same sequences are drawn on every run of this cell

sequences_short = []
locations_sampled = set()

# sample 50k sequences, not covering all locations but most, and take only 3 frames
rng = random.Random(SAMPLE_SEED)
# Never ask for more items than exist; `sample` raises ValueError in that case.
num_to_sample = min(NUM_SEQUENCES_TO_SAMPLE, len(public_sequences_good))

for seq in rng.sample(public_sequences_good, num_to_sample):
    # Truncate a shallow copy: assigning to seq['images'] directly would permanently
    # shorten the same objects held by `public_sequences_good` and `public_sequences`.
    short_seq = seq.copy()
    short_seq['images'] = seq['images'][:MAX_FRAMES_PER_SEQUENCE]
    sequences_short.append(short_seq)
    locations_sampled.add(short_seq['location'])

num_images = sum(len(seq['images']) for seq in sequences_short)
num_images

len(locations_sampled)

119288

913

In [48]:
sample(sequences_short, 1)

[OrderedDict([('dataset', 'saola'),
              ('seq_id', 'dffd2979-8c29-11eb-acee-000d3a74c7de'),
              ('location', 'loc_0385'),
              ('images',
               [{'id': 'ca5aa776-8c29-11eb-9b92-000d3a74c7de',
                 'datetime': '2020-07-13 08:24:51+00:00',
                 'frame_num': 0,
                 'file': 'lao/loc_0385/2020/07/image_00393.jpg',
                 'class': ['large_antlered_muntjac']},
                {'id': 'ca5aa777-8c29-11eb-91db-000d3a74c7de',
                 'datetime': '2020-07-13 08:24:55+00:00',
                 'frame_num': 1,
                 'file': 'lao/loc_0385/2020/07/image_00394.jpg',
                 'class': ['large_antlered_muntjac']},
                {'id': 'ca5aa778-8c29-11eb-ae63-000d3a74c7de',
                 'datetime': '2020-07-13 08:24:58+00:00',
                 'frame_num': 2,
                 'file': 'lao/loc_0385/2020/07/image_00395.jpg',
                 'class': ['large_antlered_muntjac']}])])]

### Step 2c - Download the sampled images

In [56]:
# One entry per sampled image: the file path with the 'public/' prefix put back and a
# trailing newline, so the list can be written out line by line.
list_to_download = [
    'public/' + im['file'] + '\n'
    for seq in sequences_short
    for im in seq['images']
]
len(list_to_download)

119288

In [57]:
list_to_download[10000]

'public/lao/loc_0435/2020/03/image_00050.jpg\n'

In [58]:
# Write the download list to disk; every entry already carries its own trailing newline.
download_list_path = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/saola_files.txt'
with open(download_list_path, 'w') as list_file:
    list_file.writelines(list_to_download)

### Step 2d - Copy to flat folder

In [49]:
# Flat destination folder: each copy is named after its dataset, sequence id and frame number.
flat_dir = '/mink_disk_0/camtraps/imerit12g'

path_pairs = []

for seq in tqdm(sequences_short):
    for im in seq['images']:
        flat_name = f"{dataset_name}.seq{seq['seq_id']}.frame{im['frame_num']}.jpg"
        # (source path under the container root, destination path in the flat folder)
        path_pairs.append((
            os.path.join(container_root, path_prefix, im['file']),
            os.path.join(flat_dir, flat_name),
        ))

100%|██████████| 50000/50000 [00:00<00:00, 53647.62it/s]


In [60]:
len(path_pairs)
sample(path_pairs, 2)

119288

[('/mink_disk_0/camtraps/swg-camera-traps/public/lao/loc_0292/2019/03/image_00042.jpg',
  '/mink_disk_0/camtraps/imerit12g/saola.seq323bb6f5-8c2a-11eb-b0f1-000d3a74c7de.frame2.jpg'),
 ('/mink_disk_0/camtraps/swg-camera-traps/public/vietnam/loc_0954/2018/08/image_00162.jpg',
  '/mink_disk_0/camtraps/imerit12g/saola.seqf17d1519-8c29-11eb-8401-000d3a74c7de.frame2.jpg')]

In [69]:
# Source images that are missing at their expected path are copied over from the nested
# `swg-camera-traps/swg-camera-traps/public` folder.
expected_root = '/mink_disk_0/camtraps/swg-camera-traps/public'
nested_root = '/mink_disk_0/camtraps/swg-camera-traps/swg-camera-traps/public'

for src_path, dst_path in tqdm(path_pairs):
    if os.path.exists(src_path):
        continue

    old_path = src_path.replace(expected_root, nested_root)
    # Create the parent folder up front. Retrying after *any* IOError with a plain
    # makedirs would raise FileExistsError when the folder is already there and hide
    # the real failure (e.g. `old_path` itself missing).
    os.makedirs(os.path.dirname(src_path), exist_ok=True)
    copyfile(old_path, src_path)

100%|██████████| 119288/119288 [00:39<00:00, 3000.20it/s] 


In [70]:
%%time

def copy_file(src_path, dst_path):
    """Copy `src_path` to `dst_path` unless the destination already exists.

    Returns the value returned by `shutil.copyfile` when a copy is made, and
    None when the copy is skipped because `dst_path` is already present.
    """
    if os.path.exists(dst_path):
        return None
    return copyfile(src_path, dst_path)

# Copy with 12 worker threads; `starmap` unpacks each (src, dst) tuple of `path_pairs`
# into the two arguments of `copy_file`.
with ThreadPool(processes=12) as copy_pool:
    dst_paths = copy_pool.starmap(copy_file, path_pairs)

CPU times: user 3min 56s, sys: 9min 31s, total: 13min 27s
Wall time: 16min 57s
