In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell rather than only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH
sys.path.append('/home/mink/lib/ai4eutils')

In [3]:
import json
import os
import random
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm


import path_utils, sas_blob_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import process_sequences

# Adding images from existing megadb entries to annotation batch 12

In [11]:
def copy_file(src_path, dst_path):
    """Copy one file, creating the destination folder first if it is missing.

    Thin wrapper around `shutil.copyfile` with a two-argument signature so it
    can be handed to `ThreadPool.starmap` together with (source, destination)
    pairs.

    Args:
        src_path: path of the existing file to copy.
        dst_path: full path, including the file name, of the copy to create.

    Returns:
        The destination path, exactly as returned by `shutil.copyfile`.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:
        # copyfile() does not create parent folders; without this a fresh
        # output folder makes the very first copy fail.
        os.makedirs(dst_dir, exist_ok=True)
    return copyfile(src_path, dst_path)

## WCS Camera Traps

Some entries are real sequences; others are dummy sequences containing only one image. In batches 9-11, we already sampled sequences from every location that has animal images.

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/wcs_w_batch_10_boxes.json') as f:
    wcs_db = json.load(f)

In [6]:
len(wcs_db)

407999

In [None]:
sample(wcs_db, 2)

In [25]:
# Partition the WCS sequences by whether any of their images already has a 'bbox'
# entry, and count the images that fall on each side of the split.
seq_w_box, seq_wo_box = [], []
num_im_w_box = num_im_wo_box = 0

for seq in wcs_db:
    n_images = len(seq['images'])
    if any('bbox' in im for im in seq['images']):
        seq_w_box.append(seq)
        num_im_w_box += n_images
    else:
        seq_wo_box.append(seq)
        num_im_wo_box += n_images

In [26]:
len(seq_w_box)
num_im_w_box

len(seq_wo_box)
num_im_wo_box

15897

61738

392102

1145752

In [27]:
# Average number of images per sequence among the sequences without any box.
# Computed from the live variables so the value cannot go stale on a re-run.
num_im_wo_box / len(seq_wo_box)

2.9220763984881484

In [18]:
# Distinct camera locations across every WCS sequence.
all_locations = {seq['location'] for seq in wcs_db}
len(all_locations)

3779

In [19]:
# Distinct locations that already have at least one sequence with a boxed image.
locations_w_box = {seq['location'] for seq in seq_w_box}
len(locations_w_box)

3370

In [20]:
# Location of every boxed sequence (with repeats), then a per-location tally of them.
li_locations_w_box = [boxed_seq['location'] for boxed_seq in seq_w_box]
counter_locations_w_box = Counter()
counter_locations_w_box.update(li_locations_w_box)

In [29]:
# Among the sequences without boxes, keep those where at least one image
# is not labelled 'empty'.
seq_wo_box_non_empty = [
    seq for seq in seq_wo_box
    if any('empty' not in im['class'] for im in seq['images'])
]

# Total image count over the kept sequences (empty frames inside them included).
num_im_seq_wo_box_non_empty = sum(len(seq['images']) for seq in seq_wo_box_non_empty)

In [30]:
len(seq_wo_box_non_empty)
num_im_seq_wo_box_non_empty

245069

639136

In [31]:
# Ratio of available images to 250000 -- presumably the image budget for this
# batch (TODO confirm). Uses the live count instead of a pasted literal.
num_im_seq_wo_box_non_empty / 250000

2.556544

In [32]:
# Rough number of sequences to sample: the candidate pool divided by the
# (rounded) ratio from the previous cell. Uses the live count instead of a
# pasted literal.
len(seq_wo_box_non_empty) / 2.56

95730.078125

In [51]:
# Sampling parameters for the WCS portion of this batch.
NUM_SEQS_TO_SAMPLE = 137000  # sequences drawn from the non-empty pool without boxes
MAX_IMAGES_PER_SEQ = 7       # only the first frames of each sampled sequence are sent
SAMPLE_SEED = 12             # fixed seed so a re-run selects the same sequences

# A dedicated generator makes the draw reproducible without touching the
# global RNG state. (Outputs recorded from earlier, unseeded runs will differ.)
sampled = random.Random(SAMPLE_SEED).sample(seq_wo_box_non_empty, NUM_SEQS_TO_SAMPLE)

im_to_send = []

for seq in sampled:
    for im in seq['images'][:MAX_IMAGES_PER_SEQ]:
        # Frames labelled 'empty' are skipped even inside a non-empty sequence.
        if 'empty' in im['class']:
            continue

        # Annotate a copy so the entries of wcs_db are not modified in place;
        # this keeps the cell idempotent when it is re-run.
        im_record = dict(im, seq_id=seq['seq_id'])
        # Images without a frame number get frame 1 -- presumably these are the
        # single-image dummy sequences; verify if real sequences can lack it.
        im_record.setdefault('frame_num', 1)
        im_to_send.append(im_record)

len(im_to_send)

249091

In [52]:
sample(im_to_send, 3)

[{'datetime': '2010-01-28 17:03:37.000',
  'frame_num': 1,
  'match_level': 1,
  'image_id': '3b9ddad6-92d5-11e9-ae26-000d3a74c7de',
  'corrupt': False,
  'wcs_id': 'ken-015-d0051-i022207',
  'file': 'animals/0319/0153.jpg',
  'sex': 'unknown',
  'class': ['papio anubis'],
  'count': 4,
  'age': 'unknown',
  'seq_id': 'ken-015-d0051-17'},
 {'datetime': '2009-03-29 11:57:00.000',
  'match_level': 1,
  'image_id': 'a6110c6c-92d5-11e9-952c-000d3a74c7de',
  'corrupt': False,
  'wcs_id': 'mdg-002-d0038-i002323',
  'file': 'animals/0607/1244.jpg',
  'sex': 'unknown',
  'class': ['bos taurus'],
  'count': 1,
  'age': 'unknown',
  'seq_id': 'dummy_7db87a086e7544a9bf22c8fa9d9ba8b5',
  'frame_num': 1},
 {'datetime': '2013-05-24 07:49:45.000',
  'match_level': 2,
  'image_id': '89047789-92d5-11e9-9b52-000d3a74c7de',
  'corrupt': False,
  'wcs_id': 'gtm-003-d0015-i007368',
  'file': 'animals/0529/1151.jpg',
  'sex': 'unknown',
  'class': ['motorcycle'],
  'count': 1,
  'age': 'unknown',
  'seq_id'

In [56]:
# One newline-terminated relative file path per image selected for annotation.
wcs_list_to_download = [f"{im['file']}\n" for im in im_to_send]

with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/wcs_files.txt', 'w') as f:
    f.write(''.join(wcs_list_to_download))

In [58]:
%%time

dataset_name = 'wcs'
container_root = '/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped'

# (source, destination) pairs; the destination file name encodes the dataset,
# sequence ID and frame number of each image.
path_pairs = []
for im in tqdm(im_to_send):
    dst_name = f"{dataset_name}.seq{im['seq_id']}.frame{im['frame_num']}.jpg"
    path_pairs.append((
        os.path.join(container_root, im['file']),
        os.path.join('/mink_disk_0/camtraps/imerit12e', dst_name),
    ))

100%|██████████| 249091/249091 [00:01<00:00, 171603.51it/s]

CPU times: user 1.35 s, sys: 103 ms, total: 1.45 s
Wall time: 1.46 s





In [65]:
sample(path_pairs, 5)

[('/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped/animals/0546/1370.jpg',
  '/mink_disk_0/camtraps/imerit12e/wcs.seqdummy_7f6d7a9da5a34628a7a3157d2ea0e4f1.frame1.jpg'),
 ('/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped/animals/0117/1547.jpg',
  '/mink_disk_0/camtraps/imerit12e/wcs.seqbol-016-d0062-27.frame6.jpg'),
 ('/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped/animals/0568/1379.jpg',
  '/mink_disk_0/camtraps/imerit12e/wcs.seqdummy_15f43c4c29dc45f89d15499f00bf86d9.frame1.jpg'),
 ('/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped/animals/0143/1848.jpg',
  '/mink_disk_0/camtraps/imerit12e/wcs.seqbol-015-d0047-56.frame2.jpg'),
 ('/mink_disk_0/camtraps/wcs-unzipped/wcs-unzipped/animals/0549/1822.jpg',
  '/mink_disk_0/camtraps/imerit12e/wcs.seqdummy_d7f23faa29a14eedb744b94b87a04ad3.frame1.jpg')]

In [66]:
%%time

with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 1min 49s, sys: 5min 22s, total: 7min 11s
Wall time: 17min 47s


In [67]:
len(dst_paths)

249091

## Snapshot Karoo

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_karoo_megadb.json') as f:
    karoo_db = json.load(f)

In [21]:
len(karoo_db)
sample(karoo_db, 2)

14806

[{'dataset': 'snapshot_karoo',
  'seq_id': 'KAR_S1#B03#1#610',
  'location': 'B03',
  'images': [{'id': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1692',
    'frame_num': 1,
    'file': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1692.JPG'},
   {'id': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1693',
    'frame_num': 2,
    'file': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1693.JPG'},
   {'id': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1694',
    'frame_num': 3,
    'file': 'KAR_S1/B03/B03_R1/KAR_S1_B03_R1_IMAG1694.JPG'}],
  'class': ['gemsbokoryx'],
  'datetime': '2018-01-29 00:24:23',
  'resting': 0.0,
  'standing': 0.56,
  'moving': 0.11,
  'young_present': 0.11,
  'count': '1',
  'season': 'KAR_S1',
  'subject_id': 28807253,
  'interacting': 0.0},
 {'dataset': 'snapshot_karoo',
  'seq_id': 'KAR_S1#F02#1#1128',
  'location': 'F02',
  'images': [{'id': 'KAR_S1/F02/F02_R1/KAR_S1_F02_R1_IMAG2746',
    'frame_num': 1,
    'file': 'KAR_S1/F02/F02_R1/KAR_S1_F02_R1_IMAG2746.JPG'}],
  'class': ['empty'],
  'datetime': 

In [19]:
sequences_schema_check.sequences_schema_check(karoo_db)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [7]:
non_empty_seqs = [seq for seq in karoo_db if seq['class'][0] != 'empty']
len(non_empty_seqs)

2500

In [9]:
path_prefix = 'KAR/KAR_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

6282

In [10]:
list_to_download[100]

'KAR/KAR_public/KAR_S1/A01/A01_R1/KAR_S1_A01_R1_IMAG00145.JPG\n'

In [11]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_karoo_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [13]:
%%time

dataset_name = 'snapshot_karoo'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 74.4 ms, sys: 57.2 ms, total: 132 ms
Wall time: 1.4 s


('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/E03/E03_R1/KAR_S1_E03_R1_IMAG0440.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#E03#1#168.frame1.jpg')

In [24]:
path_pairs[1000:1005]

[('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/B02/B02_R1/KAR_S1_B02_R1_IMAG0230.JPG',
  '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#B02#1#92.frame1.jpg'),
 ('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/B02/B02_R1/KAR_S1_B02_R1_IMAG0231.JPG',
  '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#B02#1#92.frame2.jpg'),
 ('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/B02/B02_R1/KAR_S1_B02_R1_IMAG0232.JPG',
  '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#B02#1#92.frame3.jpg'),
 ('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/B02/B02_R1/KAR_S1_B02_R1_IMAG0233.JPG',
  '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#B02#1#93.frame1.jpg'),
 ('/mink_disk_0/camtraps/snapshot-safari/KAR/KAR_public/KAR_S1/B02/B02_R1/KAR_S1_B02_R1_IMAG0234.JPG',
  '/mink_disk_0/camtraps/imerit12f/snapshot_karoo.seqKAR_S1#B02#1#93.frame2.jpg')]

In [25]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 13.2 s, sys: 40.8 s, total: 53.9 s
Wall time: 43.6 s


In [26]:
len(dst_paths)

6282

## Snapshot Kgalagadi

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_kgalagadi_megadb.json') as f:
    kgalagadi_db = json.load(f)

In [6]:
len(kgalagadi_db)
sample(kgalagadi_db, 2)

3566

[{'dataset': 'snapshot_kgalagadi',
  'seq_id': 'KGA_S1#B06#1#897',
  'location': 'B06',
  'images': [{'id': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2683',
    'frame_num': 1,
    'file': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2683.JPG'},
   {'id': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2684',
    'frame_num': 2,
    'file': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2684.JPG'},
   {'id': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2685',
    'frame_num': 3,
    'file': 'KGA_S1/B06/B06_R1/KGA_S1_B06_R1_IMAG2685.JPG'}],
  'class': ['empty'],
  'datetime': '2018-12-22 13:20:18',
  'resting': None,
  'standing': None,
  'moving': None,
  'young_present': None,
  'count': None,
  'season': 'KGA_S1',
  'subject_id': 33332550,
  'interacting': None},
 {'dataset': 'snapshot_kgalagadi',
  'seq_id': 'KGA_S1#B09#1#166',
  'location': 'B09',
  'images': [{'id': 'KGA_S1/B09/B09_R1/KGA_S1_B09_R1_IMAG0488',
    'frame_num': 1,
    'file': 'KGA_S1/B09/B09_R1/KGA_S1_B09_R1_IMAG0488.JPG'},
   {'id': 'KGA_S1/B09/B09_R1/KGA_

In [7]:
non_empty_seqs = [seq for seq in kgalagadi_db if seq['class'][0] != 'empty']
len(non_empty_seqs)

894

In [8]:
path_prefix = 'KGA/KGA_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

2336

In [9]:
list_to_download[100]

'KGA/KGA_public/KGA_S1/A01/A01_R1/KGA_S1_A01_R1_IMAG0230.JPG\n'

In [10]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_kgalagadi_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [11]:
%%time

dataset_name = 'snapshot_kgalagadi'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 36.4 ms, sys: 16.1 ms, total: 52.5 ms
Wall time: 614 ms


('/mink_disk_0/camtraps/snapshot-safari/KGA/KGA_public/KGA_S1/B09/B09_R1/KGA_S1_B09_R1_IMAG0586.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_kgalagadi.seqKGA_S1#B09#1#198.frame3.jpg')

In [12]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 4.54 s, sys: 13.7 s, total: 18.3 s
Wall time: 8.79 s


In [13]:
len(dst_paths)

2336

## Snapshot Enonkishu

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_enonkishu_megadb.json') as f:
    db = json.load(f)

len(db)
sample(db, 2)

12969

[{'dataset': 'snapshot_enonkishu',
  'seq_id': 'ENO_S1#D03#1#61',
  'location': 'D03',
  'images': [{'id': 'ENO_S1/D03/D03_R1/ENO_S1_D03_R1_IMAG0061',
    'frame_num': 1,
    'file': 'ENO_S1/D03/D03_R1/ENO_S1_D03_R1_IMAG0061.JPG'}],
  'class': ['empty'],
  'datetime': '2018-09-20 17:11:21',
  'resting': None,
  'standing': None,
  'moving': None,
  'young_present': None,
  'count': None,
  'season': 'ENO_S1',
  'subject_id': 31959195,
  'interacting': None},
 {'dataset': 'snapshot_enonkishu',
  'seq_id': 'ENO_S1#D04#2#288',
  'location': 'D04',
  'images': [{'id': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0710',
    'frame_num': 1,
    'file': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0710.JPG'},
   {'id': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0711',
    'frame_num': 2,
    'file': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0711.JPG'},
   {'id': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0712',
    'frame_num': 3,
    'file': 'ENO_S1/D04/D04_R2/ENO_S1_D04_R2_IMAG0712.JPG'}],
  'class': ['empty'],
  'datetime

In [6]:
non_empty_seqs = [seq for seq in db if seq['class'][0] != 'empty']
len(non_empty_seqs)

4202

In [7]:
path_prefix = 'ENO/ENO_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

9496

In [8]:
list_to_download[100]

'ENO/ENO_public/ENO_S1/B02/B02_R1/ENO_S1_B02_R1_IMAG0122.JPG\n'

In [9]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_enonkishu_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [10]:
%%time

dataset_name = 'snapshot_enonkishu'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 99.6 ms, sys: 109 ms, total: 208 ms
Wall time: 2.21 s


('/mink_disk_0/camtraps/snapshot-safari/ENO/ENO_public/ENO_S1/F06/F06_R1/ENO_S1_F06_R1_IMAG0039.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_enonkishu.seqENO_S1#F06#1#14.frame2.jpg')

In [11]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 17.8 s, sys: 1min 1s, total: 1min 19s
Wall time: 1min 5s


In [12]:
len(dst_paths)

9496

## Snapshot Camdeboo

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_camdeboo_megadb.json') as f:
    db = json.load(f)

len(db)
sample(db, 2)

12024

[{'dataset': 'snapshot_camdeboo',
  'seq_id': 'CDB_S1#C06#2#473',
  'location': 'C06',
  'images': [{'id': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1267',
    'frame_num': 1,
    'file': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1267.JPG'},
   {'id': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1268',
    'frame_num': 2,
    'file': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1268.JPG'},
   {'id': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1269',
    'frame_num': 3,
    'file': 'CDB_S1/C06/C06_R2/CDB_S1_C06_R2_IMAG1269.JPG'}],
  'class': ['kudu'],
  'resting': 0.0,
  'standing': 0.1,
  'moving': 0.9,
  'young_present': 0.0,
  'count': '1',
  'season': 'CDB_S1',
  'subject_id': 32986502,
  'interacting': 0.0},
 {'dataset': 'snapshot_camdeboo',
  'seq_id': 'CDB_S1#B04#1#63',
  'location': 'B04',
  'images': [{'id': 'CDB_S1/B04/B04_R1/CDB_S1_B04_R1_IMAG0153',
    'frame_num': 1,
    'file': 'CDB_S1/B04/B04_R1/CDB_S1_B04_R1_IMAG0153.JPG'},
   {'id': 'CDB_S1/B04/B04_R1/CDB_S1_B04_R1_IMAG0154',
    'frame_num': 2,
    'f

In [6]:
non_empty_seqs = [seq for seq in db if seq['class'][0] != 'empty']
len(non_empty_seqs)

7193

In [7]:
path_prefix = 'CDB/CDB_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

16864

In [8]:
list_to_download[100]

'CDB/CDB_public/CDB_S1/A05/A05_R1/CDB_S1_A05_R1_IMAG0112.JPG\n'

In [9]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_camdeboo_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [10]:
%%time

dataset_name = 'snapshot_camdeboo'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 158 ms, sys: 177 ms, total: 334 ms
Wall time: 2.84 s


('/mink_disk_0/camtraps/snapshot-safari/CDB/CDB_public/CDB_S1/F03/F03_R2/CDB_S1_F03_R2_IMAG0752.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_camdeboo.seqCDB_S1#F03#2#280.frame1.jpg')

In [11]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 29.7 s, sys: 1min 45s, total: 2min 15s
Wall time: 2min 10s


In [12]:
len(dst_paths)

16864

## Snapshot Mountain Zebra

In [5]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_mountain_zebra_megadb.json') as f:
    db = json.load(f)

len(db)
sample(db, 2)

71178

[{'dataset': 'snapshot_mountain_zebra',
  'seq_id': 'MTZ_S1#E05#1#4004',
  'location': 'E05',
  'images': [{'id': 'MTZ_S1/E05/E05_R1/MTZ_S1_E05_R1_IMAG4004',
    'frame_num': 1,
    'file': 'MTZ_S1/E05/E05_R1/MTZ_S1_E05_R1_IMAG4004.JPG'}],
  'class': ['mongooseyellow'],
  'datetime': '2017-09-27 15:35:50',
  'resting': 0.0,
  'standing': 0.33,
  'moving': 0.67,
  'young_present': 0.0,
  'count': '1',
  'season': 'MTZ_S1',
  'subject_id': 29975409,
  'interacting': 0.0},
 {'dataset': 'snapshot_mountain_zebra',
  'seq_id': 'MTZ_S1#B05#3#7111',
  'location': 'B05',
  'images': [{'id': 'MTZ_S1/B05/B05_R3/MTZ_S1_B05_R3_IMAG7111',
    'frame_num': 1,
    'file': 'MTZ_S1/B05/B05_R3/MTZ_S1_B05_R3_IMAG7111.JPG'}],
  'class': ['empty'],
  'datetime': '2018-04-29 20:01:08',
  'resting': None,
  'standing': None,
  'moving': None,
  'young_present': None,
  'count': None,
  'season': 'MTZ_S1',
  'subject_id': 32757448,
  'interacting': None}]

In [6]:
non_empty_seqs = [seq for seq in db if seq['class'][0] != 'empty']
len(non_empty_seqs)

5593

In [7]:
path_prefix = 'MTZ/MTZ_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

5919

In [8]:
list_to_download[100]

'MTZ/MTZ_public/MTZ_S1/B04/B04_R1/MTZ_S1_B04_R1_IMAG0256.JPG\n'

In [9]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_mountain_zebra_files.txt', 'w') as f:
    f.writelines(list_to_download)

Short sequences...

In [10]:
%%time

dataset_name = 'snapshot_mountain_zebra'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 47.4 ms, sys: 76.2 ms, total: 124 ms
Wall time: 1.15 s


('/mink_disk_0/camtraps/snapshot-safari/MTZ/MTZ_public/MTZ_S1/G04/G04_R1/MTZ_S1_G04_R1_IMAG3027.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_mountain_zebra.seqMTZ_S1#G04#1#2991.frame1.jpg')

In [None]:
path_pairs[3000:3100]

In [16]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 13 s, sys: 45 s, total: 58 s
Wall time: 55 s


## Snapshot Kruger

In [4]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/snapshot_kruger_megadb.json') as f:
    db = json.load(f)

len(db)
sample(db, 2)

4568

[{'dataset': 'snapshot_kruger',
  'seq_id': 'KRU_S1#8#1#489',
  'location': '8',
  'images': [{'id': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1401',
    'frame_num': 1,
    'file': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1401.JPG'},
   {'id': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1402',
    'frame_num': 2,
    'file': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1402.JPG'},
   {'id': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1403',
    'frame_num': 3,
    'file': 'KRU_S1/8/8_R1/KRU_S1_8_R1_IMAG1403.JPG'}],
  'class': ['empty'],
  'datetime': '2018-06-28 13:13:30',
  'resting': None,
  'standing': None,
  'moving': None,
  'young_present': None,
  'count': None,
  'season': 'KRU_S1',
  'subject_id': 31940127,
  'interacting': None},
 {'dataset': 'snapshot_kruger',
  'seq_id': 'KRU_S1#5#1#38',
  'location': '5',
  'images': [{'id': 'KRU_S1/5/5_R1/KRU_S1_5_R1_IMAG0060',
    'frame_num': 1,
    'file': 'KRU_S1/5/5_R1/KRU_S1_5_R1_IMAG0060.JPG'}],
  'class': ['impala'],
  'datetime': '2018-06-03 19:02:16',
  'resting': 0.1,
  'standing': 0.7

In [5]:
non_empty_seqs = [seq for seq in db if seq['class'][0] != 'empty']
len(non_empty_seqs)

1654

In [6]:
path_prefix = 'KRU/KRU_public'

# One newline-terminated, prefixed path per image of every non-empty sequence.
list_to_download = [
    os.path.join(path_prefix, im['file']) + '\n'
    for seq in non_empty_seqs
    for im in seq['images']
]
len(list_to_download)

3540

In [7]:
list_to_download[100]

'KRU/KRU_public/KRU_S1/1/1_R1/KRU_S1_1_R1_IMAG0125.JPG\n'

In [8]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/snapshot_kruger_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [9]:
%%time

dataset_name = 'snapshot_kruger'
container_root = '/mink_disk_0/camtraps/snapshot-safari'

# (source, destination) pairs for every image of every non-empty sequence.
path_pairs = []

for seq in non_empty_seqs:
    seq_id = seq['seq_id']
    for im in seq['images']:

        # `path_prefix` is the dataset folder prefix set in the download-list cell.
        src_path = os.path.join(container_root, path_prefix, im['file'])
        # Stop before any copying starts if an image is missing, and say which one.
        assert os.path.exists(src_path), f'Source image not found: {src_path}'

        frame = im['frame_num']

        # The destination file name encodes dataset, sequence ID and frame number.
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f',
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

len(path_pairs)
path_pairs[-50]

CPU times: user 47.8 ms, sys: 22.6 ms, total: 70.3 ms
Wall time: 590 ms


('/mink_disk_0/camtraps/snapshot-safari/KRU/KRU_public/KRU_S1/9/9_R1/KRU_S1_9_R1_IMAG0065.JPG',
 '/mink_disk_0/camtraps/imerit12f/snapshot_kruger.seqKRU_S1#9#1#23.frame2.jpg')

In [12]:
%%time

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 15.2 s, sys: 53.8 s, total: 1min 8s
Wall time: 1min 10s
