In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo and the ai4eutils helpers importable from this notebook.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# fws_hawaii_kauai_forest_birds_a24s

In [4]:
# Name of the dataset being ingested; used in every sequence item and in output file names.
dataset_name = 'fws_hawaii_kauai_forest_birds_a24s'

# Local root of the uploaded images and the sub-folder that holds this dataset.
container_root = '/mink_disk_0/camtraps/hawaii-fws-upload/'
path_prefix = 'A24s/'

# JSON output locations for the sequence items (final and temporary versions).
path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' 
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' 

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

No species or sequence information is available for this dataset.

The location is inferred from the folder structure; the rule differs between the two sub-folders.

In [5]:
# Folder on local disk that holds this dataset.
folder = os.path.join(container_root, path_prefix)

# Everything returned by the recursive listing of that folder.
paths = path_utils.recursive_file_list(folder)
len(paths)

# Keep image files only, expressed relative to `folder`, in sorted order.
paths = [p.split(folder)[1] for p in paths if path_utils.is_image_file(p)]
paths.sort()
len(paths)

315362

313769

In [6]:
# Spot-check the form of one relative image path.
paths[100]

'HPK/100RECNX/IMG_0101.JPG'

### HPK folder

In [7]:
# Build one single-image "dummy" sequence item per image under HPK/, and
# collect the set of location names seen along the way.
hpk_sequences = []
hpk_locations = set()

for p in paths:
    if not p.startswith('HPK'):
        continue

    p_parts = p.split('/')

    # The folder directly under HPK/ names the location. Folders starting with
    # '10' (e.g. 100RECNX) hold images directly under HPK/ and are pooled
    # under the single location 'HPK_root'.
    location = p_parts[1]
    if location.startswith('10'):
        location = 'HPK_root'
    hpk_locations.add(location)

    # Strip only the file extension. Splitting on the first '.' would cut the
    # ID short at any dot in a folder or file name and could make IDs collide.
    stem = os.path.splitext('_'.join(p_parts[1:]))[0]
    seq_id = 'dummy_' + stem.replace(' ', '')

    hpk_sequences.append({
        'dataset': dataset_name,
        'seq_id': seq_id,
        'location': location,
        'class': ['__label_unavailable'],
        'images': [
            {
                'file': p,
                'frame_num': 1
            }
        ]
    })

In [8]:
# Number of distinct HPK locations, followed by the names themselves.
len(hpk_locations)
hpk_locations

13

{'Camera 1',
 'Camera 2',
 'Camera 4',
 'Camera 5',
 'Camera 6',
 'Camera 7',
 'Camera 8',
 'Camera 9',
 'Fall 2017_feral cat',
 'G8 Camera',
 'H2 Rat Cam 2017 or 2018',
 'HPK_root',
 'misc HPK trap monitoring 2017ish'}

In [9]:
# Number of HPK sequence items, then three randomly drawn items as a spot check.
len(hpk_sequences)
sample(hpk_sequences, 3)

80584

[{'dataset': 'fws_hawaii_kauai_forest_birds_a24s',
  'seq_id': 'dummy_G8Camera_DCIM_100EK113_PICT1029',
  'location': 'G8 Camera',
  'class': ['__label_unavailable'],
  'images': [{'file': 'HPK/G8 Camera/DCIM/100EK113/PICT1029.JPG',
    'frame_num': 1}]},
 {'dataset': 'fws_hawaii_kauai_forest_birds_a24s',
  'seq_id': 'dummy_101RECNX_IMG_4491',
  'location': 'HPK_root',
  'class': ['__label_unavailable'],
  'images': [{'file': 'HPK/101RECNX/IMG_4491.JPG', 'frame_num': 1}]},
 {'dataset': 'fws_hawaii_kauai_forest_birds_a24s',
  'seq_id': 'dummy_Camera9_100RECNX_IMG_0545',
  'location': 'Camera 9',
  'class': ['__label_unavailable'],
  'images': [{'file': 'HPK/Camera 9/100RECNX/IMG_0545.JPG', 'frame_num': 1}]}]

### MOH folder

In [10]:
# Build one single-image "dummy" sequence item per image under MOH/, and
# collect the set of location names seen along the way.
moh_sequences = []
moh_locations = set()

for p in paths:
    if not p.startswith('MOH'):
        continue

    p_parts = p.split('/')

    if p_parts[1] == 'Spring19':
        # e.g. 'MOH/Spring19/May30-June3/ME21 (ATC 01)/...' -> 'ME21'
        location = p_parts[3].split('(')[0].strip()
    else:  # Pilot Study Winter 2018-19
        # e.g. 'MOH/Pilot Study Winter 2018-19/MD16/...' -> 'MD16'
        location = p_parts[2]
    # Drop a trailing lowercase 'a' or 'b' so both variants share one location
    # name (presumably two cameras at the same station - TODO confirm).
    if location.endswith(('a', 'b')):
        location = location[:-1]

    moh_locations.add(location)

    # Strip only the file extension. Splitting on the first '.' would cut the
    # ID short at any dot in a folder or file name and could make IDs collide.
    stem = os.path.splitext('_'.join(p_parts[1:]))[0]
    seq_id = 'dummy_' + stem.replace(' ', '')

    moh_sequences.append({
        'dataset': dataset_name,
        'seq_id': seq_id,
        'location': location,
        'class': ['__label_unavailable'],
        'images': [
            {
                'file': p,
                'frame_num': 1
            }
        ]
    })

In [11]:
# Number of distinct MOH locations, followed by the names themselves.
len(moh_locations)
moh_locations

89

{'CMA02',
 'CMA03',
 'CMA04',
 'CMA05',
 'CMA07',
 'CMA09',
 'CMA10',
 'CMA11',
 'CMA13',
 'CMB01',
 'CMD03',
 'CMD04',
 'CMD05',
 'CMD06',
 'CMD07',
 'CMD08',
 'CMD09',
 'CMD10',
 'CMD11',
 'CMD12',
 'CMD13',
 'CMD14',
 'CMD15',
 'CMG16',
 'CMG17',
 'CMG18',
 'CMG19',
 'CMG20',
 'CMG21',
 'CMG22',
 'CMG23',
 'CMG24',
 'CMG25',
 'CMG26',
 'MD01',
 'MD08',
 'MD16',
 'MD17',
 'MD18',
 'MD19',
 'ME01',
 'ME02',
 'ME03',
 'ME04',
 'ME06',
 'ME07',
 'ME08',
 'ME10',
 'ME12',
 'ME13',
 'ME15',
 'ME16',
 'ME17',
 'ME18',
 'ME20',
 'ME21',
 'MF03',
 'MF04',
 'MF05',
 'MG03',
 'MG04',
 'MG05',
 'MG07',
 'MG08',
 'MG09',
 'MG10',
 'MG12',
 'MG13',
 'MG15',
 'MH01',
 'MH02',
 'MH03',
 'MH04',
 'MH05',
 'MH06',
 'MH07',
 'MH08',
 'MH09',
 'MH11',
 'MH12',
 'MI01',
 'MI02',
 'MI03',
 'MJ01',
 'MJ02',
 'MJ04',
 'MJ05',
 'MJ07',
 'MJ10'}

In [12]:
# Number of MOH sequence items, then two randomly drawn items as a spot check.
len(moh_sequences)
sample(moh_sequences, 2)

233185

[{'dataset': 'fws_hawaii_kauai_forest_birds_a24s',
  'seq_id': 'dummy_Spring19_April5-10_MH09_R05_IMG_1708',
  'location': 'MH09',
  'class': ['__label_unavailable'],
  'images': [{'file': 'MOH/Spring19/April5-10/MH09/R05/IMG_1708.JPG',
    'frame_num': 1}]},
 {'dataset': 'fws_hawaii_kauai_forest_birds_a24s',
  'seq_id': 'dummy_PilotStudyWinter2018-19_MD16_2ndRoundJan_CMD16B_DCIM_101EK113_EK005325',
  'location': 'MD16',
  'class': ['__label_unavailable'],
  'images': [{'file': 'MOH/Pilot Study Winter 2018-19/MD16/2nd Round Jan/CMD16B/DCIM/101EK113/EK005325.JPG',
    'frame_num': 1}]}]

### Combine the two folders

In [13]:
# All sequence items for the dataset: HPK items first, then MOH items.
sequences = [*hpk_sequences, *moh_sequences]

In [14]:
# Should equal the number of image paths found earlier, since each image is its own sequence.
len(sequences)

313769

In [15]:
# Total number of locations; the HPK and MOH name sets displayed above share no entries.
len(hpk_locations) + len(moh_locations)

102

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [16]:
%%time

# Validate the sequence items against the MegaDB schema; confirmation messages are printed on success.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 37.5 s, sys: 1.89 ms, total: 37.5 s
Wall time: 37.5 s


In [17]:
# Write the validated sequence items to the `_temp` JSON path.
with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False, indent=1)

### Step 2b - sample images

We want to sample about 100k images in total, drawing from every lowest-level folder, while keeping consecutive images together so annotators can see the sequences if the images are naturally sequential.

In [18]:
# Group image basenames by the folder that directly contains them. Within each
# folder the basenames arrive in sorted order, because `sequences` was built
# from the sorted list of paths.
folder_to_basenames = defaultdict(list)

for seq in sequences:
    # Only the first image of each sequence is read (every sequence here has one image).
    folder, basename = os.path.split(seq['images'][0]['file'])
    folder_to_basenames[folder].append(basename)

In [19]:
# Number of distinct folders that directly contain images.
len(folder_to_basenames)

311

In [20]:
# In every folder, keep one batch of consecutive images out of every three,
# always starting with the first batch.
batch_size = 10  # consecutive images kept together
keep_every = 3   # one batch kept out of this many

images_to_include = []

for folder, basenames in tqdm(folder_to_basenames.items()):
    # Stepping by batch_size * keep_every lands on batches 0, 3, 6, ...
    for start in range(0, len(basenames), batch_size * keep_every):
        images_to_include.extend(
            os.path.join(folder, name) for name in basenames[start: start + batch_size]
        )

100%|██████████| 311/311 [00:00<00:00, 1839.35it/s]


In [21]:
# Number of sampled images.
len(images_to_include)
# Set version for fast membership tests later; equal lengths mean no path was selected twice.
set_images_to_include = set(images_to_include)
len(set_images_to_include)

105670

105670

In [22]:
# Inspect the last 20 selected paths to see how the batches of consecutive images look.
images_to_include[-20:]

['MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0301.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0302.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0303.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0304.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0305.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0306.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0307.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0308.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0309.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0310.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0331.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0332.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0333.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0334.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0335.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0336.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0337.JPG',
 'MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX033

### Step 2c - copy images to flat folder

In [23]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path`, creating the destination folder if needed.

    Args:
        src_path: path of the existing file to copy.
        dst_path: full path, including the file name, of the copy to create.

    Returns:
        The destination path, as returned by `shutil.copyfile`.

    Raises:
        OSError: if the source cannot be read or the destination cannot be written.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:  # a bare file name has no folder part to create
        # exist_ok=True keeps this safe when several worker threads reach it at once.
        os.makedirs(dst_dir, exist_ok=True)
    return copyfile(src_path, dst_path)

In [24]:
%%time

# Build the (source, destination) pair for every sampled image. Destinations
# live in one flat folder and encode dataset, sequence ID and frame number.
dst_folder = '/mink_disk_0/camtraps/imerit12c'

path_pairs = []
for seq in tqdm(sequences):
    seq_id = seq['seq_id']
    for im in seq['images']:

        if im['file'] not in set_images_to_include:
            continue

        src_path = os.path.join(container_root, path_prefix, im['file'])
        assert os.path.exists(src_path), src_path
        # Use the image's own frame number rather than a hard-coded 1, so that
        # frames of a multi-image sequence cannot map to the same destination.
        frame = im['frame_num']
        dst_path = os.path.join(dst_folder,
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

100%|██████████| 313769/313769 [00:24<00:00, 12731.21it/s]

CPU times: user 1.85 s, sys: 934 ms, total: 2.79 s
Wall time: 24.6 s





In [25]:
# Number of (source, destination) pairs queued for copying, then one pair as a spot check.
len(path_pairs)
path_pairs[-100]

105670

('/mink_disk_0/camtraps/hawaii-fws-upload/A24s/MOH/Spring19/May30-June3/ME21 (ATC 01)/RCNX0061.JPG',
 '/mink_disk_0/camtraps/imerit12c/fws_hawaii_kauai_forest_birds_a24s.seqdummy_Spring19_May30-June3_ME21(ATC01)_RCNX0061.frame1.jpg')

In [26]:
# Length in characters of one destination file name - presumably checked against a
# file-name length limit (TODO confirm which limit applies).
len('fws_hawaii_kauai_forest_birds_a24s.seqdummy_Spring19_May30-June3_ME21(ATC01)_RCNX0061.frame1.jpg')

96

In [27]:
%%time

# Copy all queued files using a pool of 8 worker threads. starmap unpacks each
# (src_path, dst_path) pair into copy_file's two arguments, blocks until every
# copy has finished, and returns the results in input order.
with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 51 s, sys: 2min 19s, total: 3min 10s
Wall time: 8min 37s


In [28]:
# One result per copied file; should match the number of queued path pairs.
len(dst_paths)

105670