In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Automatically reload imported modules before running code, so edits to the repo are picked up
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Make the CameraTraps repo and the ai4eutils helpers importable from this notebook
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [3]:
import json
import os
import random
from collections import Counter, defaultdict
from multiprocessing.pool import ThreadPool
from random import sample
from shutil import copyfile

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# islandconservation_midway_2020

In [16]:
# Dataset identifier stored in every sequence item
dataset_name = 'islandconservation_midway_2020'

# Root folder the images are read from, and the sub-folder for this batch
container_root = '/mink_disk_0/camtraps/island-conservation-private/'
path_prefix = 'training-2021.02.01/'

# Where the sequences JSON is written (`_temp` is the intermediate copy)
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

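For the `MidwayBoninNight` images, the camera name is part of the file name: it is the text before the first dot. The `Lehua` and `Monitoreo petreles` locations are derived from the folder structure instead.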

There are no CSV labels, so every sequence is assigned the placeholder class `__label_unavailable`.

In [17]:
# List every file under the dataset folder, then keep only the images,
# stored as sorted paths relative to that folder.
folder = os.path.join(container_root, path_prefix)

all_files = path_utils.recursive_file_list(folder)
len(all_files)

# os.path.relpath strips the folder prefix without depending on how often the
# folder string occurs inside a path (str.split would cut at every occurrence)
paths = sorted(os.path.relpath(p, folder) for p in all_files if path_utils.is_image_file(p))
len(paths)

177782

177778

In [18]:
# Spot-check one relative image path (arbitrary index)
paths[107778]

'Monitoreo petreles/Floreana_PetrelCameraData/CAM2/CAM2_20AGO2020/Floreana_CAM2_20AGO2020_2020-06-26 10-15-18_IMG_2223 (2).JPG'

### Lehua folder

In [23]:
# Lehua folder: the 2017 sub-folder holds one directory per camera location,
# whereas the two 2016 sub-folders are from the same location.

folder_name = 'Lehua'
lehua_sequences = []

lehua_paths = (rel_path for rel_path in paths if rel_path.startswith('Lehua'))
for seq_index, rel_path in enumerate(lehua_paths):
    path_components = rel_path.split('/')
    # path_components[1] is the year folder; only 2017 is split by camera location
    if path_components[1] == '2017':
        location = path_components[2]
    else:
        location = 'Lehua2016'

    # One single-image dummy sequence per file
    lehua_sequences.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_{folder_name}{seq_index}',
        'images': [
            {
                'file': rel_path,
                'frame_num': 1  # only one image, but easier for ingesting the annotations
            }
        ],
        'location': location,
        'class': ['__label_unavailable']
    })

# Distinct location names seen in this folder
lehua_locations = {seq['location'] for seq in lehua_sequences}

In [25]:
# Sanity check: number of distinct Lehua locations and of Lehua sequences
len(lehua_locations)
len(lehua_sequences)

10

14388

In [26]:
# Spot-check one Lehua sequence item (arbitrary index)
lehua_sequences[1300]

{'dataset': 'islandconservation_midway_2020',
 'seq_id': 'dummy_Lehua1300',
 'images': [{'file': 'Lehua/2017/5W Aug 2017/IMG_0138.JPG', 'frame_num': 1}],
 'location': '5W Aug 2017',
 'class': ['__label_unavailable']}

### Monitoreo petreles

In [29]:
# Monitoreo petreles folder: the location is "<region>_<camera>", where the
# region is the text before the first underscore of the second path component
# and the camera is the text before the first underscore of the third one.
folder_name = 'Monitoreo petreles'
mon_locations = set()
mon_sequences = []

for image_path in paths:
    if image_path.startswith('Monitoreo'):
        parts = image_path.split('/')
        region_dir, camera_dir = parts[1], parts[2]
        location = f"{region_dir.split('_')[0]}_{camera_dir.split('_')[0]}"

        mon_locations.add(location)

        image_record = {
            'file': image_path,
            'frame_num': 1  # only one image, but easier for ingesting the annotations
        }
        # One single-image dummy sequence per file, numbered in insertion order
        mon_sequences.append({
            'dataset': dataset_name,
            'seq_id': f'dummy_{folder_name}{len(mon_sequences)}',
            'images': [image_record],
            'location': location,
            'class': ['__label_unavailable']
        })

In [30]:
# Sanity check: number of distinct Monitoreo petreles locations and sequences
len(mon_locations)
len(mon_sequences)

20

121400

In [31]:
# Spot-check one Monitoreo petreles sequence item (arbitrary index)
mon_sequences[-1000]

{'dataset': 'islandconservation_midway_2020',
 'seq_id': 'dummy_Monitoreo petreles120400',
 'images': [{'file': 'Monitoreo petreles/SantaCruz_PetrelCameraData/CAM31/CAM31_21Sep2020/SantaCruz_CAM31_21Sep2020_2020-08-11 19-00-00_IMG_0255.JPG',
   'frame_num': 1}],
 'location': 'SantaCruz_CAM31',
 'class': ['__label_unavailable']}

### MidwayBoninNight 

In [32]:
# MidwayBoninNight folder: the camera name is the text before the first dot of
# the second path component, and the location is "<folder_name>_<camera>".
folder_name = 'MidwayBoninNight'

bon_paths = [rel_path for rel_path in paths if rel_path.startswith('MidwayB')]
bon_cameras = [rel_path.split('/')[1].split('.')[0] for rel_path in bon_paths]

# One single-image dummy sequence per file, numbered in path order
bon_sequences = [
    {
        'dataset': dataset_name,
        'seq_id': f'dummy_{folder_name}{seq_index}',
        'images': [
            {
                'file': rel_path,
                'frame_num': 1  # only one image, but easier for ingesting the annotations
            }
        ],
        'location': f'{folder_name}_{camera}',
        'class': ['__label_unavailable']
    }
    for seq_index, (rel_path, camera) in enumerate(zip(bon_paths, bon_cameras))
]

# Distinct location names seen in this folder
bon_locations = {seq['location'] for seq in bon_sequences}

In [33]:
# Sanity check: number of distinct MidwayBoninNight locations and sequences
len(bon_locations)
len(bon_sequences)

14

41990

In [36]:
# Combine the per-folder sequence lists into the full list for this dataset
sequences = lehua_sequences + mon_sequences + bon_sequences

In [37]:
# Total number of sequence items (one per image, since every sequence holds a single image)
len(sequences)

177778

In [44]:
# Sum of the per-folder location counts
len(lehua_locations) + len(mon_locations) + len(bon_locations)

44

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [38]:
%%time

# Validate all sequence items; two confirmation messages are printed on success
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 21.1 s, sys: 0 ns, total: 21.1 s
Wall time: 21.1 s


In [39]:
# Save the sequences to the temporary JSON file; ensure_ascii=False writes
# non-ASCII characters as-is instead of \u escapes
with open(path_to_output_temp, mode='w', encoding='utf-8') as json_file:
    json.dump(sequences, fp=json_file, indent=1, ensure_ascii=False)

### Step 2b - copy images to flat folder

In [40]:
def copy_file(src_path, dst_path):
    """Copy the contents of the file at `src_path` to `dst_path`.

    Thin wrapper around `shutil.copyfile`.

    Args:
        src_path: path of the file to read.
        dst_path: path of the file to write.

    Returns:
        The value returned by `shutil.copyfile` (the destination path).
    """
    copied_to = copyfile(src_path, dst_path)
    return copied_to

In [41]:
%%time

path_pairs = []
for seq in tqdm(sequences):
    seq_id = seq['seq_id']
    for im in seq['images']:
        src_path = os.path.join(container_root, path_prefix, im['file'])
        assert os.path.exists(src_path), src_path
        frame = 1
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12b', 
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

100%|██████████| 177778/177778 [00:22<00:00, 7741.71it/s]

CPU times: user 2 s, sys: 1.54 s, total: 3.54 s
Wall time: 23 s





In [42]:
# Sanity check: number of (source, destination) pairs and one example pair
len(path_pairs)
path_pairs[-100]

177778

('/mink_disk_0/camtraps/island-conservation-private/training-2021.02.01/MidwayBoninNight/Sec37.Sec37-2_Jan25-Feb17.IMG_6232.JPG',
 '/mink_disk_0/camtraps/imerit12b/islandconservation_midway_2020.seqdummy_MidwayBoninNight41890.frame1.jpg')

In [43]:
# Draw a reproducible random subset of the copy jobs
SAMPLE_SIZE = 150000
SAMPLE_SEED = 0

# A dedicated seeded generator makes re-runs select the same files; capping the
# size avoids the ValueError raised when more items are requested than exist
sample_path_pairs = random.Random(SAMPLE_SEED).sample(path_pairs, min(SAMPLE_SIZE, len(path_pairs)))

len(sample_path_pairs)
sample_path_pairs[-100]

150000

('/mink_disk_0/camtraps/island-conservation-private/training-2021.02.01/Monitoreo petreles/Floreana_PetrelCameraData/CAM16/CAM16_10DIC2020/Floreana_CAM16_10DIC2020_2020-10-28 11-01-25_10280385.JPG',
 '/mink_disk_0/camtraps/imerit12b/islandconservation_midway_2020.seqdummy_Monitoreo petreles23145.frame1.jpg')

In [45]:
%%time

# Copy the sampled files using 8 worker threads; starmap unpacks each
# (src_path, dst_path) pair into the two arguments of copy_file
with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, sample_path_pairs)

CPU times: user 1min 44s, sys: 5min 33s, total: 7min 17s
Wall time: 20min 53s


In [46]:
# Number of results returned by the copy jobs (one per sampled pair)
len(dst_paths)

150000