In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically so that edits to the repo code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo and the ai4eutils helpers importable from this notebook.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# umn_gomez

In [4]:
# Dataset identifier; stored in every sequence item and used to name the output files.
dataset_name = 'umn_gomez'

# Local folder holding the dataset; the label CSV and the image folder are read from it.
container_root = '/mink_disk_0/camtraps/umn-gomez/'
# Sub-folder of container_root that contains the images.
path_prefix = 'raw-pictures-habfrag/'

# Output locations for the sequences JSON. Only the *_temp path is written in the
# part of the notebook shown here.
path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json'
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

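Species labels are provided in a CSV, but unfortunately there is no sequence information. The location is encoded in the file name (its first path component). The CSV compares the MegaDetector (MD) results with manually labeled classes (species, plus empty/human/animal).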

Not all images in the folder in blob storage have entries in the CSV.

In [5]:
# CSV with the per-image MegaDetector results and manual labels.
# NOTE(review): the file name starts with 'row-' while the image folder is 'raw-';
# presumably this matches the name on disk since the load below succeeded - confirm.
csv_path = os.path.join(container_root, 'row-pictures-habfrag_all_images_md_vs_manual_label.csv')

In [6]:
# Load the label CSV into a DataFrame.
df = pd.read_csv(csv_path)

In [7]:
len(df)  # number of rows in the CSV
df.sample(2)  # peek at two random rows; no random_state is set, so they differ between runs

75557

Unnamed: 0,filename,timestamp,common_name_wi,megaD_max_confidence,md_class,true_class
66338,N27/09060810.JPG,2019-09-06 06:33:13,South American Coati,1.0,Animal,Animal
41449,N14/11230318.JPG,2019-11-23 13:52:28,Collared Peccary,1.0,Animal,Animal


In [8]:
# Distinct values of the coarse class column.
df['true_class'].unique()

array(['Animal', 'Blank', 'Human'], dtype=object)

In [9]:
# Distinct species labels; these are lower-cased and used as the sequence class further down.
df['common_name_wi'].unique()

array(['Rodent', "Spix's Guan", 'Blank', 'Bird', 'Unknown species',
       'Black Agouti', 'Tayra', 'Southern Tamandua', 'Dasypus Species',
       'Human', "Salvin's Curassow", 'Nine-banded Armadillo',
       'Collared Peccary', 'Ocelot', 'Giant Anteater', 'Cervidae Family',
       'Bos Species', 'Bush Dog', 'South American Coati', 'Lowland Tapir',
       'Amazonian Motmot', 'Possum Family', 'Peccary Family',
       'Caprimulgidae Family', 'Giant Armadillo',
       'Northern Amazon Red Squirrel', 'White-tailed Deer',
       'Spotted Paca', 'Red Brocket', 'Mazama Species', 'Mammal',
       'Lizards and Snakes', 'White-lipped Peccary',
       'Columbiformes Order', 'Aphelocoma Species', 'No CV Result',
       'Giant Ameiva', 'Sciuridae Family', 'Armadillo Family',
       'Turtle Order', 'Dasyprocta Species', 'Tamandua Species', 'Puma',
       'Tortoise Family', 'Margarita Island Capuchin', 'Weasel Family',
       'Jaguar', 'Saimiri Species', 'Coendou Species',
       'Razor-billed Curass

In [10]:
# Folder that holds the images.
folder = os.path.join(container_root, path_prefix)

# Every file under the image folder (absolute paths), before any filtering.
all_files = path_utils.recursive_file_list(folder)
len(all_files)

# Keep image files that are not hidden (dot-prefixed) and express them relative to
# `folder`. os.path.relpath is used instead of splitting the string on the folder
# prefix, which raises IndexError or returns the wrong piece if the prefix is
# absent or occurs more than once in a path.
paths = sorted(
    os.path.relpath(p, folder)
    for p in all_files
    if path_utils.is_image_file(p) and not os.path.basename(p).startswith('.')
)
len(paths)

111878

111829

In [11]:
# Spot-check one of the relative image paths.
paths[-100]

'N36/11210994.JPG'

In [18]:
# Build one single-image "dummy" sequence item per CSV row, since the CSV carries
# no sequence information.
locations = set()
sequences = []

# df.iterrows() is a generator, so the row count is passed explicitly; without it
# tqdm cannot show a percentage or an ETA.
for i_row, row in tqdm(df.iterrows(), total=len(df)):

    fn = row['filename']
    species = row['common_name_wi'].lower()
    if species == 'blank':
        species = 'empty'  # label that the later cells test for when skipping empty images

    # The first path component of the file name is the location.
    location = fn.split('/')[0]
    locations.add(location)

    # Strip only the file extension. Splitting on the first '.' would truncate any
    # name that contains another dot and could make two sequence IDs collide.
    seq_id = os.path.splitext(fn)[0].replace('/', '_')

    sequences.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_{seq_id}',
        'images': [
            {
                'file': fn,
                'frame_num': 1, # only one image, but easier for ingesting the annotations
                'datetime': row['timestamp'], # extra field,
                'mdv4_class': row['md_class'],
                'mdv4_conf': row['megaD_max_confidence'],
                'true_obj_class': row['true_class']
            }
        ],
        'location': location,
        'class': [species]
    })

75557it [00:09, 8005.09it/s]


In [19]:
# Count the sequences whose class label is 'empty'.
num_empty = sum(seq['class'][0] == 'empty' for seq in sequences)
num_empty

13132

In [20]:
# Fraction of sequences labeled empty. The denominator is taken from the data
# instead of a hard-coded count so the cell stays correct if the CSV changes.
num_empty / len(sequences)

0.17380255965694774

In [21]:
len(locations)  # number of distinct locations found in the file names

44

In [22]:
len(sequences)  # one sequence item per CSV row
sequences[20000:20003]  # eyeball a few items before running the schema check

75557

[{'dataset': 'umn_gomez',
  'seq_id': 'dummy_M05_08070432',
  'images': [{'file': 'M05/08070432.JPG',
    'frame_num': 1,
    'datetime': '2019-08-07 15:58:52',
    'mdv4_class': 'Animal',
    'mdv4_conf': 1.0,
    'true_obj_class': 'Animal'}],
  'location': 'M05',
  'class': ['collared peccary']},
 {'dataset': 'umn_gomez',
  'seq_id': 'dummy_M05_08070433',
  'images': [{'file': 'M05/08070433.JPG',
    'frame_num': 1,
    'datetime': '2019-08-07 15:58:53',
    'mdv4_class': 'Animal',
    'mdv4_conf': 1.0,
    'true_obj_class': 'Animal'}],
  'location': 'M05',
  'class': ['collared peccary']},
 {'dataset': 'umn_gomez',
  'seq_id': 'dummy_M05_08070434',
  'images': [{'file': 'M05/08070434.JPG',
    'frame_num': 1,
    'datetime': '2019-08-07 15:58:54',
    'mdv4_class': 'Animal',
    'mdv4_conf': 1.0,
    'true_obj_class': 'Animal'}],
  'location': 'M05',
  'class': ['collared peccary']}]

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [23]:
%%time

# Run the MegaDB schema check on the sequence items built above.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 9.64 s, sys: 15.6 ms, total: 9.66 s
Wall time: 9.66 s


In [24]:
# Save the checked sequence items to the temporary output file.
# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False, indent=1)

### Step 2b - copy non-empties to flat folder

In [25]:
def copy_file(src_path, dst_path):
    """Copy the file at src_path to dst_path, creating the destination folder if needed.

    Args:
        src_path: path of the file to copy.
        dst_path: path of the copy to create; an existing file there is replaced.

    Returns:
        The destination path, as returned by shutil.copyfile.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:
        # copyfile does not create missing parent folders; exist_ok makes this
        # safe when several worker threads target the same folder.
        os.makedirs(dst_dir, exist_ok=True)
    return copyfile(src_path, dst_path)

In [29]:
%%time

path_pairs = []
for seq in tqdm(sequences):
    
    if seq['class'][0] == 'empty':
        continue
    
    seq_id = seq['seq_id']
    
    im = seq['images'][0]
    frame = im['frame_num']
    
    src_path = os.path.join(container_root, path_prefix, im['file'])
    assert os.path.exists(src_path), src_path

    dst_path = os.path.join('/mink_disk_0/camtraps/imerit12d', 
                            f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
    path_pairs.append((src_path, dst_path))

100%|██████████| 75557/75557 [00:05<00:00, 12818.89it/s]

CPU times: user 675 ms, sys: 259 ms, total: 934 ms
Wall time: 5.9 s





In [30]:
len(path_pairs)  # non-empty images to copy: 62425 of 75557 (~83%) in the recorded run
path_pairs[3000]

62425

('/mink_disk_0/camtraps/umn-gomez/raw-pictures-habfrag/M00/10090027.JPG',
 '/mink_disk_0/camtraps/imerit12d/umn_gomez.seqdummy_M00_10090027.frame1.jpg')

In [31]:
%%time

with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 35.4 s, sys: 1min 40s, total: 2min 15s
Wall time: 5min 49s


In [32]:
len(dst_paths)  # number of copy results returned; should equal len(path_pairs)

62425