In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Load the autoreload extension; mode 2 reloads modules before code is executed,
# so edits to the imported repo code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo and the ai4eutils library importable from this notebook
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',
    '/home/mink/lib/ai4eutils',
])

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# auckland_doc_2019

In [4]:
dataset_name = 'auckland_doc_2019'

# Root of the image storage and the sub-folder of this dataset inside it
container_root = '/mink_disk_0/camtraps/auckland-doc/'
path_prefix = '2020.08.01_reformat/Maukahuka_Auckland_Island/'

# Final and temporary JSON outputs share one folder and are named after the dataset
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

Folder structure

Training folder:

study / possibly another level / species (including `probably_` variants) / behavior / basename

Testing folder:

study / program / location / basename

A third "Others" folder doesn't look as relevant.

In [5]:
# Folder on disk that holds all images of this dataset
folder = os.path.join(container_root, path_prefix)

# Every file found under the folder, before filtering
paths = path_utils.recursive_file_list(folder)
len(paths)

# Keep image files whose basename does not start with '.', expressed relative
# to `folder` and sorted
paths = sorted(
    full_path.split(folder)[1]
    for full_path in paths
    if path_utils.is_image_file(full_path)
    and not os.path.basename(full_path).startswith('.')
)
len(paths)

213572

209696

In [40]:
# Spot-check two entries of the sorted, relative path list
paths[10]
paths[100000 - 1]

'1_Training/Summer_Trail_2019/empty/ignore/20190801_vBqd_02060944_IMG3.JPG'

'2_Testing/Summer_Trial_2019/C1_2_22_SD79_20190302/AucklandIsland_C1_2_22_SD79_20190302_102EK113_02240971_20190224170621.jpg'

### 1_Training folder

In [32]:
# Build one single-image sequence entry per image under 1_Training.
# Labels are read from the folder names: .../<species>/<behavior>/<basename>
locations = set()
behaviors = set()
species = set()
sequences_train = []
folder = 'train'

for p in paths:
    if not p.startswith('1_Training'):
        continue

    p_parts = p.split('/')

    # the folder directly containing the image names the behavior
    behavior = p_parts[-2]
    behaviors.add(behavior)

    # the folder one level above the behavior folder names the species
    clss = p_parts[-3].lower()

    # a "probably" prefix marks an uncertain label, e.g. probably_bird
    probably = clss.startswith('probably')
    if probably:
        # split on the first underscore only, so a species name that itself
        # contains underscores is kept whole instead of being truncated
        clss = clss.split('_', 1)[1]
    if clss.startswith('2_'):
        # strip only the leading "2_", never a later occurrence in the name
        clss = clss[len('2_'):]

    # normalize a spelling variant of the same class
    if clss == 'nz sealion':
        clss = 'nz sea lion'

    species.add(clss)

    # the second underscore-separated token of the file name is used as the location
    basename = os.path.basename(p)
    b_parts = basename.split('_')
    location = b_parts[1]
    locations.add(location)

    sequences_train.append({
        'dataset': dataset_name,
        # placeholder sequence ID: running index within the training folder
        'seq_id': f'dummy_{folder}_{len(sequences_train)}',
        'location': location,
        'class': [clss],
        'behavior': behavior,
        'species_unsure': probably,
        'images': [
            {
                'file': p,
                'frame_num': 1
            }
        ]
    })

In [33]:
len(sequences_train)

46600

In [34]:
len(locations)  # e.g. TP5r, F62S

268

In [35]:
len(behaviors)
behaviors

3

{'eat', 'ignore', 'interact'}

In [36]:
len(species)

22

In [37]:
species

{'bellbird',
 'bird sp',
 'black screen',
 'blackbird',
 'cat',
 'collared cat',
 'dunnock',
 'empty',
 'human',
 'mouse',
 'multiplespecies',
 'nz sea lion',
 'pig',
 'pipit',
 'robin',
 'silvereye',
 'thrush',
 'tomtit',
 'tui',
 'unsure',
 'yellow crowned parakeet',
 'yellow eyed penguin'}

In [39]:
len(sequences_train)

46600

In [52]:
# Number of training entries whose class list does not contain 'empty'
non_empty_train = len([entry for entry in sequences_train if 'empty' not in entry['class']])
non_empty_train

17463

### 2_Testing

Locations should be like "A1".

No species labels are available for this folder.

In [41]:
# Build one single-image sequence entry per image under 2_Testing.
# Every entry gets the placeholder class '__label_unavailable'.
locations = set()
sequences_test = []
folder = 'test'

testing_paths = (rel_path for rel_path in paths if rel_path.startswith('2_Testing'))
for rel_path in testing_paths:
    parts = rel_path.split('/')
    deployment = parts[-2]  # folder directly containing the image

    # Under Summer_Trial_2019 the location is the part of the deployment folder
    # name before the first underscore (e.g. C1_2_22_SD79_20190302 -> C1);
    # elsewhere the deployment folder name itself is the location
    location = deployment.split('_')[0] if parts[1] == 'Summer_Trial_2019' else deployment
    locations.add(location)

    image_entry = {
        'file': rel_path,
        'frame_num': 1
    }
    sequences_test.append({
        'dataset': dataset_name,
        # placeholder sequence ID: running index within the testing folder
        'seq_id': f'dummy_{folder}_{len(sequences_test)}',
        'location': location,
        'class': ['__label_unavailable'],
        'images': [image_entry]
    })

In [42]:
len(locations)

69

In [45]:
len(sequences_test)

161597

In [53]:
# Total number of non-empty entries across both folders, computed from the data
# rather than from numbers copied out of earlier outputs. Testing-folder entries
# never carry the 'empty' class, so all of them count.
non_empty_train + len(sequences_test)

179060

### Both folders

In [44]:
# All entries in one list: training entries first, then testing entries
sequences = [*sequences_train, *sequences_test]
len(sequences)

208197

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [47]:
%%time

# Check the combined sequence items against the MegaDB sequence schema;
# on success the two "Verified ..." messages are printed
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 24.8 s, sys: 21.3 ms, total: 24.8 s
Wall time: 24.8 s


In [48]:
# Write the sequence items to the temporary JSON file, keeping non-ASCII characters as-is
with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False, indent=1)

### Step 2b - copy non-empties to flat folder

In [49]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path`, creating the destination folder if needed.

    Args:
        src_path: path of the existing file to copy
        dst_path: full path (including the file name) to copy to

    Returns:
        The destination path, as returned by shutil.copyfile.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:  # a bare file name has no folder component to create
        # exist_ok avoids an error when the folder is already there, e.g. when
        # another worker created it first
        os.makedirs(dst_dir, exist_ok=True)
    return copyfile(src_path, dst_path)

In [50]:
%%time

# Flat folder that receives the copies; the same for every image
dst_folder = '/mink_disk_0/camtraps/imerit12d'

# Collect (source, destination) path pairs for every entry not labeled 'empty'
path_pairs = []
for seq in tqdm(sequences):
    if seq['class'][0] != 'empty':
        seq_id = seq['seq_id']

        # each entry built above holds exactly one image
        im = seq['images'][0]
        frame = im['frame_num']

        src_path = os.path.join(container_root, path_prefix, im['file'])
        # fail early, before any copying starts, if a listed image is missing
        assert os.path.exists(src_path), src_path

        # flat file name encodes dataset, sequence ID and frame number
        dst_name = f'{dataset_name}.seq{seq_id}.frame{frame}.jpg'
        dst_path = os.path.join(dst_folder, dst_name)
        path_pairs.append((src_path, dst_path))

100%|██████████| 208197/208197 [00:15<00:00, 13521.90it/s]

CPU times: user 1.71 s, sys: 1.26 s, total: 2.97 s
Wall time: 15.4 s





In [51]:
len(path_pairs)
path_pairs[3000]

179060

('/mink_disk_0/camtraps/auckland-doc/2020.08.01_reformat/Maukahuka_Auckland_Island/1_Training/Summer_Trail_2019/mouse/ignore/20190801_QLS5_AucklandIsland__C2_6_16_SD17_20190227_02260035_20190226234744.JPG',
 '/mink_disk_0/camtraps/imerit12d/auckland_doc_2019.seqdummy_train_25316.frame1.jpg')

In [54]:
%%time

# Copy all pairs with 8 worker threads; starmap unpacks each pair into
# copy_file(src_path, dst_path) and collects the return values in input order
with ThreadPool(processes=8) as workers:
    dst_paths = workers.starmap(copy_file, path_pairs)

CPU times: user 1min 22s, sys: 4min 25s, total: 5min 47s
Wall time: 17min 10s


In [55]:
len(dst_paths)

179060