In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Load the autoreload extension; mode 2 reloads imported modules before each cell runs.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo and the ai4eutils helpers importable from this notebook.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# au_nt_gov_kerr

In [4]:
# Identifier of the dataset being ingested; also used to name the output files.
dataset_name = 'au_nt_gov_kerr'

# Root folder holding the dataset's images (the trailing slash is relied on when
# relative paths are derived by splitting on this string).
container_root = '/mink_disk_0/camtraps/nt-gov-au/'

# Output locations for the final and the intermediate ("temp") sequences JSON.
_megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{_megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{_megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

The last directory level of each image path contains the species name. The location is encoded in the file name.

In [5]:
# List every file under the container root, then keep only the image files,
# reduced to the part of each path that follows `container_root`, in sorted order.
all_files = path_utils.recursive_file_list(container_root)
len(all_files)
paths = sorted(
    f.split(container_root)[1]
    for f in all_files
    if path_utils.is_image_file(f)
)
len(paths)

19353

19053

In [6]:
# Spot-check one path and the species string derived from its parent directory
# (lower-cased, keeping only the text before the first underscore).
paths[1000]
paths[1000].split('/')[-2].lower().split('_')[0]

'Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG'

'brush-tailed rabbit-rat'

In [7]:
# Normalization table for directory-derived species strings: fixes typos and
# hyphenation, and strips parenthesised common names.
# A value of None marks a directory name that is not a species (e.g. a park or
# survey location); the sequence-building loop then looks one directory level up.
species_name_mapping = {
    'rainbow bee eater': 'rainbow bee-eater',
    'australian owlet nightjar': 'australian owlet-nightjar',
    'bar shouldered dove': 'bar-shouldered dove',
    'black wallaro': 'black wallaroo',
    'blue winged kookaburra': 'blue-winged kookaburra',
    'black footed tree rat': 'black-footed tree-rat',
    'brush tailed rabbit rat': 'brush-tailed rabbit rat',
    'brush-tailed rabbit-rat': 'brush-tailed rabbit rat',
    'brush tailed mulgara': 'brush-tailed mulgara',
    'common brushtailed possum': 'common brushtail possum',
    'fawn antechnius': 'fawn antechinus',
    'common bronzewing pigeon': 'common bronzewing',
    'common rock-rat': 'common rock rat',
    'gregory np 2017': None,  # this is a national park
    'grey shrike thrush': 'grey shrike-thrush',
    'middle arm': None,
    'middle arm 2019': None,
    'magpie lark': 'magpie-lark',
    'mtf- gunn point 2018': None,
    'willy wagtail': 'willie wagtail',
    'nitmiluk': None,
    'white-bellied sea-eagle- incorrect photo titles': 'white-bellied sea-eagle',
    'rainbow pita': 'rainbow pitta',
    'sandy inland mounse': 'sandy inland mouse',
    'variouslocations': None,
    'set up': 'human',
    'swamp- water buffalo': 'water buffalo',
    'spinifex hopping-mouse': 'spinifex hopping mouse',
    'white winged fairy wren': 'white-winged fairywren',
    'none': 'empty',
    'rhinella marina (cane toad)': 'rhinella marina',
    'tropicagama temporalis (swamplands lashtail)': 'tropicagama temporalis',
    'varanus scalaris (spotted tree monitor)': 'varanus scalaris',
    'pseudonaja textilis (eastern brown snake)': 'pseudonaja textilis',
    'straw necked ibis': 'straw-necked ibis',
    'grey crowned babbler': 'grey-crowned babbler',
    'lophognathus gilberti (gilberts dragon)': 'lophognathus gilberti',
    'chlamydosaurus kingii (frilled lizard)': 'chlamydosaurus kingii',
    "byone's": "bynoe's gecko",
    # NOTE(review): this key is mixed-case and contains '_', while lookups use a
    # lower-cased string truncated at the first '_' - it looks unreachable; confirm.
    'Carlia_Middle Arm 2019': None
}

Examples of file-name prefixes that clearly denote distinct locations: 'WD', 'GP', 'MA', 'BAT'

In [10]:
def get_location(p):
    """Derive a coarse location code from an image path.

    The code is taken from the start of the file name (after removing a leading
    copy of the parent directory's species string, if present) and is then
    passed through an ordered list of clean-up rules. The rules are
    order-dependent; do not reorder them.

    Args:
        p: '/'-separated path with at least one directory component.

    Returns:
        A short location string, or 'unknown' when none can be derived.
    """
    dir_parts = p.split('/')
    file_name = os.path.basename(p)

    # Parent directory name up to the first underscore; some file names start with it.
    species = dir_parts[-2].split('_')[0]

    if file_name.startswith(species):
        remainder = file_name.split(species)[1]
        if remainder.startswith('.'):
            remainder = remainder.split('.')[1]
        location = 'unknown' if remainder.startswith('IMG_') else remainder.split('_')[0]
    elif file_name.startswith('IMG_'):
        location = 'unknown'
    else:
        location = file_name.split('_')[0]

    location = location.strip()

    # Keep a single dash-delimited piece: the second one when the string starts
    # with a dash, otherwise the first.
    if location.startswith('-'):
        location = location.split('-')[1]
    elif '-' in location:
        location = location.split('-')[0]

    if location.lower().endswith('.jpg'):
        location = 'unknown'
    if location.startswith('sp.'):
        location = location.split('sp.')[1]
    if ')' in location:
        location = location.split(')')[1]

    # Drop label fragments that precede the location in some file names.
    for label in ('None', 'dog', 'Egret', 'Set-up'):
        if location.startswith(label):
            location = location.split(label)[1]

    if location.startswith('Lo') or len(location) > 7:
        location = 'unknown'

    # Keep only the text before the first 'C'.
    if 'C' in location:
        location = location.split('C')[0]

    # Collapse these prefixes to the bare prefix (be conservative).
    for prefix in ('GP', 'GE', 'S', 'Grid'):
        if location.startswith(prefix):
            location = prefix

    not_locations = ('IMG', 'SVG', 'Willie', 'White', 'Silver', 'end', '', 'Short',
                     'Water', 'blue', 'Set', 'Black', 'SVL')
    if location in not_locations:
        location = 'unknown'

    return location

In [11]:
# Example: a file name starting with 'MA-MA25' yields the location 'MA'.
get_location('Training Images (Gold Standard)/Carlia_Middle Arm 2019/MA-MA25_C1_01352_4-08-2019_12-52.JPG')

'MA'

In [15]:
# Build one single-image `sequence` item per image path, and collect the
# locations and species seen along the way for inspection in later cells.
sequences = []
locations = []
set_species = set()

for p in tqdm(paths):
    p_parts = p.split('/')

    location = get_location(p)
    locations.append(location)

    # The species label is the parent directory name up to the first underscore,
    # lower-cased and normalized through `species_name_mapping` when listed there.
    species = p_parts[-2].split('_')[0].lower()
    species = species_name_mapping.get(species, species)
    if species is None:
        # this directory is a location, species is one above
        species = p_parts[-3].lower().split('_')[0]
        species = species_name_mapping.get(species, species)
        if species is None:
            # Still unresolved: surface the path so the mapping can be extended.
            print(p)

    set_species.add(species)

    # No timestamp is recorded for these images; each image becomes its own
    # sequence with a placeholder id built from the location and a running index.
    sequences.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_{location}_{len(sequences)}',
        'location': location,
        'class': [species],
        'images': [{
            'file': p,
            'frame_num': 1,
        }]
    })

len(sequences)

100%|██████████| 19053/19053 [00:00<00:00, 66790.81it/s]


19053

In [17]:
# Tally how many images were assigned to each derived location code.
location_counter = Counter(locations)
len(location_counter)
location_counter

21

Counter({'GP': 1574,
         'unknown': 6407,
         'AR': 242,
         'BAT': 1301,
         'KOO': 357,
         'WD': 2745,
         'MEL': 208,
         'WAD': 414,
         'BP': 291,
         'BA': 26,
         'S': 1080,
         'WN': 5,
         'BN': 8,
         'Grid': 250,
         'KNP': 2136,
         'MA': 1593,
         'GE': 386,
         'FGc': 3,
         'NMc': 3,
         'FOH': 15,
         'MAN': 9})

In [18]:
# Review the distinct species labels to spot remaining typos or non-species names.
len(set_species)
set_species

198

{'accipiter sp',
 'agamidae',
 'agile wallaby',
 'anilios sp',
 'antilopene wallaroo',
 'australian magpie',
 'australian owlet-nightjar',
 'australian white ibis',
 'banteng',
 'bar-shouldered dove',
 'black butcherbird',
 'black kite',
 'black rat',
 'black wallaroo',
 'black-faced cuckoo-shrike',
 'black-faced woodswallow',
 'black-footed tree-rat',
 'black-tailed monitor',
 'black-tailed treecreeper',
 'blue-faced honeyeater',
 'blue-tongue lizard',
 'blue-winged kookaburra',
 'brown goshawk',
 'brown honeyeater',
 'brown quail',
 'brush-tailed mulgara',
 'brush-tailed rabbit rat',
 'buffalo',
 'bush stone-curlew',
 "bynoe's gecko",
 'cane toad',
 'carlia',
 'carlia sp',
 'cat',
 'cattle',
 'chestnut-backed button-quail',
 'chlamydosaurus kingii',
 'collared sparrowhawk',
 'common blue-tongued lizard',
 'common bronzewing',
 'common brushtail possum',
 'common planigale',
 'common rock rat',
 'common wallaroo',
 'conilurus, leggedina, mesembriomys, notomys, pseudoms, zyzomys',
 'co

In [20]:
# Spot-check a sequence item near the end of the list.
sequences[-200]

{'dataset': 'au_nt_gov_kerr',
 'seq_id': 'dummy_WD_18853',
 'location': 'WD',
 'class': ['sandy inland mouse'],
 'images': [{'file': 'TrainingData_3/small mammals/west_davs_1of2/sandy inland mouse/WD-B1400C1_08504_17-11-2019_23-39.JPG',
   'frame_num': 1}]}

In [21]:
# Spot-check the item built from paths[1000] (an 'IMG_' file name, so location is 'unknown').
sequences[1000]

{'dataset': 'au_nt_gov_kerr',
 'seq_id': 'dummy_unknown_1000',
 'location': 'unknown',
 'class': ['brush-tailed rabbit rat'],
 'images': [{'file': 'Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG',
   'frame_num': 1}]}

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [23]:
%%time

# Validate the assembled sequence items; confirmation messages are printed on success.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 2.28 s, sys: 4.49 ms, total: 2.29 s
Wall time: 2.29 s


In [24]:
# Write the sequence items to the temporary JSON file as UTF-8, keeping
# non-ASCII characters unescaped.
with open(path_to_output_temp, mode='w', encoding='utf-8') as json_file:
    json.dump(obj=sequences, fp=json_file, indent=1, ensure_ascii=False)

### Step 2b - Copy images to a flat folder

In [25]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path`.

    Thin wrapper around `shutil.copyfile`.

    Returns:
        Whatever `copyfile` returns for this pair of paths.
    """
    copied_to = copyfile(src_path, dst_path)
    return copied_to

In [26]:
%%time

path_pairs = []
for seq in tqdm(sequences):
    
    if 'empty' in seq['class']:
        continue
    
    seq_id = seq['seq_id']
    for im in seq['images']:
        src_path = os.path.join(container_root, im['file'])
        assert os.path.exists(src_path), src_path
        frame = 1
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12c', 
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

100%|██████████| 19053/19053 [00:00<00:00, 117230.38it/s]

CPU times: user 115 ms, sys: 52.6 ms, total: 168 ms
Wall time: 165 ms





In [27]:
# Number of images to copy (sequences labelled 'empty' were skipped) and one example pair.
len(path_pairs)
path_pairs[1000]

18842

('/mink_disk_0/camtraps/nt-gov-au/Training Images (Custom Vision)/Brush-tailed Rabbit-rat/Brush-tailed Rabbit-ratIMG_0008.JPG',
 '/mink_disk_0/camtraps/imerit12c/au_nt_gov_kerr.seqdummy_unknown_1000.frame1.jpg')

In [28]:
%%time

with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 25.7 s, sys: 1min, total: 1min 26s
Wall time: 34 s


In [29]:
# One result per submitted pair; should equal len(path_pairs).
len(dst_paths)

18842