In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH
sys.path.append('/home/mink/lib/ai4eutils')

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm


import path_utils, sas_blob_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

## BNF round 2

Last time we only imported the 25k entries that were bbox labeled. This notebook imports the rest of them.

The entires imported last time had no location - we shoud just delete these entries and ingest these instead, since we include all the existing bboxes anyways

There is no sequence info. The location is best-effort: a few of the ones sharing the same prefix are likely the same location...

In [241]:
dataset_name = 'bnf_20190624and0815'

container_root = '/mink_disk_0/camtraps/bnf/bnf/'  

path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_new.json' 
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_new_temp.json' 

In [4]:
# entries exported last time. No megadb "id" yet - need to query again to upsert to update fields
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/bnf_20190624and0815_w_batch_10_boxes.json') as f:
    bnf_db = json.load(f)
len(bnf_db)

25000

In [16]:
entries_already_in_db = {}
for seq in bnf_db:
    for im in seq['images']:
        entries_already_in_db[im['file']] = seq

len(entries_already_in_db)

25000

In [13]:
sample(bnf_db, 1)

[{'dataset': 'bnf_20190624and0815',
  'seq_id': 'dummy_9df472f5d93e40a5ba30ac49f877636d',
  'images': [{'image_id': 14313,
    'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2010/Camera Trap Locations 2010/Jelutong Edge 1/JE1 12-06-10/Cam N 11/CDY_0001.JPG',
    'bbox': [{'category': 'person', 'bbox': [0.2733, 0.143, 0.2081, 0.68]}]}],
  'class': ['__label_unavailable']}]

In [35]:
# from CameraTraps/data_management/importers/bnf_to_json.ipynb
bnf_valid_species = {
    'argus',
    'babbler brown bird',
    'banded linsang',
    'banded palm civet',
    'banteng',
    'bats',
    'bay cat',
    'bearded pig',
    'binturong',
    'bird',
    'bornean ground cuckoo',
    'bornean red muntja',
    'brown wood owl',
    'butterfly',
    'camera placing shots',
    'civet',
    'clouded leopard',
    'collared mongoose',
    'common palm civet',
    'crested fireback',
    'deer',
    'eagle',
    'empty',
    'fairy pitta',
    'flat headed cat',
    'gibbon',
    'great argus',
    'grey headed fish eagle',
    'human',
    'hunter dog',
    'leopard cat',
    'long-tailed macaque',
    'macaque',
    'malay civet',
    'marbled cat',
    'mongoose',
    'monitor lizard',
    'monkey',
    'moon rat',
    'mouse deer',
    'muntjac',
    'orangutan',
    'otter civet',
    'pangolin',
    'pig',
    'pig-tailed macaque',
    'porcupine',
    'primate',
    'raptor',
    'red langur',
    'red leaf monkey',
    'reptile',
    'rodent',
    'sambar deer',
    'short-tailed mongoose',
    'small-toothed palm civet',
    'squirrel',
    "storm's stork",
    'sun bear',
    'treeshrew',
    'turtle',
    'unknown',
    'white headed fish eagle',
    'yellow muntjac',
    'yellow throated marten'
}

cat_map = {
    'nothing': 'empty',
    "strom's strok": "storm's stork",
    "storms stork": "storm's stork",
    "storms st": "storm's stork",
    "storm stork": "storm's stork", 
    'pig-t-macaque': 'pig-tailed macaque',
    "unclear": "unknown",
    'camera set-up': 'human',
    'humans': 'human',
    'orang-utans': 'orangutan',
    'orang utan': 'orangutan',
    'sunbear': 'sun bear',
    'marble cat': 'marbled cat',
    'flat-headed-cat': 'flat headed cat',
    'dog hunter': 'hunter dog',
    'small thoodhed palm civet': 'small toothed palm civet',
    'short tailed-mongoose': 'short-tailed mongoose',
    'short taled-mongoose': 'short-tailed mongoose',
    'short-t mongoose': 'short-tailed mongoose',
    'short-t-monggoose': 'short-tailed mongoose',
    'short tailed mongoose': 'short-tailed mongoose',
    'short-t-mongoose': 'short-tailed mongoose',
    'short-tailed-mongoose': 'short-tailed mongoose',
    's-t mongoose': 'short-tailed mongoose',
    'short - tailed mongoose': 'short-tailed mongoose',
    'short tailet mongoose': 'short-tailed mongoose',
    'mongooe': 'mongoose',
    'reed-leaf monkey': 'red leaf monkey',
    'otters civet': 'otter civet',
    'small toothed palm civet': 'small-toothed palm civet', 
    'st palm civet': 'small-toothed palm civet', 
    'small thoodhed palm civet': 'small-toothed palm civet', 
    'small thoothed palm civet': 'small-toothed palm civet', 
    'small thoodheed palm civet': 'small-toothed palm civet',
    'small-thoodhed palm civet': 'small-toothed palm civet',
    'small-toodhed palm civet': 'small-toothed palm civet',
    'collard mongoose': 'collared mongoose'
}

len(bnf_valid_species)
len(cat_map)
combined_species_names = bnf_valid_species | set(cat_map.keys())
len(combined_species_names)

65

36

101

In [7]:
input_container_sas = ''

In [30]:
image_paths = sas_blob_utils.list_blobs_in_container(
    container_uri=input_container_sas,
    blob_suffix=('.jpg', '.jpeg', '.png'),  # check will be case-insensitive
)

0it [00:00, ?it/s]

listing blobs...


326797it [02:53, 1887.80it/s]

Enumerated 279638 matching blobs out of 326797 total





### 20190815cameratraps folder

No species label but somewhat neat location... Location still has issues for paths e.g.

Sometimes the date is not correct - in 2015 folder but the date on other folder indicates another year.

In [144]:
location_date = 'Km2 x Railway 02-05-13'
date = location_date.split(' ')[-1]
location = location_date.split(date)[0].strip()
date
location

'02-05-13'

'Km2 x Railway'

In [209]:
folder = '201908'

seq_08 = []
locations_08 = set()

years_w_loc_date = ['2013', '2014', '2015', '2016', '2017']

for image_path in tqdm(image_paths):
    if not image_path.startswith(folder):
        continue
        
    bbox = None
    clss = None
    if image_path in entries_already_in_db:
        bbox = entries_already_in_db[image_path]['images'][0]['bbox']
        clss = entries_already_in_db[image_path]['class']
        
    p_parts = image_path.split('/')
    year = p_parts[2]
#     if year in years_w_loc_date:
#         location_date = p_parts[-3]
#         if location_date.startswith('Cam M'):
#             location_date = p_parts[-4] 
        
#         date = location_date.split(' ')[-1]
#         location = location_date.split(date)[0].strip()
#     else:
#         location = p_parts[-3]
#         date = year
        
#     if location in ['', '2018', 'Cam']:
#         location = 'unknown'
#     locations_08.add(location)
    
    im = {
        'file': image_path,
        'frame_num': 1,
    }
    if bbox is not None:
        im['bbox'] = bbox
    
    seq = {
        'dataset': dataset_name,
        'seq_id': f'dummy_{folder}_{len(seq_08)}',
#         'location': 'unknown',
        'datetime': year,
        'images': [im]
    }
    if clss is not None:
        seq['class'] = clss
    else:
        seq['class'] = ['__label_unavailable']
        
    seq_08.append(seq)

100%|██████████| 279638/279638 [00:00<00:00, 886586.67it/s] 


In [210]:
len(seq_08)

40592

In [211]:
# seq_08[21785]
for s in sample(seq_08, 10):
    s

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_25982',
 'datetime': '2016',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2016/C.Maret/T SC 610m 08-03-2016/Cam 2015-20/IMAG0059.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_28334',
 'datetime': '2016',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2016/H.Agustus/T2E x ORW 25-08-2016/Cam C27/Cdy00030.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_7029',
 'datetime': '2013',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2013/L.Cam Traps Desember 2013/T1A x Railway 13-12-13/Cam S9/Cdy00493.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_9760',
 'datetime': '2014',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2014/C.Cam Traps Maret 2014/Km3 x Railway 01-03-2014/Cam S5/Cdy00401.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_32795',
 'datetime': '2018',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/C.Maret/T SC x Canal D/Cam 2017-1/IMAG0102.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_35095',
 'datetime': '2018',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/G.Juli/T SC 1412m/Cam 2018-5/IMAG0059.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_33861',
 'datetime': '2018',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/E.Mei/T SC x Canal D/Cam 2017-3/T SC x Canal D Malay Civet.JPG',
   'frame_num': 1,
   'bbox': [{'category': 'animal',
     'bbox': [0.1277, 0.7367, 0.5104, 0.1969]}]}],
 'class': ['malay civet']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_23350',
 'datetime': '2015',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2015/J. Oktober 2015/T 2E x ORW/Cam C27/Cdy00008.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_36475',
 'datetime': '2018',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2018/J.Oktober/Km4 x Railway/Cam 2018-1/IMAG0015.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201908_22885',
 'datetime': '2015',
 'images': [{'file': '20190815cameratraps/DOWNLOAD CAMERA TRAP- SEBANGAU/2015/H. Agustus 2015/Tower 15-08-2015/CamC30/Cdy00030.JPG',
   'frame_num': 1}],
 'class': ['__label_unavailable']}

### 20190624cameratraps folder
Only some folders have location / species, but the location is hard to parse consistently with the 201908 folder, so we're matching locations from the 201908 folder and use that if available.

In [212]:
folder = '201906'

seq_06 = []
locations_06 = set()
num_species_found = 0

folders_w_loc = ['Belantikan 1 RAW', 'Belantikan 2 RAW', 'Sungai Wain']

for image_path in tqdm(image_paths):
    if not image_path.startswith(folder):
        continue
        
    bbox = None
    clss = None
    if image_path in entries_already_in_db:
        bbox = entries_already_in_db[image_path]['images'][0]['bbox']
        clss = entries_already_in_db[image_path]['class']
    
    if image_path == '20190624cameratraps/images/Bawan/BEST ANIMALS ALL/Birds/white headed fish eagle.JPG':
        print(clss)
    
    if clss is None:
        candidate_species = set()
        for s in combined_species_names:
            if s in image_path.lower():
                if s in cat_map:
                    s = cat_map[s]
                candidate_species.add(s)
                
        if len(candidate_species) > 0:
            num_species_found += 1
            clss = [s for s in candidate_species]
    if clss is None:  # actually all species labels were found in the last round... this will be None
        clss = ['__label_unavailable']    # actually all species labels were found in the last round... this will be None
    
    p_parts = image_path.split('/')

    location = 'unknown'
#     for loc08 in locations_08:
#         if loc08 in image_path[51:]:
#             location = loc08
    
#     locations_06.add(location)
    
    im = {
        'file': image_path,
        'frame_num': 1,
    }
    if bbox is not None:
        im['bbox'] = bbox
    
    seq_06.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_{folder}_{len(seq_06)}',
#         'location': location,
        'class': clss,
        'images': [im]
    })

  3%|▎         | 7263/279638 [00:00<00:07, 36301.36it/s]

['white headed fish eagle']


100%|██████████| 279638/279638 [00:09<00:00, 29659.14it/s]


In [213]:
num_species_found

0

In [214]:
len(seq_06)

239046

In [219]:
sample(seq_06, 10)
seq_06[53718]

[{'dataset': 'bnf_20190624and0815',
  'seq_id': 'dummy_201906_77201',
  'class': ['__label_unavailable'],
  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2011/Camera trap locations 2011/T.1A x Railway/T1A x Railway 13-04-11/CAM N21/Dog 1 8-04-11 T1A+Rlwy Cam N21.JPG',
    'frame_num': 1}]},
 {'dataset': 'bnf_20190624and0815',
  'seq_id': 'dummy_201906_173811',
  'class': ['__label_unavailable'],
  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/Camera traps ALL historical data/CAMERA TRAPS 2011/Camera trap locations 2011/KM 2 x Railway/Km2 x Railway 05-12-11/Cam N9/CDY_0009.JPG',
    'frame_num': 1}]},
 {'dataset': 'bnf_20190624and0815',
  'seq_id': 'dummy_201906_64272',
  'class': ['__label_unavailable'],
  'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2010/Camera Trap Locations 2010/TO x TC/T.0 x T.C 4-05-10/Cam N 2/CDY_0027.JPG',
    'fram

{'dataset': 'bnf_20190624and0815',
 'seq_id': 'dummy_201906_53718',
 'class': ['__label_unavailable'],
 'images': [{'file': '20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2010/Camera Trap Locations 2010/Km5 x Railway/KM 5 14-05-10/Cam N 10/CDY_0014.JPG',
   'frame_num': 1}]}

### Both folders combined

In [220]:
sequences = seq_06 + seq_08
len(sequences)

279638

In [221]:
with open(path_to_output_temp, 'w', encoding='utf-8') as f:
    json.dump(sequences, f, indent=1, ensure_ascii=False)

### Sample
Send all 40,592 images from the 08 folder (which is neater), and then sample 110,000 from the 06 folder. 

In [228]:
im_to_send = [seq['images'][0] for seq in seq_08 if 'empty' not in seq['class']]  # actually we don't have labels 
len(im_to_send)


sample_06 = sample(seq_06, 110000)

im_06 = [seq['images'][0] for seq in sample_06 if 'empty' not in seq['class']]

im_to_send = im_to_send + im_06
len(im_to_send)

40592

149853

In [229]:
bnf_list_to_download = [im['file'] + '\n' for im in im_to_send]

with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/bnf_files.txt', 'w') as f:
    f.writelines(bnf_list_to_download)

## Rename and copy to imerit12f folder

In [240]:
len(im_to_send)
len(sequences)
dataset_name
container_root

149853

279638

'bnf_20190624and0815'

'/mink_disk_0/camtraps/bnf/'

In [232]:
im_to_seq_id = {}
for seq in sequences:
    for im in seq['images']:
        im_to_seq_id[im['file']] = seq['seq_id']

In [242]:
%%time

path_pairs = []

for im in im_to_send:
    src_path = os.path.join(container_root, im['file'])
    seq_id = im_to_seq_id[im['file']]
    frame = im['frame_num']
    dst_path = os.path.join('/mink_disk_0/camtraps/imerit12f', 
                            f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
    path_pairs.append((src_path, dst_path))

path_pairs[-50]

CPU times: user 619 ms, sys: 8 ms, total: 627 ms
Wall time: 626 ms


('/mink_disk_0/camtraps/bnf/bnf/20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMTRAPS Temporary/Photos download/2012/Cam Traps Oct 2012/T0 x TC 19-10-12/Cam N1/CDY_0037.JPG',
 '/mink_disk_0/camtraps/imerit12f/bnf_20190624and0815.seqdummy_201906_132338.frame1.jpg')

In [243]:
len(path_pairs)

149853

In [244]:
%%time

def copy_file(src_path, dst_path):
    return copyfile(src_path, dst_path)

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 4min 51s, sys: 9min 21s, total: 14min 12s
Wall time: 15min 10s


In [245]:
len(dst_paths)

149853