In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr’

%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
from random import sample
from collections import defaultdict
from datetime import datetime
import uuid

from tqdm import tqdm

There are some Excel spreadsheet with species labels, but the labels weren't made with the path to the image in the entry but just the location and timestamp, so too hard to recover. So here, we just try getting a list of all species, and then if the species name is in the path to an image, that image will be labeled with that species.

## Get available species labels

In [3]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/BNF/20190624and0815/8714_images.json') as f:
    image_paths_original = json.load(f)

In [None]:
image_paths_original = [i for i in image_paths_original if i.lower().endswith('.jpg')]
len(image_paths_original)
sample(image_paths_original, 5)
image_paths = [i.lower() for i in image_paths_original if i.lower().endswith('.jpg')]
len(image_paths)
sample(image_paths, 5)

## Getting a list of species
Only ran once to grab the majority of species.

In [5]:
folder_images = []
for p in image_paths:
    if p.startswith('20190624cameratraps/images/kutai 1'):
        folder_images.append(p)

In [6]:
len(folder_images)

361

In [7]:
species_sample = set()

for i in folder_images:
    species_sample.add(i.split('/')[-2])

In [45]:
species_sample

{'argus pheasant',
 'banded palm civet',
 'banteng',
 'bay cat',
 'bearded pig',
 'bornean red muntjac',
 'clouded leopard',
 'crested fireback',
 'kutai 1',
 'macaques',
 'malay civet',
 'marbled cat',
 'mouse deer',
 'orang-utans',
 'pangolin',
 'porcupine',
 'sambar deer',
 'squirrel',
 'sun bear',
 'unknown'}

In [None]:
bawan_images

## Manually compiled list of species

In [4]:
# this list includes all the species names and typo variants I was able to find best-effort
# can't use rat, cat, bat - these match too many strings

species = ['nothing', 'bird', 'argus', 'crested fireback', 'grey headed fish eagle', 
         'bornean ground cuckoo', 'white headed fish eagle', 'babbler brown bird',
           'camera set-up', 'humans', 'human', 'camera placing shots',
           'civet', 'common palm civet', 'clouded leopard', 'deer', 'mongooe', 'pangolin', 'pig', 'porcupine', 'primate', 'sun bear', 
           'unclear', 'unknown',
          'banded palm civet', 'bay cat', 'bearded pig', 'binturong', 'gibbon', 'great argus', 'leopard cat', 'long-tailed macaque',
          'malay civet', 'marbled cat', 'moon rat', 'mouse deer', 'muntjac', 'orangutan', 'orang-utans', 'pig-tailed macaque', 'red langur', 'sambar deer',
          'banded palm civet', 'banteng', 'bornean red muntja', 'macaque', 'squirrel',
          'turtle', 'banded linsang', 'monitor lizard', 'flat-headed-cat', 'lizard', 'yellow muntjac',
          'small toothed palm civet', 'st palm civet', 'small thoodhed palm civet', 'small thoothed palm civet', 'small thoodheed palm civet', 
           'small-thoodhed palm civet', 'small-toodhed palm civet', 'small-toothed palm civet',
           "storm's stork", "strom's strok", 'storms stork', 'storm stork', 'storms st', 'raptor',
          'rodent', 'reptile', 'orang utan', 'yellow throated marten', 'bats', 
           'collard mongoose', 'collared mongoose',
          'brown wood owl', 'fairy pitta', 'pig-t-macaque',
          'sunbear', 'eagle', 
           'short tailed mongoose', 'short-t-mongoose', 'short-t mongoose', 'short tailed-mongoose', 'short taled-mongoose', 's-t mongoose', 'short-tailed-mongoose', 'short - tailed mongoose', 'short tailet mongoose', 'short-tailed mongoose', 
           'mongoose', 
           'treeshrew', 'marble cat', 'flat headed cat',
          'hunter dog', 'dog hunter', 'short-t-monggoose',
          'reed-leaf monkey', 'red leaf monkey', 'monkey', 'otters civet', 'otter civet', 'butterfly'] # there are no "empty" in the file names anywhere, and there are very few "nothing"
species = set(species)
len(species)

101

In [5]:
# the dict in this direction is for verification only 

species_to_image_paths = defaultdict(list)

for p in tqdm(image_paths_original):
    candidate_species = []
    for s in species:
        if s in p.lower():
            candidate_species.append(s)
            
    if len(candidate_species) == 0:
        continue
    
    if 'nothing' in candidate_species:
        species_to_image_paths['nothing'].append(p)
    
    most_specific_species = None
    max_len = 0
    for c in candidate_species:
        if len(c) > max_len:
            max_len = len(c)
            most_specific_species = c
            
    species_to_image_paths[most_specific_species].append(p)

100%|██████████| 279638/279638 [00:09<00:00, 28101.39it/s]


In [6]:
len(species_to_image_paths.keys())

99

In [9]:
len(species_to_image_paths['nothing']) # too many wrongly spelt short-tailed mongoose... They'll be in "mongoose"

3164

In [13]:
'20190624cameratraps/images/OUTROP CAMERA TRAPS ALL/CAMERA TRAPS SABANGAU/CAMERA TRAPS 2009/other stuff/CAMERA TRAPS 2009 up to date/KM 2 x Railway/Km2 x Railway Download 23-05-09 NOTHING macaque only/CDY_0031.JPG' in species_to_image_paths['nothing']

True

In [10]:
image_path_to_species = {}

for s, list_paths in species_to_image_paths.items():
    for p in list_paths:
        image_path_to_species[p] = s

In [11]:
len(image_path_to_species)  # about 1k are "nothing" labeled

17279

In [None]:
sample(image_path_to_species.items(), 5)

## Make CCT database

In [18]:
info = {
  # Required
  "version" : '20190825',
  "description" : 'Images sent by Susan Cheyne from Borneo Nature Foundation',
  
  # Optional
  "year" : 2019,
  "contributor" : 'Images sent by Susan Cheyne from Borneo Nature Foundation. DB created by Siyu Yang',
  "date_created" : str(datetime.today().date())
}

In [19]:
info

{'contributor': 'Images sent by Susan Cheyne from Borneo Nature Foundation. DB created by Siyu Yang',
 'date_created': '2019-08-25',
 'description': 'Images sent by Susan Cheyne from Borneo Nature Foundation',
 'version': '20190825',
 'year': 2019}

In [20]:
len(species_to_image_paths)
sorted(species_to_image_paths.keys())

100

['argus',
 'babbler brown bird',
 'banded linsang',
 'banded palm civet',
 'banteng',
 'bats',
 'bay cat',
 'bearded pig',
 'binturong',
 'bird',
 'bornean ground cuckoo',
 'bornean red muntja',
 'brown wood owl',
 'butterfly',
 'camera placing shots',
 'camera set-up',
 'civet',
 'clouded leopard',
 'collard mongoose',
 'collared mongoose',
 'common palm civet',
 'crested fireback',
 'deer',
 'dog hunter',
 'eagle',
 'fairy pitta',
 'flat headed cat',
 'flat-headed-cat',
 'gibbon',
 'great argus',
 'grey headed fish eagle',
 'humans',
 'hunter dog',
 'leopard cat',
 'long-tailed macaque',
 'macaque',
 'malay civet',
 'marble cat',
 'marbled cat',
 'mongooe',
 'mongooes',
 'mongoose',
 'monitor lizard',
 'monkey',
 'moon rat',
 'mouse deer',
 'muntjac',
 'nothing',
 'orang utan',
 'orang-utans',
 'orangutan',
 'otter civet',
 'otters civet',
 'pangolin',
 'pig',
 'pig-t-macaque',
 'pig-tailed macaque',
 'porcupine',
 'primate',
 'raptor',
 'red langur',
 'red leaf monkey',
 'reed-leaf 

In [21]:
cat_map = {
    'nothing': 'empty',
    "strom's strok": "storm's stork",
    "storms stork": "storm's stork",
    "storms st": "storm's stork",
    "storm stork": "storm's stork", 
    'pig-t-macaque': 'pig-tailed macaque',
    "unclear": "unknown",
    'camera set-up': 'human',
    'humans': 'human',
    'orang-utans': 'orangutan',
    'orang utan': 'orangutan',
    'sunbear': 'sun bear',
    'marble cat': 'marbled cat',
    'flat-headed-cat': 'flat headed cat',
    'dog hunter': 'hunter dog',
    'small thoodhed palm civet': 'small toothed palm civet',
    'short tailed-mongoose': 'short-tailed mongoose',
    'short taled-mongoose': 'short-tailed mongoose',
    'short-t mongoose': 'short-tailed mongoose',
    'short-t-monggoose': 'short-tailed mongoose',
    'short tailed mongoose': 'short-tailed mongoose',
    'short-t-mongoose': 'short-tailed mongoose',
    'short-tailed-mongoose': 'short-tailed mongoose',
    's-t mongoose': 'short-tailed mongoose',
    'short - tailed mongoose': 'short-tailed mongoose',
    'short tailet mongoose': 'short-tailed mongoose',
    'mongooe': 'mongoose',
    'reed-leaf monkey': 'red leaf monkey',
    'otters civet': 'otter civet',
    'small toothed palm civet': 'small-toothed palm civet', 
    'st palm civet': 'small-toothed palm civet', 
    'small thoodhed palm civet': 'small-toothed palm civet', 
    'small thoothed palm civet': 'small-toothed palm civet', 
    'small thoodheed palm civet': 'small-toothed palm civet',
    'small-thoodhed palm civet': 'small-toothed palm civet',
    'small-toodhed palm civet': 'small-toothed palm civet',
    'collard mongoose': 'collared mongoose'
}

In [22]:
# redo image_path_to_species to uniformize species names
for p, s in image_path_to_species.items():
    if s in cat_map:
        image_path_to_species[p] = cat_map[s]

len(image_path_to_species)

valid_species = set(image_path_to_species.values())
len(valid_species)
valid_species

17279

65

{'argus',
 'babbler brown bird',
 'banded linsang',
 'banded palm civet',
 'banteng',
 'bats',
 'bay cat',
 'bearded pig',
 'binturong',
 'bird',
 'bornean ground cuckoo',
 'bornean red muntja',
 'brown wood owl',
 'butterfly',
 'camera placing shots',
 'civet',
 'clouded leopard',
 'collared mongoose',
 'common palm civet',
 'crested fireback',
 'deer',
 'eagle',
 'empty',
 'fairy pitta',
 'flat headed cat',
 'gibbon',
 'great argus',
 'grey headed fish eagle',
 'human',
 'hunter dog',
 'leopard cat',
 'long-tailed macaque',
 'macaque',
 'malay civet',
 'marbled cat',
 'mongoose',
 'monitor lizard',
 'monkey',
 'moon rat',
 'mouse deer',
 'muntjac',
 'orangutan',
 'otter civet',
 'pangolin',
 'pig',
 'pig-tailed macaque',
 'porcupine',
 'primate',
 'raptor',
 'red langur',
 'red leaf monkey',
 'reptile',
 'rodent',
 'sambar deer',
 'short-tailed mongoose',
 'small-toothed palm civet',
 'squirrel',
 "storm's stork",
 'sun bear',
 'treeshrew',
 'turtle',
 'unknown',
 'white headed fish 

In [23]:
categories = {}
i = 1
for s in sorted(list(valid_species)):
    categories[s] = i
    i += 1

In [24]:
images = []
annotations = []

for p, s in image_path_to_species.items():
    if s in cat_map:
        s = cat_map[s]
        
    image_id = str(uuid.uuid4())
    anno_id = str(uuid.uuid4())
    category_id = categories[s]
    
    images.append({
        'id': image_id,
        'file_name': p
    })
    annotations.append({
        'id': anno_id,
        'image_id': image_id,
        'category_id': category_id
    })
len(images)
len(annotations)

17279

17279

In [None]:
images[100]

In [26]:
annotations[100]

{'category_id': 46,
 'id': '55f2a527-a0bd-4df1-8b93-3dc6ef37eec9',
 'image_id': 'dab5bc96-54fa-49c1-b73a-1ca39c47eef6'}

In [27]:
categories_final = []
for name, i in categories.items():
    categories_final.append({
        'id': i,
        'name': name
    })

In [28]:
cct_db = {
    'info': info,
    'images': images,
    'annotations': annotations,
    'categories': categories_final
}

In [29]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/BNF/20190624and0815/BNF_20190624and0815_20190825a.json', 'w') as f:
    json.dump(cct_db, f, indent=1)

## Split detector results into labeled and unlabeled portions

There was no reliable labels for "empty" images.

In [30]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/BNF/20190624and0815/8714_detections_bnf0624and0815_20190815182245_refiltered.json') as f:
    all_res = json.load(f)

In [31]:
len(all_res['images'])

278873

In [32]:
images_with_label = set(image_path_to_species.keys())
len(images_with_label)

17279

In [33]:
labeled_im_res = []
unlabeled_im_res = []

for res in all_res['images']:
    if res['file'] in images_with_label:
        labeled_im_res.append(res)
    else:
        unlabeled_im_res.append(res)
len(labeled_im_res)
len(unlabeled_im_res)

17279

261594

In [34]:
all_res.keys()

dict_keys(['info', 'detection_categories', 'images'])

In [35]:
labeled_res = {
    'info': all_res['info'],
    'detection_categories': all_res['detection_categories'],
    'images': labeled_im_res
}

unlabeled_im_res = {
    'info': all_res['info'],
    'detection_categories': all_res['detection_categories'],
    'images': unlabeled_im_res
}

In [36]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/BNF/20190624and0815/8714_detections_bnf0624and0815_20190815182245_refiltered_labeled.json', 'w') as f:
    json.dump(labeled_res, f, indent=1)

In [37]:
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/BNF/20190624and0815/8714_detections_bnf0624and0815_20190815182245_refiltered_unlabeled.json', 'w') as f:
    json.dump(unlabeled_im_res, f, indent=1)