In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

%load_ext autoreload
%autoreload 2

# Creating CCT json database for Sul Ross

ExifTool needs to be installed first:
- Follow the instructions at https://www.sno.phy.queensu.ca/~phil/exiftool/install.html#Unix (use `wget` to download the tarball linked from the [home page](https://www.sno.phy.queensu.ca/~phil/exiftool/Image-ExifTool-11.44.tar.gz), then test and install it).
- Install its Python wrapper: `git clone` the repo following the instructions at https://smarnach.github.io/pyexiftool/, then run `python setup.py install` inside the repo directory.

In [None]:
import os
from datetime import datetime
import json
from collections import defaultdict

from tqdm import tqdm

import exiftool
import path_utils  # ai4eutils

## List image IDs
Get the list of image IDs in the folders `Summer2018` and `Presidio001`.

In [None]:
# Root folder that holds the Sul Ross images.
data_dir = '/home/beaver/cameratraps/mnt/sulross'  # container mount point
# Every top-level entry under data_dir; nothing is filtered out at this point.
image_dirs = os.listdir(data_dir)
image_dirs

In [None]:
# Collect the image files found under every top-level folder of data_dir.
image_paths = []
for entry in image_dirs:
    folder = os.path.join(data_dir, entry)
    if not os.path.isdir(folder):
        # plain files sitting directly under data_dir are ignored
        continue
    print(folder)
    listing = path_utils.recursive_file_list(folder, bConvertSlashes=False)
    for listed_path in tqdm(listing):
        if path_utils.is_image_file(listed_path):
            image_paths.append(os.path.join(folder, listed_path))

image_paths.sort()
len(image_paths)
image_paths[:3]

In [None]:
# Image IDs are the image paths relative to data_dir.
# Exclude the test folders - these are subsets of the other two folders,
# Presidio001 and Summer2018.

image_ids = []
for image_path in image_paths:
    # Derive the ID from data_dir rather than from a second hard-coded copy of the
    # mount point, so the cell keeps working when data_dir is changed.
    image_id = os.path.relpath(image_path, data_dir)
    if not image_id.startswith('test'):
        image_ids.append(image_id)
len(image_ids)
image_ids[:3]

In [None]:
len(image_ids)

In [None]:
# Save the list of image IDs as JSON.
image_ids_file = '/home/beaver/cameratraps/data/sulross/20190522_image_ids.json'
with open(image_ids_file, 'w') as out_file:
    json.dump(image_ids, out_file, indent=1)

In [None]:
# Dict mapping each integer 0..99 to the string '1'.
meta = dict.fromkeys(range(0, 100), '1')

In [None]:
len(meta)

## Extract labels from EXIF data

The script `sulross_get_exif.py` was used to extract the EXIF field containing the species information from each image. The result is saved in `20190522_metadata.json`.

In [None]:
# Load the per-image EXIF metadata. The context manager closes the file handle
# as soon as the JSON has been read, instead of leaving it open.
with open('/Users/siyuyang/Source/temp_data/CameraTrap/engagements/SulRoss/20190522/20190522_metadata.json') as f:
    image_id_to_metadata = json.load(f)

In [None]:
len(image_id_to_metadata)

In [None]:
# Map every image ID to the species named in its metadata ('' when none is given).
image_id_to_species = {}
no_species = []  # (image_id, metadata) pairs without a 'Species|<name>' entry

for image_id, metadata in image_id_to_metadata.items():
    # Keep the value of every entry of the exact form 'Species|<value>'.
    species_values = [
        fields[1]
        for fields in (entry.split('|') for entry in metadata)
        if len(fields) == 2 and fields[0] == 'Species'
    ]
    if species_values:
        # only the first matching entry counts; 'None' means no species
        first_value = species_values[0]
        image_id_to_species[image_id] = '' if first_value == 'None' else first_value
    else:
        no_species.append((image_id, metadata))
        image_id_to_species[image_id] = ''
len(image_id_to_species)

In [None]:
len(no_species)  # number of images without EXIF field that says "Species|" - assume empty...

# Most empty images are denoted by "Species|None"

In [None]:
no_species[100]

Spot checked that these are empty of animals.

In [None]:
# Distinct species labels found across all images.
all_species = {*image_id_to_species.values()}

In [None]:
len(all_species)
all_species

In [None]:
# Replacements applied to the raw species labels: two spelling corrections,
# and the empty label is renamed to 'empty'.
name_change = {
    'Popcupine': 'Porcupine',
    'Blacktailed jackrabbit': 'Black-tailed Jackrabbit',
    '': 'empty'
}

# Further normalization is done when the image entries are built: all species names
# are lower-cased, and the leading _ that some of them have (like _Skunk) is removed.

Image IDs look like `Presidio001/Cam016/Presidio001__Cam016__2018-03-05__11-43-58(11).JPG`. The part up to and including the minute, `Presidio001/Cam016/Presidio001__Cam016__2018-03-05__11-43`, is used as the sequence ID, and the number in parentheses is the frame number.

In [None]:
def get_info_from_image_name(image_id):
    """Parse sequence, frame, timestamp and location out of an image ID.

    Args:
        image_id: '/'-separated image path of the form
            '<folder>/<camera>/<folder>__<camera>__<YYYY-MM-DD>__<HH-MM-SS>(<frame>).<ext>',
            e.g. 'Presidio001/Cam016/Presidio001__Cam016__2018-03-05__11-43-58(11).JPG'.

    Returns:
        A tuple (seq_id, frame_num, dt, location):
        - seq_id: the image ID without extension, cut before its last '-'
          (i.e. up to and including the minute of the timestamp).
        - frame_num: the integer inside the trailing parentheses.
        - dt: timestamp string formatted like '2019-05-19 08:57:43'.
        - location: '<folder>+<camera>', built from the first two path components.

    Raises:
        ValueError or IndexError: if image_id does not follow the pattern above.
    """
    # Strip only the file extension. Cutting at the first '.' in the whole path
    # would truncate the name whenever a folder or file name contains a dot.
    image_name = os.path.splitext(image_id)[0]
    frame_num = int(image_name.split('(')[-1].split(')')[0])

    # Everything before the last '-' (the seconds and frame number are dropped);
    # yields '' when there is no '-' at all.
    seq_id = image_name.rpartition('-')[0]

    parts = image_id.split('/')

    # want '2019-05-19 08:57:43'
    file_stem = os.path.splitext(parts[-1])[0]
    fields = file_stem.split('(')[0].split('__')
    date = fields[2]
    time = fields[3]
    dt = '{} {}'.format(date, time.replace('-', ':'))

    # location is folder_name+camera_id
    location = '{}+{}'.format(parts[0], parts[1])

    return seq_id, frame_num, dt, location

In [None]:
image_id = 'Presidio001/Cam016/Presidio001__Cam016__2018-03-05__11-43-58(11).JPG'
get_info_from_image_name(image_id)

In [None]:
image_id = 'Summer2018/D15/Summer2018__D15__2018-06-23__03-56-24(1).JPG'
get_info_from_image_name(image_id)

In [None]:
# Build one entry per image, normalizing the species label along the way, and
# count the images per sequence and per species.
images = []
seq_id_to_num_frames = defaultdict(int)
species_count = defaultdict(int)

for image_id, species in tqdm(image_id_to_species.items()):
    # apply the known label replacements (labels without one are kept as they are)
    species = name_change.get(species, species)

    # Remove only the leading underscore(s), e.g. '_Skunk' -> 'Skunk'.
    # Splitting on '_' would also cut off everything after an inner underscore.
    species = species.lstrip('_').lower()
    species_count[species] += 1

    seq_id, frame_num, dt, location = get_info_from_image_name(image_id)
    seq_id_to_num_frames[seq_id] += 1

    images.append({
            # the ID is the file name without its extension; only the extension is
            # stripped so that dots elsewhere in the path cannot truncate the ID
            'id': os.path.splitext(image_id)[0],
            'file_name': image_id,
            'datetime': dt,
            'seq_id': seq_id,
            'frame_num': frame_num,
            'location': location,
            'species': species
        })

In [None]:
images[1000]

In [None]:
species_count

In [None]:
len(species_count)

In [None]:
# Category IDs: 'empty' is always 0; every other species gets the next integer,
# starting at 1, in the iteration order of species_count.
category_map = {'empty': 0}

species = list(species_count.keys())

labelled_species = (name for name in species if name != 'empty')
for category_id, name in enumerate(labelled_species, start=1):
    category_map[name] = category_id

category_map
len(category_map)

In [None]:
final_images = []
annotations = []

for entry in images:
    entry_id = entry['id']

    # each image only has one species label in this dataset, so the annotation ID
    # is simply the image ID with an '_anno' suffix
    annotations.append({
            'id': entry_id + '_anno',
            'image_id': entry_id,
            'category_id': category_map[entry['species']]
        })

    num_frames = seq_id_to_num_frames[entry['seq_id']]
    entry['seq_num_frames'] = num_frames

    # frame_num starts at 1, so it should never exceed the sequence length;
    # print the entries where it does
    if entry['frame_num'] > num_frames:
        print(entry)

    final_images.append(entry)

Only one image had frame_num > seq_num_frames...

In [None]:
len(final_images)

In [None]:
len(annotations)

In [None]:
final_images[1000]
annotations[1000]

In [None]:
# Remove the 'species' field from the image entries.
# pop() with a default keeps this cell re-runnable: `del` raises KeyError
# when the field has already been removed by an earlier run.
for image in final_images:
    image.pop('species', None)

In [None]:
final_images[1000]

In [None]:
# One {'id', 'name'} entry per category, in the order of category_map.
categories = [
    {'id': category_id, 'name': category_name}
    for category_name, category_id in category_map.items()
]

In [None]:
len(categories)

In [None]:
# Assemble the database: dataset info plus the images, categories and annotations.
db = dict(
    info=dict(
        version='20190530',
        description='Sul Ross University data, from folders Presidio001 and Summer2018.',
        contributor='Patricia Harveson, Sul Ross University. Database created by Siyu Yang',
        year=2019,
        # timestamp of when this cell is run
        date_created=str(datetime.today()),
    ),
    images=final_images,
    categories=categories,
    annotations=annotations,
)

In [None]:
# Write the assembled database to disk as JSON.
db_file = '/Users/siyuyang/Source/temp_data/CameraTrap/engagements/SulRoss/20190522/Database/sulross_20190530.json'
with open(db_file, 'w') as out_file:
    json.dump(db, out_file, indent=1)