In [None]:
import datetime
import json
from lxml import etree
import os
# There are ways to size the image without loading it into memory by reading its headers (https://github.com/scardine/image_size), but seems less reliable.
from tqdm import tqdm
from eMammal_helpers import clean_species_name, clean_frame_number, get_img_size

In [None]:
# display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# auto reload external Python modules before each cell runs, so edits to
# helper modules are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
# display Matplotlib figures inline and set notebook-wide plotting defaults
import matplotlib.pyplot as plt

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'  # default interpolation for displayed images
plt.rcParams['image.cmap'] = 'gray'  # default colormap for displayed images

In [None]:
# configurations and paths
# Each path can be overridden through an environment variable so the notebook
# can run on another machine without editing this cell; the defaults are the
# locations used when the notebook was written.
output_dir_path = os.environ.get('EMAMMAL_OUTPUT_DIR', '/home/yasiyu/scripts/output')

# csv specifying the images sent for annotation
csv_path = os.environ.get('EMAMMAL_CSV_PATH', '/home/yasiyu/scripts/input/emammal_2018.08.20.csv')

# root folder containing one sub-folder per deployment; each deployment folder
# is expected to hold a deployment_manifest.xml next to its images
deployments_path = os.environ.get('EMAMMAL_DEPLOYMENTS_DIR', '/datadrive/emammal')

# Produce COCO format json database for eMammal 

Contact: Siyu Yang <yasiyu@microsoft.com>


**The content of this notebook has been refactored into `create_eMammal_json.py`. Please run, and make modifications on, that script instead.**




Run this notebook with Python 3.

This notebook produces the COCO formatted json database, which contains all the images whether they were annotated with bounding boxes or not. In this process, each image needs to be loaded to size it.

Bbox annotations will be added to the database when they arrive.

General format defined at http://cocodataset.org/#format-data, and specific to our work, see Sara's [definition](https://ai4edevelopment.visualstudio.com/AI%20for%20Earth%20Development/AI%20for%20Earth%20Development%20Team/_git/cameraTraps?path=%2Fdatabase_tools%2FREADME.md&version=GBmaster).

I decided to save the sequence-level ResearcherIdentifications info in each image object. Where a sequence has multiple different species, they are semicolon-separated.

```
image{
  "id" : str,
  "width" : int,
  "height" : int,
  "file_name" : str,
  "rights_holder" : str,  # not included
  "location": str,  # not int
  "datetime": datetime,
  "seq_id": str,
  "seq_num_frames": int,
  "frame_num": int,
  "label": str  # just for eMammal
}
```

Image ID is in this format:

`datasetemammal.projectp100.deploymentd17432.seqd17432s11.frame004.imgd17432s11i4`

Image file name:

`emammal/3191d36836/d36836s14i1.JPG` or `.jpg`

In [None]:
# Top-level "info" section of the COCO-style database; the creation date is
# stamped with the day this cell is executed.
db_info = dict(
    year=2018,
    version='0.0.1',
    description='eMammal dataset containing 3140 deployments, in COCO format.',
    contributor='eMammal',
    date_created=str(datetime.date.today()),
)

In [None]:
### Sequential

In [None]:
# Build the image entries of the database one deployment at a time.
# Every image is opened inline to read its dimensions, so this cell is slow;
# the threaded cells further down do the same work with parallel IO.
db_images = []
for deployment in tqdm(os.listdir(deployments_path)):
    deployment_path = os.path.join(deployments_path, deployment)
    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')

    with open(manifest_path, 'r') as f:
        tree = etree.parse(f)

    # deployment-level fields shared by every image in this deployment
    root = tree.getroot()
    project_id = root.findtext('ProjectId')
    deployment_id = root.findtext('CameraDeploymentID')
    deployment_location = root.findtext('CameraSiteName')

    image_sequences = root.findall('ImageSequence')

    for sequence in image_sequences:
        seq_id = sequence.findtext('ImageSequenceId')

        # get species info for this sequence: the distinct cleaned common names
        # across all researcher identifications, joined with ';'
        researcher_identifications = sequence.findall('ResearcherIdentifications')
        species = set()

        for researcher_id in researcher_identifications:
            # named `identification` rather than `id` so the builtin id() is
            # not shadowed in the notebook's global namespace
            for identification in researcher_id.findall('Identification'):
                species_common_name = clean_species_name(identification.findtext('SpeciesCommonName'))
                species.add(species_common_name)

        species_str = ';'.join(sorted(species))

        # add each image's info to database
        images = sequence.findall('Image')
        # number of frames in the sequence, taken as the number of <Image>
        # elements the manifest lists for it
        seq_num_frames = len(images)

        for img in images:
            img_id = img.findtext('ImageId')
            img_file_name = img.findtext('ImageFileName')

            # some are .JPG and some are .jpg
            assert img_file_name.lower().endswith('.jpg'), \
                'unexpected image extension: {}'.format(img_file_name)

            img_datetime = img.findtext('ImageDateTime')  # these are in different formats...
            img_frame = clean_frame_number(img.findtext('ImageOrder'))

            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(
                project_id, deployment_id, seq_id, img_frame, img_id)

            full_img_path = os.path.join(deployment_path, img_file_name)
            img_width, img_height = get_img_size(full_img_path)

            db_images.append({
                'id': full_img_id,
                'width': img_width,
                'height': img_height,
                'file_name': os.path.join('emammal', deployment, img_file_name),
                'location': deployment_location,
                'datetime': img_datetime,
                'seq_id': seq_id,
                'seq_num_frames': seq_num_frames,
                'frame_num': int(img_frame),  # position of this image within its sequence
                'label': species_str
            })
        
        

In [None]:
# Show the entry count and a small sample only: the list holds one dict per
# image, so rendering all of it would flood the notebook output.
print('{} image entries'.format(len(db_images)))
db_images[:5]

### Threaded for faster IO

In [None]:
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool  # same Pool API as multiprocessing, but backed by threads

import warnings
# NOTE(review): this suppresses every warning for the rest of the session;
# presumably meant to quiet warnings raised while opening images - confirm.
warnings.filterwarnings('ignore')

In [None]:
def _add_image(entry, full_img_path):
    """Fill in the real image dimensions for one database entry.

    Worker function for the thread pool. Reads the size of the image at
    `full_img_path` with `get_img_size`, stores it under the entry's 'width'
    and 'height' keys (mutating `entry` in place), advances the module-level
    progress bar `pbar` by one step, and returns the same entry object.
    """
    width, height = get_img_size(full_img_path)
    entry.update(width=width, height=height)
    pbar.update()  # one step per processed image
    return entry

In [None]:
# Walk every deployment manifest and queue one (entry, image path) task per
# image. Width and height are left as placeholders here; _add_image fills them
# in when the tasks are run on the thread pool.
tasks = []

print('Looping through the deployments...')
for deployment in tqdm(os.listdir(deployments_path)):
    deployment_path = os.path.join(deployments_path, deployment)
    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')

    with open(manifest_path, 'r') as f:
        tree = etree.parse(f)

    # deployment-level fields shared by every image in this deployment
    root = tree.getroot()
    project_id = root.findtext('ProjectId')
    deployment_id = root.findtext('CameraDeploymentID')
    deployment_location = root.findtext('CameraSiteName')

    image_sequences = root.findall('ImageSequence')

    for sequence in image_sequences:
        seq_id = sequence.findtext('ImageSequenceId')

        # get species info for this sequence: the distinct cleaned common names
        # across all researcher identifications, joined with ';'
        researcher_identifications = sequence.findall('ResearcherIdentifications')
        species = set()

        for researcher_id in researcher_identifications:
            # named `identification` rather than `id` so the builtin id() is
            # not shadowed in the notebook's global namespace
            for identification in researcher_id.findall('Identification'):
                species_common_name = clean_species_name(identification.findtext('SpeciesCommonName'))
                species.add(species_common_name)

        species_str = ';'.join(sorted(species))

        # add each image's info to database
        images = sequence.findall('Image')
        # number of frames in the sequence, taken as the number of <Image>
        # elements the manifest lists for it
        seq_num_frames = len(images)

        for img in images:
            img_id = img.findtext('ImageId')
            img_file_name = img.findtext('ImageFileName')

            # some are .JPG and some are .jpg
            assert img_file_name.lower().endswith('.jpg'), \
                'unexpected image extension: {}'.format(img_file_name)

            img_datetime = img.findtext('ImageDateTime')  # these are in different formats...
            img_frame = clean_frame_number(img.findtext('ImageOrder'))

            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(
                project_id, deployment_id, seq_id, img_frame, img_id)

            full_img_path = os.path.join(deployment_path, img_file_name)

            entry = {
                'id': full_img_id,
                'width': 0,  # place holders
                'height': 0,
                'file_name': os.path.join('emammal', deployment, img_file_name),
                'location': deployment_location,
                'datetime': img_datetime,
                'seq_id': seq_id,
                'seq_num_frames': seq_num_frames,
                'frame_num': int(img_frame),  # position of this image within its sequence
                'label': species_str
            }

            tasks.append((entry, full_img_path))

In [None]:
# Size all queued images on a thread pool (the work is IO-bound) and collect
# the completed entries, in task order, into db_images.
print('Finished creating tasks.')
num_workers = multiprocessing.cpu_count()
pool = ThreadPool(num_workers)
pbar = tqdm(total=len(tasks))  # module-level on purpose: _add_image updates it

try:
    # starmap blocks until every task has been processed
    db_images = pool.starmap(_add_image, tasks)
    print('Waiting for processes to finish...')
finally:
    # release the worker threads and the progress bar even if a task raised
    pool.close()
    pool.join()
    pbar.close()
print('Done.')

In [None]:
# number of queued (entry, image path) tasks, i.e. images found in the manifests
len(tasks)

Some images could not be opened by PIL. Inspecting them locally shows that they are corrupt image files.

In [None]:
from PIL import Image 

In [None]:
img_path = '/datadrive/emammal/p168d31859/d31859s15i2.JPG'
im = Image.open(img_path)