In [None]:
import csv
import json
import uuid
import datetime
import pandas as pd
from lxml import etree
from collections import defaultdict, Counter
from tqdm import tqdm_notebook as tqdm
import os
import matplotlib.pyplot as plt
import statistics
import numpy as np
from eMammal_helpers import clean_species_name, get_total_from_distribution, sort_dict_val_desc, plot_distribution, plot_histogram

In [None]:
# print all outputs in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# auto reload external Python modules
%load_ext autoreload
%autoreload 2

# display Matplotlib figures inline and set default size
%matplotlib inline
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.figsize'] = (8.0, 3.0)
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.rcParams['font.size'] = 9

# eMammal dataset stats

Contact: Siyu Yang <yasiyu@microsoft.com>

Run this notebook with Python 3.

This notebook inspects the eMammal dataset we have in August 2018. A subset of this is sent to iMerit for bounding box annotations. The CSV at `csv_path` contains all the images that have been sent to iMerit for annotations in the first batch.


In [None]:
# --- Paths (specific to the machine this notebook was written on) ---
output_dir_path = '/home/yasiyu/yasiyu_temp'
csv_path = '/home/yasiyu/scripts/input/emammal_2018.08.20.csv'  # csv specifying the images sent for annotation
deployments_path = '/datadrive/emammal'


# --- Tag vocabularies ---
# Raw spellings (including misspellings and stray whitespace) of tags grouped as
# "people"; the public constant holds the lower-cased form of each.
_people_tags = {
    'Bicycle', 'Calibration Photos', 'Setup Pickup', 'Vehicle',
    'Camera Trapper', 'camera trappper', 'camera  trapper', 'camera trappe',
    'Homo sapien', 'Homo sapiens',
    'Human, non staff', 'Human, non-staff', 'Human non-staff',
}
PEOPLE_TAGS = set(map(str.lower, _people_tags))

# Raw spellings of tags grouped as "no animal"; lower-cased the same way.
_no_animal_tags = {
    'No Animal',
    'no  animal',
    'Time Lapse',
    'Camera Misfire',
    'False trigger',
    'Blank',
}
NO_ANIMAL_TAGS = set(map(str.lower, _no_animal_tags))

## Data

The original images and classification annotations are in the `emammal` container in the `wildlifeblobssc` storage account in the AI for Earth Development subscription. The `emammal` container holds collections named after the researcher responsible for them and a number indicating the batch. These were downloaded to the `bobcat` VM's 2TB data disk at `/datadrive/emammal`, forgoing the collection folder level. The script for downloading them to the data disk is at `database_tools/copy_and_unzip_emammal.py`.

At `/datadrive/emammal`, each folder is one deployment. Each deployment folder contains the sequences of images and a .xml file with information such as timestamp and animal species labels.

There are a total of 3140 deployments in the 0McShea and 0Kays collections, and 126 in the 0Long collection.

A deployment is a set of image sequences specific in space and time. So a site name could be shared by multiple deployments. I suspect there is an issue with the two projects from China (p193 and p195) with how they name their deployments - they seem to have named some of them with the site name.

### First batch of annotations

Here we load the csv and an example xml file.

In [None]:
# Column names imposed on the CSV; with header=0 the file's own first row is
# consumed as the header and replaced by these names.
annotation_csv_columns = [
    'projectID',
    'projectName',
    'deploymentID',
    'siteName',
    'speciesPresentCommon',
    'imageID',
    'annotationSetFileName',
]

# Reading with 'utf-8' did not work, hence ISO-8859-1
data = pd.read_csv(csv_path, header=0, names=annotation_csv_columns, encoding='ISO-8859-1')

In [None]:
# Peek at 10 randomly sampled rows of the annotation CSV.
data.sample(n=10)
# Note that speciesPresentCommon is ';'-separated when more than one species is present.

In [None]:
# Number of rows in the annotation CSV
len(data)  # 18418 images are getting bbox annotations

In [None]:
# Show a single value of the annotationSetFileName column (row label 3)
data.loc[3, 'annotationSetFileName']  # annotationSetFileName example

In [None]:
# Distinct project IDs that appear in the annotation CSV, in sorted order
sorted(data.projectID.unique())

Project p126 is not found in blob storage...

In [None]:
# Rows of the annotation CSV whose projectID is 'p126'
data[data.projectID == 'p126']

In [None]:
# For every annotated file name, keep the token that sits between the first
# '.frame' and the next '.' after it.
all_annotated_img = data['annotationSetFileName']
frames = []
for file_name in all_annotated_img:
    after_frame = file_name.split('.frame')[1]
    frames.append(after_frame.split('.')[0])

In [None]:
# Distinct frame tokens seen across all annotated file names
possible_frames = set(frames)
possible_frames

Note that the seq ID can just be `s11` without the deployment ID `d17432` prefix. But that's how it is in the `<ImageSequenceId>` item.


No "human" images were included in the annotation set.

In [None]:
# Manifest of one deployment, used below as an example of the XML structure
sample_xml_path = '/datadrive/emammal/4180d18095/deployment_manifest.xml'

In [None]:
# Parse the example manifest and list the direct children of its root element.
# The path is handed straight to lxml; the previous `with open(...)` opened a
# file handle that was never read, because the parser was given the path anyway.
tree = etree.parse(sample_xml_path)
root = tree.getroot()
for child in root:
    print(child)

In [None]:
# Text of the first <ProjectId> element in the example manifest
tree.findtext('ProjectId')

In [None]:
# Same kind of lookup via find(); note that .text raises AttributeError if the tag is absent
tree.find('CameraDeploymentID').text

In [None]:
# Direct children of the first <ImageSequence> in the example manifest
sequence = tree.find('ImageSequence')
for child in sequence:
    print(child)

print('')

# species is only identified at the sequence level, not image level
research_ids = sequence.find('ResearcherIdentifications')
example_id = research_ids[0]
for c in example_id:
    print('{} - {}'.format(c, c.text))


print('\nImage has the following fields. The ImageIdentifications is empty.')
# Look the <Image> element up explicitly. Iterating over the loop variable
# left behind by the first loop only works when the last child of the
# sequence happens to be an <Image>.
example_image = sequence.find('Image')
if example_image is None:
    print('(no <Image> element found in this sequence)')
else:
    for image_field in example_image:
        print(image_field)

## Dataset stats

All of the results below (apart from where noted) are based on the entire eMammal dataset, not just the images that were sent for annotation.

It is quite fast (~30 seconds) to run the cell below, reading the 3140 xml files.

In [None]:
# Walk every deployment folder, parse its manifest and accumulate dataset-wide tallies.

# What species are there: cleaned common name -> number of <Identification> items naming it
species_tally = defaultdict(int)

# How many animals of the same species are there in each sequence - the content of <Count>
# Note that even if the identification is "empty"/"no animal", the <Count> tag will still be 1
animal_counts = []

# How many <Identification> items are there in each <ResearcherIdentifications> element
# (one entry is appended per such element found in a sequence)
num_identifications = []

# Sequence ID for all sequences that have more than one <Identification> : What species are present
seq_with_multi_ids = {}

# How many sequences are there in each deployment
num_sequences_in_d = []

# How many images are in each sequence
num_images_in_seq = []


# TODO Could get these info for each project separately.


# List the folder once, and sort it so that the order of every accumulated list
# (and of seq_with_multi_ids) is the same on every run.
deployment_names = sorted(os.listdir(deployments_path))
total_num_deployments = len(deployment_names)

for deployment in tqdm(deployment_names):
    deployment_path = os.path.join(deployments_path, deployment)
    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')

    # Give lxml the path rather than a text-mode file handle: the parser then
    # reads the raw bytes and applies the encoding declared in the XML itself,
    # instead of whatever the locale's default text encoding happens to be.
    tree = etree.parse(manifest_path)

    root = tree.getroot()
    project_id = root.findtext('ProjectId')
    deployment_id = root.findtext('CameraDeploymentID')

    image_sequences = root.findall('ImageSequence')
    num_sequences_in_d.append(len(image_sequences))

    for sequence in image_sequences:
        images = sequence.findall('Image')
        num_images_in_seq.append(len(images))

        # get species info
        seq_id = sequence.findtext('ImageSequenceId')
        full_seq_id = 'datasetemammal.project{}.deployment{}.seq{}'.format(project_id, deployment_id, seq_id)

        researcher_identifications = sequence.findall('ResearcherIdentifications')

        for researcher_id in researcher_identifications:
            identifications = researcher_id.findall('Identification')
            num_identifications.append(len(identifications))

            species = []

            # Named `identification` rather than `id` so the builtin id() is not
            # shadowed in the notebook namespace after this cell has run.
            for identification in identifications:
                species_common_name = clean_species_name(identification.findtext('SpeciesCommonName'))
                species_tally[species_common_name] += 1

                species.append(species_common_name)

                count = identification.findtext('Count')
                animal_counts.append(int(count))

            # Remember the species list only for sequences with several identifications
            if len(identifications) > 1:
                seq_with_multi_ids[full_seq_id] = species

### Number of images, sequences and deployments

In [None]:
# Summary statistics and histograms for the number of sequences per deployment
seq_total = sum(num_sequences_in_d)
print('Total number of sequences in the dataset: {}'.format(seq_total))

seq_stats = (
    np.median(num_sequences_in_d),
    np.mean(num_sequences_in_d),
    min(num_sequences_in_d),
    max(num_sequences_in_d),
)
print('Median of {:.0f} sequences in a deployment, average of {:.2f}, min {:.0f}, max {:.0f}'.format(*seq_stats))

# Full range first, then the same data with max_val=200
plot_histogram(num_sequences_in_d, 'Histogram of the number of sequences in deployments')
plot_histogram(num_sequences_in_d, 'Histogram of the number of sequences in deployments, max=200', max_val=200)

In [None]:
# Summary statistics and histogram for the number of images (frames) per sequence
image_total = sum(num_images_in_seq)
print('Total number of images in the dataset: {}'.format(image_total))

image_stats = (
    np.median(num_images_in_seq),
    np.mean(num_images_in_seq),
    min(num_images_in_seq),
    max(num_images_in_seq),
)
print('Median of {:.0f} images in a sequence, average of {:.2f}, min {:.0f}, max {:.0f}'.format(*image_stats))

plot_histogram(num_images_in_seq, 'Histogram of the number of images in sequences, max=40', max_val=40)

Verified that this number of images are present as .jpg:

In [None]:
# Count the files with a .jpg extension (any letter case) that sit directly
# inside each deployment folder, and report the grand total.
total_num_images = 0

for deployment in tqdm(os.listdir(deployments_path)):
    deployment_path = os.path.join(deployments_path, deployment)
    jpg_names = [name for name in os.listdir(deployment_path) if name.lower().endswith('.jpg')]
    total_num_images += len(jpg_names)
print('Total of {} images found.'.format(total_num_images))

### Species present

In [None]:
# Plot the species tally; `top=30` presumably limits the plot to the 30 largest entries (see eMammal_helpers)
plot_distribution(species_tally, title='Number of sequences with the species', top=30)

In [None]:
# Show the full species tally; judging by the helper's name it is ordered by value, descending (see eMammal_helpers)
sort_dict_val_desc(species_tally)

### Count tag `<Count>` 

An example where the count is 3 is `project3062d20814.deploymentd20814.sequenced20814s25`. There are 3 Northern Raccoons but no more than 2 appear in one image.

Inspecting a few other such sequences, the count doesn't seem particularly useful. A sequence labeled with count 3 could be 2 animals that become invisible in a few frames.

As noted before, even if the identification is "empty"/"no animal", the `<Count>` tag will still be 1.

In [None]:
# Summary statistics for the <Count> values gathered from every identification
count_total = sum(animal_counts)
print('Total number of Count item values in the dataset: {}'.format(count_total))

count_stats = (
    np.median(animal_counts),
    np.mean(animal_counts),
    min(animal_counts),
    max(animal_counts),
)
print('Median of {:.0f} for the Count item, average of {:.2f}, min {:.0f}, max {:.0f}'.format(*count_stats))

# How often each distinct Count value occurs
counter = Counter(animal_counts)
print(counter)

### Multiple animals or species in the same image sequence

Some image sequences contain more than one `<Identification>` item in `<ResearcherIdentifications>`. There is also a `<Count>` for each `<Identification>`, which seems to mean how many animals of that species are there. This is a little inconsistent: sometimes both `<Identification>` are of the same species as you can see below.

This is problematic because iMerit only labels bounding boxes for a single class, 'animal', so when an image contains multiple bounding boxes of different species, there's no way to assign the correct species to each box (other than manually).

In [None]:
# Summary statistics for the number of <Identification> items recorded per entry
identification_total = sum(num_identifications)
print('Total number of identification items in the dataset: {}'.format(identification_total))

identification_stats = (
    np.median(num_identifications),
    np.mean(num_identifications),
    min(num_identifications),
    max(num_identifications),
)
print('Median of {:.0f} identification items per sequence, average of {:.2f}, min {:.0f}, max {:.0f}'.format(*identification_stats))

# How often each distinct number of identifications occurs
counter = Counter(num_identifications)
print(counter)

152483 / 174277 = 87.5% of sequences have just one identification item in the xml.

In [None]:
# Species lists of the first 20 entries recorded for sequences with more than one identification
list(seq_with_multi_ids.values())[:20]  # what species are there in sequences with multiple identifications

In [None]:
# Sequence part of each annotated file name: everything before the first '.frame'
all_annotated_img = data['annotationSetFileName']
all_annotated_seq = {file_name.split('.frame')[0] for file_name in all_annotated_img}
len(all_annotated_seq)

Here are the species lists for sequences __in bbox annotated pictures__ that have more than one identification, excluding sequences that involve humans and sequences where all IDs are of the same species:

In [None]:
# Print the species list of every annotated sequence that has several
# identifications, none of them 'human', and at least two distinct species.
for seq_id, animals in seq_with_multi_ids.items():
    if seq_id not in all_annotated_seq:
        continue
    if 'human' in animals or len(set(animals)) <= 1:
        continue
    print(animals)

So not that many images that are animals-only and contain more than one label. This is few enough to label manually.

## Notes

Some other names used for denoting human and empty sequences:
```
other_human_tags = {
    'camera  trapper', # extra whitespace or misspelt
    'camera trapper ',
    'camera trappper'
}

other_no_animal = {
    'animal not on list',  # should map to 'unknown animal'?
    'no  animal',  # extra whitespace
}
```

## Bounding box annotations

To come. 