In [None]:
import copy
import pandas as pd
import csv
import json
from lxml import etree
import os
from tqdm import tqdm
from collections import defaultdict
import statistics
import shutil
from azure.storage.blob import BlobServiceClient
from eMammal_helpers import *

In [None]:
# print all outputs in a cell: every top-level expression is displayed, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# auto reload external Python modules, so edits to imported .py files are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

# display Matplotlib figures inline and set default size
import matplotlib.pyplot as plt
%matplotlib inline
# default figure resolution and size (inches) for every plot in this notebook
plt.rcParams['figure.dpi'] = 120
plt.rcParams['figure.figsize'] = (8.0, 3.0)
# defaults applied when images are shown with Matplotlib
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
plt.rcParams['font.size'] = 9

In [None]:
# configurations and paths
# output directory; the CSV, image-copy and zip paths used further down all live under this folder
output_dir_path = '/home/yasiyu/yasiyu_temp'

# directory that is listed to discover deployments; each deployment folder holds a deployment_manifest.xml
deployments_path = '/datadrive/emammal'

projects_of_interest = ['p139', 'p158']  # the Robert Long collection had deployments from these two projects

# Prepare an iMerit job

This notebook samples a subset of the Robert Long collection of eMammal data from the Seattle area and 
copies the sampled images to a folder to hand off to iMerit for annotation.

Aim to have a sample of ~20k images:
- exclude empty sequences
- do not include too many humans
- reasonable diversity across species
- reasonable diversity across deployments
- balance between day/night (time of day is okay for this purpose)
- for simplicity, may avoid sequences with multiple _different_ species, since we'll use these labels for classification as well.
- could bias towards species that the GIX project needs

Image name format:

`dataset[datasetname].project[projectID].deployment[deploymentID].seq[sequenceID].frame[frameNumber].img[imageID].jpg
`

e.g.

`datasetemammal.project3062.deploymentd16546.seqd16546s14.frame001.imgd16546s14i1.jpg`

In [None]:
# find the deployments in the Robert Long collection

# For deployments in the Robert Long collection the folder names are prefixed with the
# project ID (not true for eMammal in general), so a prefix match is enough here.
# str.startswith accepts a tuple and matches if any of the prefixes matches.
project_prefixes = tuple(projects_of_interest)
deployments = [
    folder_name
    for folder_name in os.listdir(deployments_path)
    if folder_name.startswith(project_prefixes)
]

In [None]:
len(deployments)  # confirmed by using Storage Explorer that the 0Robert Long collection has 126 zips indeed.

Make a dataframe, one row each sequence, columns include

`full_seq_id, project_id, deployment_id, species, is_empty, is_human_only, is_multi_species, num_frames, start_time, is_daytime`

`species` is a semicolon-separated list of the unique species in the sequence, sorted.

Make another dataframe for images, one row per image, columns include

`full_img_id, full_img_path, full_seq_id, project_id, project_name, deployment_id, site_name, species, img_id`

Note that `full_img_id` is generated from information in the xml file only, and you might not be able to construct `full_img_path` from it (hence the need for this mapping) because of errors in naming the folder structures.

Checked that every `full_img_path` exists on the data disk. From previous experience, some images may be corrupted.

In [None]:
# Walk every selected deployment, parse its manifest, and collect
#   - one record per sequence (sequences_list -> df_seq)
#   - one record per image    (images_list    -> df_img)
#   - tallies of how often each species label is identified
sequences_list = []
images_list = []
species_tally = defaultdict(int)

# is the species distribution different for sequences with no more than 20 frames
species_tally_short_seq = defaultdict(int)
max_num_frames = 20

for deployment in tqdm(deployments):
    deployment_path = os.path.join(deployments_path, deployment)
    manifest_path = os.path.join(deployment_path, 'deployment_manifest.xml')

    with open(manifest_path, 'r') as f:
        tree = etree.parse(f)

    # deployment-level fields, shared by every sequence and image of this manifest
    root = tree.getroot()
    project_id = root.findtext('ProjectId')
    project_name = root.findtext('ProjectName')
    deployment_id = root.findtext('CameraDeploymentID')
    site_name = root.findtext('CameraSiteName')

    image_sequences = root.findall('ImageSequence')
    for sequence in image_sequences:
        images = sequence.findall('Image')
        num_frames = len(images)

        # sequences
        seq_id = sequence.findtext('ImageSequenceId')
        full_seq_id = 'datasetemammal.project{}.deployment{}.seq{}'.format(project_id, deployment_id, seq_id)

        # get species info: every Identification under every ResearcherIdentifications element
        # counts towards the tallies; the set keeps the unique labels of this sequence
        researcher_identifications = sequence.findall('ResearcherIdentifications')
        species_in_seq = set()
        for researcher_id in researcher_identifications:
            # named `identification` rather than `id` so the builtin id() is not shadowed
            for identification in researcher_id.findall('Identification'):
                species_common_name = clean_species_name(identification.findtext('SpeciesCommonName'))
                species_tally[species_common_name] += 1
                species_in_seq.add(species_common_name)

                if num_frames <= max_num_frames:
                    species_tally_short_seq[species_common_name] += 1

        # unique species of the sequence, sorted and joined with ';' (e.g. 'domestic dog;human')
        species_str = ';'.join(sorted(species_in_seq))

        seq_start_time = sequence.findtext('ImageSequenceBeginTime')
        start_time = parse_timestamp(seq_start_time)

        sequences_list.append({
            'full_seq_id': full_seq_id,
            'project_id': project_id,
            'deployment_id': deployment_id,
            'species': species_str,
            'is_empty': species_str == 'empty',
            'is_human_only': species_str == 'human',
            'is_multi_species': len(species_in_seq) > 1,
            'num_frames': num_frames,
            'start_time': start_time,
            'is_daytime': is_daytime(start_time)
        })

        # images
        for img in images:
            img_id = img.findtext('ImageId')
            img_file_name = img.findtext('ImageFileName')
            assert img_file_name.endswith('.JPG'), 'Unexpected image file name: {}'.format(img_file_name)
            try:
                img_frame = clean_frame_number_4_digit(img.findtext('ImageOrder'))
            except Exception:
                # Some manifests have no usable ImageOrder; fall back to the number embedded in the
                # file name (the text between the first 'i' and the following '.').
                # `except Exception` (not a bare except) so that interrupting the kernel still works.
                img_frame = img_file_name.split('i')[1].split('.')[0]
                img_frame = clean_frame_number_4_digit(img_frame)

            full_img_id = 'datasetemammal.project{}.deployment{}.seq{}.frame{}.img{}'.format(
                project_id, deployment_id, seq_id, img_frame, img_id)
            # the path is built from the folder on disk, not from the IDs in the manifest
            full_img_path = os.path.join(deployment_path, img_file_name)

            # report (but still record) images that are listed in the manifest but missing on disk
            if not os.path.exists(full_img_path):
                print('Path does not exist: {}'.format(full_img_path))

            images_list.append({
                'full_img_id': full_img_id,
                'full_img_path': full_img_path,
                'full_seq_id': full_seq_id,
                'project_id': project_id,
                'project_name': project_name,
                'deployment_id': deployment_id,
                'site_name': site_name,
                'species': species_str,
                'img_id': img_id
            })

Images in the deployment `p139d18649` only have four attributes:

```
<Element Image at 0x7f0feeb07388>
<Element ImageId at 0x7f0feeb07288>
<Element ImageFileName at 0x7f0feeb07508>
<Element ImageInterestRanking at 0x7f0feebbf708>
```

Their frame numbers are assigned based on the image file names.

In [None]:
# number of sequence records collected from the manifests
len(sequences_list)

In [None]:
# number of image records collected from the manifests
len(images_list)

### Species distribution in the 0Robert Long WA collection

In [None]:
# tally over all sequences, humans and empties included
# (`top=30` presumably limits the plot to the 30 most frequent labels - confirm in eMammal_helpers)
plot_distribution(species_tally, title='Number of sequences with the species', top=30)

In [None]:
# Work on a copy so species_tally itself keeps the human / empty / dog counts,
# then zero out the three dominant labels so the wildlife species become visible.
species_tally_animals = copy.deepcopy(species_tally)
for excluded_label in ('human', 'empty', 'domestic dog'):
    species_tally_animals[excluded_label] = 0
plot_distribution(species_tally_animals, title='Number of sequences with the species, excl. human, empty, dog', top=30)

### Species distribution in the 0Robert Long WA collection, among sequences with 20 frames or fewer

In [None]:
# same plot, restricted to the tally of sequences with at most max_num_frames frames
plot_distribution(species_tally_short_seq, title='Number of sequences with the species, num_frames <= 20', top=30)

In [None]:
# Same as for the full tally: copy the short-sequence tally and zero out
# human / empty / domestic dog before plotting. Note this rebinds species_tally_animals.
species_tally_animals = copy.deepcopy(species_tally_short_seq)
for excluded_label in ('human', 'empty', 'domestic dog'):
    species_tally_animals[excluded_label] = 0
plot_distribution(species_tally_animals, title='Number of sequences with the species, excl. human, empty, dog, num_frames <= 20', top=30)

### Understand the distribution of species and number of frames

In [None]:
# df_seq: one row per sequence; df_img: one row per image (columns are the keys of the collected dicts)
df_seq = pd.DataFrame(sequences_list)
df_img = pd.DataFrame(images_list)

In [None]:
# number of sequences, and the dtypes pandas inferred for each column
len(df_seq)
df_seq.dtypes

In [None]:
# number of images
len(df_img)

In [None]:
# eyeball a few random sequence rows (unseeded, so different on every run)
df_seq.sample(n=5)

In [None]:
# eyeball a few random image rows (unseeded, so different on every run)
df_img.sample(n=3)

#### Refine the sequences we want to label

In [None]:
# keep only sequences that are neither empty nor human-only
df_seq_animals = df_seq[~(df_seq.is_empty | df_seq.is_human_only)]

In [None]:
# display the non-empty, non-human-only sequences
df_seq_animals

#### What is the distribution of the number of frames per sequence, excluding empty, human-only and dog-walking sequences?

In [None]:
# Sequences that are neither empty, human-only nor dog-walking. Dog-walking sequences are the
# ones whose species string is exactly 'domestic dog;human'. Defined here so that this cell
# runs on a fresh kernel.
df_seq_no_dog_walking = df_seq_animals[df_seq_animals.species != 'domestic dog;human']

# summary statistics of the number of frames per sequence
num_frames = df_seq_no_dog_walking.num_frames
statistics.mean(num_frames)
statistics.median(num_frames)
max(num_frames)
min(num_frames)

In [None]:
# distribution of the per-sequence frame counts from the previous cell
# (plot_histogram is not defined in this notebook; presumably it comes from eMammal_helpers)
plot_histogram(num_frames)

In [None]:
# what fraction of these sequences are more than 20 frames long? - only about 5%
sum(num > 20 for num in num_frames) / len(num_frames)

## Select the sequences to be annotated
#### Start from the sequences that are neither human-only nor empty. Exclude sequences longer than 20 frames, and exclude sequences with multiple species

In [None]:
# drop the long sequences; reuse max_num_frames (20) so this cut-off stays in sync with the
# one used for species_tally_short_seq
df_seq_short = df_seq_animals[df_seq_animals.num_frames <= max_num_frames]

In [None]:
# number of images before and after dropping the long sequences
sum(df_seq_animals['num_frames'])
sum(df_seq_short['num_frames'])

#### Separate out dog-walking, horse-riding, coyote/mule deer (two most common non-domestic species) sequences

In [None]:
# dog walking: the sequence contains exactly the two labels 'domestic dog' and 'human'
is_dog_walking = df_seq_short['species'] == 'domestic dog;human'
df_seq_dog_walking = df_seq_short[is_dog_walking]
len(df_seq_dog_walking)

In [None]:
# horse riding: the sequence contains exactly the two labels 'domestic horse' and 'human'
is_horse_riding = df_seq_short['species'] == 'domestic horse;human'
df_seq_horse_riding = df_seq_short[is_horse_riding]
len(df_seq_horse_riding)

In [None]:
# single-species coyote or mule deer sequences (the two most common non-domestic species)
df_seq_coyote_deer = df_seq_short[df_seq_short.species.isin(['coyote', 'mule deer'])]
len(df_seq_coyote_deer)
df_seq_coyote_deer.sample(n=3)

#### Get rid of all multi-species sequences (not that many after excluding the dog-walking and horse-riding ones)

In [None]:
# keep only the short sequences in which a single species was identified
def_single_species = df_seq_short[~df_seq_short.is_multi_species]

In [None]:
# "rare" = every single-species sequence that is not coyote or mule deer; all of these are kept
df_seq_rare = def_single_species[~def_single_species.species.isin(['coyote', 'mule deer'])]
len(df_seq_rare)
df_seq_rare.sample(n=5)

In [None]:
# down-sample the coyote / mule deer sequences to 800 so they do not dominate the selection
# NOTE: no random_state is passed, so every run draws a different set of 800 sequences
df_seq_coyote_deer_sample = df_seq_coyote_deer.sample(n=800)
len(df_seq_coyote_deer_sample)
df_seq_coyote_deer_sample.sample(n=3)

In [None]:
# final selection = all rare-species sequences + the coyote / mule deer sample
selected_parts = [df_seq_rare, df_seq_coyote_deer_sample]
df_seq_good = pd.concat(selected_parts)

n_selected_sequences = len(df_seq_good)
n_selected_images = sum(df_seq_good.num_frames)
print('Total number of sequences: {}'.format(n_selected_sequences))
print('Total number of images: {}'.format(n_selected_images))
df_seq_good.sample(n=3)

In [None]:
# count the selected sequences per species label
final_species = defaultdict(int)
for species_label in df_seq_good['species']:
    final_species[species_label] += 1

In [None]:
# species make-up of the final selection of sequences
plot_distribution(final_species, title='Species in the resampled dataset', top=30)

#### Save a record of the sequences selected for relabeling 
Note that since the coyote and mule deer entries are sampled randomly, running this notebook again will NOT generate the same set of sequences.

In [None]:
# save the selected sequences under the configured output directory
# (output_dir_path is '/home/yasiyu/yasiyu_temp', so the resulting file path is unchanged)
df_seq_good.to_csv(os.path.join(output_dir_path, '0RobertLong_sequences_20180907.csv'), index=False)

In [None]:
# need to load df_seq_good from csv if you need to find the images in these sequences again.

#### Save a record of the images in the selected sequences

In [None]:
# For every selected sequence, look up its rows in df_img and remember
#   - the rows themselves (concatenated into df_img_good later)
#   - one (source path, full_img_id) pair per image, used when copying the files
img_selected_paths = []
list_df_images_in_seq = []
for full_seq_id in tqdm(df_seq_good.full_seq_id):
    images_in_seq = df_img.loc[df_img.full_seq_id == full_seq_id]
    list_df_images_in_seq.append(images_in_seq)
    img_selected_paths.extend(
        zip(images_in_seq.full_img_path.tolist(), images_in_seq.full_img_id.tolist())
    )

In [None]:
# the expected count only holds for the original selection: the coyote / mule deer
# sequences are sampled randomly, so the number of images differs between runs
len(img_selected_paths)  # should be 14867

In [None]:
# one dataframe with the df_img rows of every selected sequence
df_img_good = pd.concat(list_df_images_in_seq)
len(df_img_good)
df_img_good.sample(n=3)

In [None]:
# save the selected images under the configured output directory
# (output_dir_path is '/home/yasiyu/yasiyu_temp', so the resulting file path is unchanged)
df_img_good.to_csv(os.path.join(output_dir_path, '0RobertLong_images_20180907.csv'), index=False)

In [None]:
# counts distinct deployment_id values among the selected images
len(df_img_good.deployment_id.unique())  # number of deployments selected, out of a total of 126, so pretty good representation

#### Copy out the images into a folder

Need to rename the images to the full_img_id

In [None]:
# spot-check one (source path, full_img_id) pair; raises IndexError if 14000 or fewer images were selected
img_selected_paths[14000]

In [None]:
# Copy every selected image into one flat folder, renamed to '<full_img_id>.jpg'.
# The folder lives under the configured output directory (same path as before).
dest_folder = os.path.join(output_dir_path, 'eMammal_20180907')
# shutil.copyfile does not create missing directories, so make sure the destination exists
os.makedirs(dest_folder, exist_ok=True)
for from_path, full_img_id in tqdm(img_selected_paths):
    dest_path = os.path.join(dest_folder, '{}.jpg'.format(full_img_id))
    shutil.copyfile(from_path, dest_path)

In [None]:
# zip the folder: writes <output_dir_path>/eMammal_20180907.zip, the file uploaded in the next
# cell, with the images stored under 'eMammal_20180907/' inside the archive
shutil.make_archive(
    os.path.join(output_dir_path, 'eMammal_20180907'),  # archive path without the '.zip' extension
    'zip',
    root_dir=output_dir_path,
    base_dir='eMammal_20180907',
)

In [None]:
# upload to blob storage - take a while
# The storage key is read from the environment so it never appears in the notebook.
key = os.environ["AZ_STORAGE_KEY"]
blob_service = BlobServiceClient(account_url='wildlifeblobssc.blob.core.windows.net', credential=key)
container_client = blob_service.get_container_client('yasiyutemp')

# stream the zip from disk into the container
with open('/home/yasiyu/yasiyu_temp/eMammal_20180907.zip', 'rb') as f:
    container_client.upload_blob(name='eMammal_20180907.zip', data=f)

### Number of daytime/nighttime images

In [None]:
# fraction (not a count) of the selected images that belong to nighttime sequences
night_frames_selected = sum(df_seq_good[df_seq_good.is_daytime == False].num_frames)
all_frames_selected = sum(df_seq_good.num_frames)
num_images_night = night_frames_selected / all_frames_selected
num_images_night

But if we take into account all images, including those of people walking dogs, the proportion of nighttime pictures is lower, which makes sense since animals only come out after dark on trails / in general.

In [None]:
# same fraction, computed over every sequence in the collection (empty and human ones included)
night_frames_all = sum(df_seq[df_seq.is_daytime == False].num_frames)
all_frames = sum(df_seq.num_frames)
num_images_night_tot = night_frames_all / all_frames
num_images_night_tot