In [1]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules before each execution so edits to the repo code are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')  # append this repo to PYTHONPATH

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

import pandas as pd
from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# Sul Ross 2019 spring

In [4]:
# Destination JSON file that the finished `sequence` items are written to at the end of the notebook.
path_to_output = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/sulross_2019_spring_megadb.json'  

**Name of the dataset**

In [5]:
# Passed to process_sequences() below, which stores it in each sequence's `dataset` field.
dataset_name = 'sulross_2019_spring'

## Step 0 - Add an entry to the `datasets` table

Done. Prefix is `Spring2019`. Note that for dataset `sulross_2018` there is no prefix in the dataset entry.

## Step 1 - Prepare the `sequence` objects to insert into the database

### Step 1b - If you're starting from scratch...

In [6]:
# Folder holding the label CSV files; its contents are read and concatenated below.
label_folder = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Engagements/SulRoss/share_microsoft/LionSpring2019Labels'

# some folders were renamed, so using the API output to map to path in blob storage
api_output_path = '/Users/siyuyang/Source/temp_data/CameraTrap/engagements/SulRoss/20191015_Spring2019/4092_detections_SulRossSpring2019_20191014211922.json'

In [7]:
# Load the detection API output. Only the `file` field of each entry in `images`
# is used afterwards, to map image file names to their storage paths.
# JSON is UTF-8 by specification, so do not rely on the platform's default encoding.
with open(api_output_path, encoding='utf-8') as f:
    detection_res = json.load(f)

In [9]:
# Map each image's bare file name to its path with the first path component
# (the `Spring2019` prefix) removed. If two entries share a file name, the
# later one wins.
image_name_to_path = {
    os.path.basename(entry['file']): entry['file'].partition('/')[2]
    for entry in detection_res['images']
}

In [11]:
# The parsed API output is not referenced again; release it.
del detection_res

In [None]:
# Read every label CSV in `label_folder` and stack them into a single DataFrame.
# Only visible *.csv files are read, so stray entries such as .DS_Store do not
# crash read_csv, and they are read in sorted order so the concatenated row
# order is the same on every run (os.listdir order is arbitrary).
csv_files = sorted(
    name for name in os.listdir(label_folder)
    if name.lower().endswith('.csv') and not name.startswith('.')
)
if not csv_files:
    raise FileNotFoundError('No CSV label files found in {}'.format(label_folder))

li = []
for csv_file in csv_files:
    print(csv_file)
    csv_path = os.path.join(label_folder, csv_file)
    df = pd.read_csv(csv_path, index_col=None, header=0)
    li.append(df)

timelapse_df = pd.concat(li, ignore_index=True)

In [14]:
# Row count and per-column dtypes of the combined label table.
# Both values are displayed because ast_node_interactivity is set to 'all'.
len(timelapse_df)
timelapse_df.dtypes

602438

File             object
RelativePath    float64
Folder           object
Date             object
Time             object
ImageQuality     object
DeleteFlag         bool
County           object
Survey           object
Analyst          object
Notes            object
Publicity          bool
Empty              bool
Person             bool
Animal             bool
Species          object
species2         object
species3        float64
Unnamed: 18     float64
dtype: object

In [15]:
# Drop the trailing 'Unnamed: 18' column. errors='ignore' keeps this cell
# re-runnable: without it, a second execution raises KeyError because the
# column has already been removed.
timelapse_df = timelapse_df.drop(columns='Unnamed: 18', errors='ignore')

In [None]:
# Eyeball a few random rows of the label table.
timelapse_df.sample(5)

In [17]:
# Collect the file names of label rows whose image does not appear in the
# detection output, i.e. has no entry in image_name_to_path.
# Only the `File` column is needed, so iterate over it directly instead of
# DataFrame.iterrows(), which builds a full Series for every row.
entries_im_not_stored = [
    fn for fn in tqdm(timelapse_df['File'])
    if fn not in image_name_to_path
]

len(entries_im_not_stored)

602438it [01:13, 8144.85it/s]


14

In [20]:
# Build one image-level entry per label row whose image has a known storage path.
#
# File names look like 'Spring2019__S2__2019-03-29__12-02-11(1).JPG', from which
#   seq_id    = everything before the last '-'   -> 'Spring2019__S2__2019-03-29__12-02'
#   frame_num = the integer inside the parentheses -> 1
#   location  = the text after the first '__', up to the next '_' -> 'S2'
embedded = []  # list of images with all attributes at the image-level
unidentified_animal = []  # rows flagged Animal that carry no species label
person_and_empty = []  # rows flagged both Person and Empty, kept for manual review

for i_row, row in tqdm(timelapse_df.iterrows(), total=len(timelapse_df)):
    fn = row['File']
    path = image_name_to_path.get(fn)
    if path is None:
        # the image for this label row is not in the detection output; skip it
        continue

    # using the file name only to determine the seq_id, frame_num and location
    seq_id = '-'.join(fn.split('-')[:-1])
    frame_num = int(fn.split('(')[1].split(')')[0])
    location = fn.split('__')[1].split('_')[0]

    # other attributes from the csv columns
    date_time = row['Date'] + ' ' + row['Time']

    is_empty = row['Empty']
    has_person = row['Person']
    has_animal = row['Animal']

    raw_classes = []
    species = row['Species']
    # a literal 'none' (any casing) in the Species column means no species was recorded
    if not pd.isnull(species) and species.lower() != 'none':
        raw_classes.append(species)
    for extra_column in ('species2', 'species3'):
        if not pd.isnull(row[extra_column]):
            raw_classes.append(row[extra_column])

    animal_classes = [a.lower().replace('_', '') for a in raw_classes]  # _skunk to skunk

    # Person and Empty are sometimes both set. The one sample inspected did
    # contain a person, so the row is labelled 'human' below. Record the row
    # for review instead of aborting the loop, which would silently drop
    # every remaining row from the dataset.
    if has_person and is_empty:
        person_and_empty.append(row)

    # Animal flagged without any species label (this also happens on rows
    # marked Empty); be conservative and note as unidentified
    if has_animal and len(animal_classes) == 0:
        unidentified_animal.append(row)
        animal_classes.append('unidentified')

    if has_person:
        animal_classes.append('human')

    if len(animal_classes) == 0:
        animal_classes = ['empty']
    else:
        # de-duplicate; sort so the saved class order is the same on every run
        animal_classes = sorted(set(animal_classes))

    embedded.append({
        'file': path,
        'seq_id': seq_id,
        'frame_num': frame_num,
        'location': location,
        'datetime': date_time,
        'class': animal_classes
    })

602438it [02:36, 3860.95it/s]


In [22]:
# Group the image-level entries into `sequence` items tagged with dataset_name.
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to sulross_2019_spring. Please make sure this is correct!
Making a deep copy of docs...


 25%|██▍       | 148172/602424 [00:00<00:00, 740771.52it/s]

Putting 602424 images into sequences...


100%|██████████| 602424/602424 [00:01<00:00, 484425.32it/s]


Number of sequences: 288158
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'frame_num', 'class', 'file', 'location', 'datetime'}

img_level_properties
{'file', 'class', 'datetime', 'frame_num'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{'seq_id': 'Spring2019__S1_BadTimes__1920-08-13__07-28', 'dataset': 'sulross_2019_spring', 'images': [{'file': 'S1_BadTimes/Spring2019__S1_BadTimes__1920-08-13__07-28-46(1).JPG', 'frame_num': 1, 'datetime': '13-Aug-1920 07:28:46', 'class': ['empty']}], 'location': 'S1'}

[{'seq_id': 'Spring2019__S2__2019-03-29__12-02', 'dataset': 'sulross_2019_spring', 'images': [{'file': 'S2/Spring2019__S2__2019-03-29__12-02-11(1).JPG', 'frame_num': 1, 'datetime': '29-Mar-2019 12:02:11', 'class': ['empty']}, {'file': 'S2/Spring2019__S2__2019-03-29__12-02-22(2).JPG', 'frame_num': 2, 'datetime': '29-Mar-2019 12:02:2

In [None]:
# Where a sequence contains repeated frame numbers, collapse image entries that
# share the same `file` value: one entry per file is kept (the last one seen),
# in order of each file's first appearance.
for sequence in sequences:
    frame_nums = [image['frame_num'] for image in sequence['images']]
    if len(set(frame_nums)) == len(frame_nums):
        continue
    unique_by_file = {image['file']: image for image in sequence['images']}
    sequence['images'] = list(unique_by_file.values())

In [26]:
# some entries has non-unique frame num - the image entries are duplicated in these
# Some sequences have non-unique frame numbers; collect them for inspection.
problem_seqs = [
    candidate
    for candidate in sequences
    if len({image['frame_num'] for image in candidate['images']}) != len(candidate['images'])
]

In [28]:
# Number of sequences that contain repeated frame numbers.
len(problem_seqs)

90

These sequences all appear to be labeled "empty" and contain frame numbers, the `(x)` in the file name, that are duplicated within the sequence. Exclude them.

In [30]:
# Keep only the sequences in which every image has a distinct frame number.
new_sequences = [
    kept
    for kept in sequences
    if len(kept['images']) == len({image['frame_num'] for image in kept['images']})
]

sequences = new_sequences

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [31]:
# Run the schema check on the sequence items; it prints the verification messages on success.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


In [None]:
# sample some sequences to make sure things look good
# (random.sample is not seeded here, so a different selection is shown on each run)

sample(sequences, 10)

In [33]:
# Gather every class label that occurs on an image whose first label is not 'empty'.
species_present = set()
for sequence in sequences:
    species_present.update(
        label
        for image in sequence['images']
        if image['class'][0] != 'empty'
        for label in image['class']
    )

In [None]:
# Display the set of class labels collected above.
species_present

## Step 4 - Save the `sequence` items to a file

In [35]:
# Serialise the finished sequence items to the output JSON file.
with open(path_to_output, 'w') as output_file:
    json.dump(sequences, output_file)