In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')  # append this repo to PYTHONPATH

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

import pandas as pd
from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# Combine Sul Ross kitfox with bbox entries

Give the path to a JSON file where output from this script will be written to. You can then take this file to the .Net app for ingestion to the database.

In [4]:
path_to_output = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/sulross_kitfox_combined_megadb.json'  

**Name of the dataset**

In [5]:
dataset_name = 'sulross_kitfox'

## Step 0 - Add an entry to the `datasets` table

This dataset is already in the table with a few entries with human and vehicle bbox labels. We need to combine them with the newly available class data.

In [6]:
with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_batches_9_10_11/sulross_kitfox_w_batch_10_boxes.json') as f:
    existing_entries = json.load(f)

In [None]:
len(existing_entries)
existing_entries

## Step 1 - Prepare the `sequence` objects to insert into the database

### Step 1b - If you're starting from scratch...

In [9]:
label_folder = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Engagements/SulRoss/share_microsoft/KitFoxLabels'

# some folders were renamed, so using the API output to map to path in blob storage
api_output_path = '/Users/siyuyang/Source/temp_data/CameraTrap/engagements/SulRoss/20190619_kitfox/detector_output/detections_kitfox_20190620.json'

In [10]:
with open(api_output_path) as f:
    detection_res = json.load(f)

In [11]:
image_name_to_path = {}

for im in detection_res['images']:
    fn = os.path.basename(im['file'])
    image_name_to_path[fn] = im['file']

In [12]:
del detection_res

In [None]:
li = []
for csv_file in os.listdir(label_folder):
    print(csv_file)
    csv_path = os.path.join(label_folder, csv_file)
    df = pd.read_csv(csv_path, index_col=None, header=0)
    li.append(df)

timelapse_df = pd.concat(li, ignore_index=True)

In [14]:
len(timelapse_df)

3393908

In [16]:
timelapse_df.dtypes

File            object
RelativePath    object
Folder          object
Date            object
Time            object
ImageQuality    object
DeleteFlag        bool
County          object
Survey          object
Analyst         object
Notes           object
Publicity         bool
Empty             bool
Person            bool
Animal            bool
Species         object
species2        object
species3        object
Unnamed: 18     object
dtype: object

In [19]:
timelapse_df = timelapse_df.drop(columns='Unnamed: 18')

In [None]:
timelapse_df.sample(5)

In [24]:
entries_im_not_stored = []

for i_row, row in tqdm(timelapse_df.iterrows()):
    fn = row['File']
    if not fn in image_name_to_path:
        entries_im_not_stored.append(fn)

3393908it [07:32, 7503.91it/s]


In [25]:
len(entries_im_not_stored)

124

In [None]:
embedded = []  # list of images with all attributes at the image-level
unidentified_animal = []

for i_row, row in tqdm(timelapse_df.iterrows()):
    fn = row['File']
    path = image_name_to_path.get(fn, False)
    if path is False:
        continue
    
    # using the file name only to determine the seq_id, frame_num and location
    p = fn.split('-')
    seq_id = '-'.join(p[:-1])

    frame_num = int(fn.split('(')[1].split(')')[0])
    
    p = fn.split('__')
    location = p[0] + '__' + p[1]
    
    # other attributes from the csv columns
    date_time = row['Date'] + ' ' + row['Time']
    
    is_empty = row['Empty']
    has_person = row['Person']
    has_animal = row['Animal']
    
    raw_classes = []
    if not pd.isnull(row['Species']):
        if not(row['Species'] is None or row['Species'] == 'None'):
            raw_classes.append(row['Species'])
    if not pd.isnull(row['species2']):
        raw_classes.append(row['species2'])
    if not pd.isnull(row['species3']):
        raw_classes.append(row['species3'])
        
    animal_classes = []
    for a in raw_classes:
        animal_classes.append(a.lower().replace('_', ''))  # _skunk to skunk

# this happens - the one sample I looked had a person, so appending 'human' below  
#     if has_person and is_empty:
#         print(row)
#         break
  
# certain bird species are recorded but the Empty field still marked as True
#     if has_animal and is_empty:
#         print(row)
#         break
    
    if has_animal and len(animal_classes) == 0:
        unidentified_animal.append(row)
        animal_classes.append('unidentified')
    
    if has_person:
        animal_classes.append('human')
    
    if len(animal_classes) == 0:
        animal_classes = ['empty']
    else:
        animal_classes = list(set(animal_classes))
        
    embedded.append({
        'file': path,
        'seq_id': seq_id,
        'frame_num': frame_num,
        'location': location,
        'datetime': date_time,
        'class': animal_classes
    })

In [61]:
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to sulross_kitfox. Please make sure this is correct!
Making a deep copy of docs...



  0%|          | 0/3393784 [00:00<?, ?it/s][A
  2%|▏         | 79824/3393784 [00:00<00:04, 793629.56it/s][A


Putting 3393784 images into sequences...


  5%|▍         | 158572/3393784 [00:00<00:04, 790011.91it/s][A
  7%|▋         | 238150/3393784 [00:00<00:03, 791456.49it/s][A
  9%|▉         | 301941/3393784 [00:00<00:04, 753357.48it/s][A
 11%|█         | 367793/3393784 [00:00<00:04, 734387.67it/s][A
 13%|█▎        | 451151/3393784 [00:00<00:03, 751144.80it/s][A
 16%|█▌        | 538399/3393784 [00:00<00:03, 768449.88it/s][A
 18%|█▊        | 615771/3393784 [00:00<00:03, 769127.23it/s][A
 21%|██        | 697981/3393784 [00:00<00:03, 775027.24it/s][A
 23%|██▎       | 789308/3393784 [00:01<00:03, 788850.60it/s][A
 26%|██▌       | 878227/3393784 [00:01<00:03, 797963.13it/s][A
 28%|██▊       | 962848/3393784 [00:01<00:03, 801977.03it/s][A
 31%|███       | 1049265/3393784 [00:01<00:02, 806754.54it/s][A
 33%|███▎      | 1133775/3393784 [00:01<00:02, 797605.17it/s][A
 36%|███▌      | 1213904/3393784 [00:01<00:02, 795824.16it/s][A
 38%|███▊      | 1297376/3393784 [00:01<00:02, 798200.55it/s][A
 41%|████      | 1388643/3393784 [00

Number of sequences: 1020306
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'file', 'location', 'datetime', 'frame_num', 'class'}

img_level_properties
{'file', 'frame_num', 'class', 'datetime'}

image-level properties that really should be sequence-level
{'location'}

Finished processing sequences.
Example sequence items:

{'seq_id': 'Reeves002__Cam001__2019-02-19__14-21', 'dataset': 'sulross_kitfox', 'images': [{'file': 'Reeves002/Cam001/Reeves002__Cam001__2019-02-19__14-21-54(1).JPG', 'frame_num': 1, 'datetime': '19-Feb-19 14:21:54', 'class': ['empty']}, {'file': 'Reeves002/Cam001/Reeves002__Cam001__2019-02-19__14-21-59(2).JPG', 'frame_num': 2, 'datetime': '19-Feb-19 14:21:59', 'class': ['empty']}], 'location': 'Reeves002__Cam001'}

[{'seq_id': 'Hudspeth001__Cam010__2018-08-06__14-09', 'dataset': 'sulross_kitfox', 'images': [{'file': 'Hudspeth001/Cam010/Hudspeth001__Cam010__2018-08-06__14-09-02(1).JPG', 'frame_num

In [None]:
# clean up species a bit

for seq in sequences:
    for im in seq['images']:
        if im['class'][0] != 'empty':
            
            if 'none' in im['class']:
                im['class'] = [i for i in im['class'] if i != 'none']

In [69]:
species_present = set()
for seq in sequences:
    for im in seq['images']:
        if im['class'][0] != 'empty':
            species_present.update(im['class'])

In [None]:
species_present

In [76]:
# some entries has non-unique frame num - the image entries are duplicated in these

fixed_seqs = {}  # seq_id to seq entry

for seq in sequences:
    frame_numbers = [im['frame_num'] for im in seq['images']]
    if len(frame_numbers) != len(set(frame_numbers)):
        im_dict = {}
        for im in seq['images']:
            im_dict[im['file']] = im
        deduped_im = list(im_dict.values())
        seq['images'] = deduped_im

In [73]:
len(problem_seqs)

2297

In [75]:
'Night/JeffDavis001/Cam004_messeduptimes/JeffDavis001__Cam004__2018-05-03__16-06-44(3).JPG' == 'Night/JeffDavis001/Cam004_messeduptimes/JeffDavis001__Cam004__2018-05-03__16-06-44(3).JPG'

True

In [None]:
problem_seqs[100]

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [77]:
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.


## Add the few bounding box entries we had

In [80]:
im_file_to_bbox = {}
im_has_vehicle = []

for seq in existing_entries:
    for im in seq['images']:
        if 'vehicle' in im['class']:
            im_has_vehicle.append(im['file'])
            
        if 'bbox' in im:
            im_file_to_bbox[im['file']] = im['bbox']

In [81]:
for seq in sequences:
    for im in seq['images']:
        if im['file'] in im_has_vehicle:
            
            if im['class'][0] == 'empty':
                im['class'] = ['vehicle']
            else:
                if 'vehicle' not in im['class']:
                    im['class'].append('vehicle')
        if im['file'] in im_file_to_bbox:
            im['bbox'] = im_file_to_bbox[im['file']]

In [82]:
count_w_bbox = 0
for seq in sequences:
    for im in seq['images']:
        if 'bbox' in im:
            count_w_bbox += 1
count_w_bbox

39

In [None]:
# sample some sequences to make sure things look good

sample(sequences, 10)

## Step 4 - Save the `sequence` items to a file

You can now take the resulting JSON file to the .Net application for bulk insertion to the database:

In [84]:
with open(path_to_output, 'w') as f:
    json.dump(sequences, f)

You can check that the bounding box annotations and paths to images all survived by running the `visualization/visualize_megadb.py` using the above exported file.