In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/Users/siyuyang/Source/repos/GitHub_MSFT/CameraTraps')  # append this repo to PYTHONPATH

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
import math

import pandas as pd
from tqdm import tqdm

from data_management.megadb.schema import sequences_schema_check
from data_management.annotations.add_bounding_boxes_to_megadb import *
from data_management.megadb.converters.cct_to_megadb import make_cct_embedded, process_sequences, write_json

# IDFG SWWLF 2019

#### Notes on swwlf2019_pic_dat.csv from Sarah
 
- We are converting data so that there is a row for every image, and if the image has more than 1 species, there is more than 1 row. If the image has no animals/humans, then the species is labeled “none” and there is just that 1 row.
 
- In cases where there was a human, riding a horse, with a dog and cattle in 1 image (e.g.) – our labels get messy and are not always perfect. Anything more than 2 species and our system falls apart a little. This is incredibly rare with actual wildlife, so we weren’t super concerned, but definitely combinations of human-pets-livestock are a bit less tidy.
 
Fields:

File: unique id that matches the name of the image

Opstate: refers to the status of the camera. Normal = functioning as expected J. Once a camera is noted as “severely misdirected”, we often stopped labeling images. “maintenance” is usually set up and take down. These are often odd, close up images of humans. If I were you, I’d only look at images where OpState = “normal”.

Date

Time

Pic__CamID = camera id.

Trigger mode: M = Motion, T = Time, others are usually errors (C and U, I think, appear when a camera is in an error mode).

NearFar refers to if the animal was close or far from the camera (far animals aren’t counted). Again, I would possibly focus on “Near” instances as Far are often not captured by a motion trigger, and might be sort of rare in normal image sets.

Species = the species observed in the image

Count = number of individuals of that species.

Location relative to SWWLF – should give a folder location within the SWWLF2019 folder.

In [11]:
path_to_output = '/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/idfg_swwlf_2019_megadb.json'  

**Name of the dataset**

In [7]:
dataset_name = 'idfg_swwlf_2019'

## Step 0 - Add an entry to the `datasets` table

done

## Step 1 - Prepare the `sequence` objects to insert into the database

### Step 1b - If you're starting from scratch...

In [9]:
csv_path = '/Users/siyuyang/Source/temp_data/CameraTrap/engagements/IDFG/swwlf2019_pic_dat.csv'

In [12]:
timelapse_df = pd.read_csv(csv_path, index_col=0, header=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [14]:
len(timelapse_df)

11717080

In [18]:
timelapse_df.dtypes

File                            object
OpState                         object
Processor                       object
Date                            object
Time                            object
posix_date_time                 object
Pic_CamID                       object
TriggerMode                     object
Animal                            bool
Livestock                         bool
Human                             bool
Empty                             bool
YoungPresent                      bool
MarkedAnimal                      bool
IllnessUnusual                    bool
Comments                        object
NearFar                         object
Species                         object
Count                          float64
SPcomment                       object
Project                         object
Location_Relative_SWWLF2019     object
dtype: object

In [19]:
timelapse_df['Project'].unique()

array(['SWWLF2019'], dtype=object)

In [20]:
timelapse_df = timelapse_df.drop(columns=['File', 'Processor', 'Date', 'Time', 'SPcomment', 'Project'])

In [None]:
timelapse_df.sample(5)

In [23]:
timelapse_df['OpState'].unique()

array(['maintenance', 'normal', 'malfunction', 'minorly misdirected',
       'completely obscured', 'severely misdirected',
       'partially obscured'], dtype=object)

In [24]:
timelapse_df['TriggerMode'].unique()

array(['M', 'T', 'U', 'C'], dtype=object)

In [26]:
one_cam = timelapse_df[timelapse_df['Pic_CamID'] == 'IDFG2637']

In [None]:
len(one_cam)
one_cam.sample(10)  # R6/GMU60/F_2139/IDFG2637_20190615_125437_MD_1.JPG
# looks like the path prefix/folder is the same for the same camera, so cam ID is location ID

### Compile images and sequences

In [None]:
%%time
# 55 minutes to run this

# need to consolidate rows into image entries

im_to_classes = defaultdict(list)
im_to_count = defaultdict(float)  # we sum the count across diff species on each image
im_has_young = []  # to be made a set
all_species = set()

other_attributes = {}  # attributes that do not need to reconcile among rows

for i_row, row in tqdm(timelapse_df.iterrows()):
    im_path = row['Location_Relative_SWWLF2019']
    
    im_to_count[im_path] += row['Count']
    
    row_species = row['Species']
    if row_species != 'none':
        im_to_classes[im_path].append(row['Species'])
        all_species.add(row['Species'])
    
    if row['YoungPresent'] is True:
        im_has_young.append(im_path)  # mark as true if any species has young present
    
    if im_path not in other_attributes:
        other_attributes[im_path] = {
            'datetime': row['posix_date_time'],
            'location': row['Pic_CamID'],
            'trigger': row['TriggerMode'],
            'op_state': row['OpState']
        }
    
#     if i_row > 10000:
#         break

im_has_young = set(im_has_young)

In [50]:
len(other_attributes)  # num of images
len(im_to_classes)
len(im_to_count)
len(all_species)

11686098

1161781

11686098

43


Motion-triggered have sequence number and are named as
```
R2/GMU10A/F_653/IDFG2426_20190627_103229_MD_3.JPG
```
where the number after `MD_` is the sequence number

Time-triggered are named as
```
R2/GMU10A/F_653/IDFG2426_20190628_231000_TL_0.JPG
```

In [64]:
im_to_count_int = {}

for im_path, animal_count in im_to_count.items():
    if not math.isnan(animal_count):
        im_to_count_int[im_path] = int(animal_count)

In [68]:
del im_to_count

In [72]:
%%time

#c = 0

for im_path, im_attributes in tqdm(other_attributes.items()):
    p = os.path.basename(im_path).split('_')
    supposed_trigger_type = p[-2]  # MD or TL
    seq_id = '_'.join(im_path.split('_')[:-2])
    
    if supposed_trigger_type == 'TL':
        seq_id = 'dummy_' + seq_id
        frame_num = None
    else:
        frame_num = p[-1].split('.')[0]
        if frame_num.endswith('b'):
            frame_num = None
        else:
            frame_num = int(frame_num)
        #print(frame_num)
    
    im_classes = im_to_classes.get(im_path, None)
    if im_classes is None:
        if im_attributes['op_state'] == 'severely misdirected':
            im_classes = ['__label_unavailable']
        else:
            im_classes = ['empty']
    else:
        im_classes = list(set(im_classes))
    
    count = im_to_count_int.get(im_path, None)

#     print(im_path)
#     print(supposed_trigger_type)
#     print(seq_id)
#     print(im_classes)
#     print(count)
#     print()
#     c += 1
#     if c > 1000:
#         break

    im_attributes['file'] = im_path
    
    im_attributes['seq_id'] = seq_id
    if frame_num is not None:
        im_attributes['frame_num'] = frame_num
    
    im_attributes['class'] = im_classes

    if count is not None:
        im_attributes['count'] = count

100%|██████████| 11686098/11686098 [01:12<00:00, 161043.62it/s]

CPU times: user 1min 4s, sys: 6.4 s, total: 1min 11s
Wall time: 1min 12s





In [73]:
embedded = list(other_attributes.values())

In [74]:
del other_attributes

In [75]:
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to idfg_swwlf_2019. Please make sure this is correct!
Making a deep copy of docs...


  0%|          | 26218/11686098 [00:00<00:44, 262109.63it/s]

Putting 11686098 images into sequences...


100%|██████████| 11686098/11686098 [02:32<00:00, 76750.68it/s]


Number of sequences: 10959983
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'frame_num', 'trigger', 'class', 'datetime', 'op_state', 'file', 'count', 'location'}

img_level_properties
{'frame_num', 'class', 'datetime', 'op_state', 'file', 'count'}

image-level properties that really should be sequence-level
{'trigger', 'location'}

Finished processing sequences.
Example sequence items:

{'seq_id': 'R7/GMU21A/S_91/IDFG0035_20190606_082306', 'dataset': 'idfg_swwlf_2019', 'images': [{'datetime': '2019-06-06 08:23:06', 'op_state': 'maintenance', 'file': 'R7/GMU21A/S_91/IDFG0035_20190606_082306_MD_1.JPG', 'frame_num': 1, 'class': ['human'], 'count': 0}], 'trigger': 'M', 'location': 'IDFG0035'}

[{'seq_id': 'dummy_R7/GMU36/F_1953/IDFG2348_20190903_212000', 'dataset': 'idfg_swwlf_2019', 'images': [{'datetime': '2019-09-03 21:20:00', 'op_state': 'normal', 'file': 'R7/GMU36/F_1953/IDFG2348_20190903_212000_TL_0.JPG', 'class':

In [4]:
# re-start 

with open('/Users/siyuyang/OneDrive - Microsoft/AI4Earth/CameraTrap/Databases/megadb_2020/temp_idfg_swwlf_2019_megadb.json') as f:
    sequences = json.load(f)

In [5]:
species_present = set()
for seq in sequences:
    for im in seq['images']:
        if im['class'][0] != 'empty':
            species_present.update(im['class'])

In [8]:
# drop images with complicated frame_num (1b, 2b)
dropped_images = []

for seq in sequences:
    
    if len(seq['images']) == 1:
        continue
        
    if seq['trigger'] == 'T':
        dropped_images.append(seq['images'][1])
    
    seq['images'] = [seq['images'][0]]

In [9]:
len(dropped_images)

14752

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [10]:
%%time

sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 23min 20s, sys: 55.5 s, total: 24min 15s
Wall time: 25min 25s


## Step 4 - Save the `sequence` items to a file

You can now take the resulting JSON file to the .Net application for bulk insertion to the database:

In [12]:
with open(path_to_output, 'w') as f:
    json.dump(sequences, f)