In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Echo every bare expression in a cell, not only the final one.
# Cells below that end with several bare expressions rely on this to show all of them.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Pick up edits to imported modules without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import sys

# Make this repo (CameraTraps) and the ai4eutils helpers importable from the notebook.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [5]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# amapa_1819

In [10]:
# Name of this dataset; it is also the folder name on the data disk and the
# stem of the output JSON files, so everything below is derived from it.
dataset_name = 'amapa_1819'

# Local copy of the image container. Keep the trailing slash: relative image
# paths are obtained by splitting full paths on this exact prefix.
container_root = f'/mink_disk_0/camtraps/{dataset_name}/'

# Folder holding the MegaDB JSON outputs (final and temporary).
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

The labels are very neat, contained in four CSVs. 
- Some entries in the CSVs are not present in blob (probably the people images).
- The "RelativePath" column is the location. 
- Sequence info can be extracted from the file names
- There isn't a good identifier for each sequence, but the timestamp up to the minute is an okay divider between sequences. It seems that the camera would start numbering from (1) again in a new minute.

Images were AzCopied to the data disk first.

In [11]:
# Enumerate every file under the container root (convert_slashes=False).
files_list = path_utils.recursive_file_list(container_root, convert_slashes=False)
len(files_list)

# Keep only the image files, stored as paths relative to the container root.
images_set = {
    full_path.split(container_root)[1]
    for full_path in files_list
    if path_utils.is_image_file(full_path)
}
len(images_set)

24221

19248

In [12]:
# Peek at ten of the relative image paths (a set has no meaningful order).
list(images_set)[1000:1010]

['Cameras_Praias_2019_Microsoft/2012P13/Site2012P13-11.09/VID_0469_11090469.JPG',
 'Cameras_Praias_2019_Microsoft/2017P2/Site2017P2-10.27/VID_0029_EK000029.JPG',
 'Cameras_Praias_2018/C2017P7_escondida/Site2017P7_escondida-10.19/VID_0748_10190002.JPG',
 'Cameras_Praias_2018/CBF27/SiteBF27-11.03/VID_0505_11030305.JPG',
 'Cameras_Praias_2018/C2012P032/Site2012P032-10.04/VID_0555_10040195.JPG',
 'Cameras_Praias_2018/C2017P2/Site2017P2-11.09/VID_0026_EK000026.JPG',
 'Cameras_Praias_2019_Microsoft/2012P16/Site2012P16-11.23/VID_2380_11230862.JPG',
 'Cameras_Praias_2018/C2012P16/Site2012P16-10.07/VID_0552_10070552.JPG',
 'Cameras_Praias_2018/C2012P181/Site2012P181-10.25/VID_0369_10250014.JPG',
 'Cameras_Praias_2018/C2012P031/Site2012P031-11.02/VID_0398_11020130.JPG']

In [14]:
# Paths to the label CSVs from RSPB, given relative to the container root.
csv_paths = [
    os.path.join(container_root, csv_rel_path)
    for csv_rel_path in (
        'Cameras_Praias_2018/Cameras_Praias_2018_Microsoft/Timelapse_cameras_praias2018.csv',
        'Cameras_Praias_2019_Microsoft/Timelapse_cameras_praias2019.csv',
    )
]

In [16]:
# Read each label CSV into its own DataFrame, printing its row count along the way.
csv_dfs = []
for p in csv_paths:
    csv = pd.read_csv(p)
    csv_dfs += [csv]
    print(csv.shape[0])

14736
11102


In [27]:
# Stack the label tables into a single frame with a fresh 0..n-1 index.
all_csv = pd.concat(csv_dfs, ignore_index=True)
all_csv.shape

# filter to non-video files
is_still_image = all_csv['video'] == False
all_csv = all_csv.loc[is_still_image]
all_csv.shape

(25838, 26)

(19221, 26)

In [28]:
# Columns available in the combined label table.
all_csv.columns

Index(['File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',
       'DeleteFlag', 'video', 'analyst', 'mammal', 'bird', 'reptile', 'people',
       'boat', 'individuals', 'youngpresent', 'species1', 'species2',
       'behavior1', 'behavior2', 'behavior3', 'behaviorobs', 'observation',
       'publicity', 'completed', 'Unnamed: 25'],
      dtype='object')

In [29]:
# Eyeball three random rows of the label table.
all_csv.sample(3)

Unnamed: 0,File,RelativePath,Folder,Date,Time,ImageQuality,DeleteFlag,video,analyst,mammal,...,species1,species2,behavior1,behavior2,behavior3,behaviorobs,observation,publicity,completed,Unnamed: 25
11844,VID_0054_10190054.JPG,C2017P4\Site2017P4-10.19,Cameras_Praias_2018,19-Oct-2018,07:59:02,Ok,False,False,FMichalski,False,...,People,,,,,,,False,True,
21447,VID_3098_11240581.JPG,2012P16\Site2012P16-11.24,Cameras_Praias_2019,24-Nov-2019,23:23:49,Ok,False,False,FMichalski,False,...,NI,,,,,,,False,True,
24275,VID_0189_11020189.JPG,2012P27\Site2012P27-11.02,Cameras_Praias_2019,02-Nov-2019,14:59:13,Ok,False,False,FMichalski,False,...,,,,,,,,False,True,


In [25]:
# Check the type of a 'video' cell (row position 6630 is an arbitrary pick);
# the non-video filter compares this column against False.
type(all_csv.iloc[6630]['video'])

numpy.bool_

In [30]:
# List every distinct value of the primary species label, one per line.
for i in all_csv['species1'].unique():
    print(i)

People
nan
Small lizard
Crax alector
Molothrus sp.
Small bird
Monasa atra
Iguana iguana
Hydrochoerus hydrochaeris
Ameiva sp.
Cuniculus paca
Urubitinga urubitinga
Tayassu pecari
Leptotila sp.
Mesembrinibis cayennensis
Cathartes sp.
Molothrus oryzivorus
Podocnemis unifilis
Psarocolius decumanus
Frog
Proechimys guyannensis
Cathartes melambrotus
Geotrygon sp.
Leopardus wiedii
Cochlearius cochlearius
Leopardus pardalis
Leptotila verreauxi
Myrmecophaga tridactyla
Insect
Lontra longicaudis
Ardea cocoi
Anas platyrhynchos domesticus
Canis lupus familiaris
Didelphis marsupialis
Coragyps atratus
Tupinambis teguixin
Sarcoramphus papa
NI
Crotophaga ani
Eira barbara
Calidris sp.
Phalacrocorax brasilianus
Pilherodius pileatus
Mazama americana
Panthera onca
Puma concolor
Cacicus cela
Tigrisoma lineatum
Tapirus terrestris
Butorides striata
Galictis vittata
Speothos venaticus
Dasyprocta leporina


In [31]:
# List every distinct value of the secondary species label, one per line.
for i in all_csv['species2'].unique():
    print(i)

nan
Small bird
Anas platyrhynchos domesticus
People
Ameiva sp.
Coragyps atratus
Canis lupus familiaris
Frog


### Finding the sequences
The sequences seem to be indicated by when there is a video: a video file breaks the running number in the image file names. The entire RelativePath also seems to be a good proxy for a sequence, even though it may contain different sequences at the same location (sometimes the camera's field of view changes somewhat).

In [50]:
locations = set()
embedded = []
num_images = 0

missing_images = []

seq_id = 0
num_last = 0
frame_counter = 1
source_last = None  # (folder, location) of the previous image row; None before the first one

# Build one flat entry per image row. Rows have to be visited in CSV order for this to
# work: sequence boundaries are inferred from the running number in the file name.
for i_row, row in tqdm(all_csv.iterrows(), total=len(all_csv)):
    if not path_utils.is_image_file(row['File']):
        continue
    num_images += 1
    frame_counter += 1

    fn = row['File']
    rel_path = row['RelativePath'].replace('\\', '/')  # the CSV uses Windows separators
    location = rel_path.split('/')[0]
    folder = row['Folder']
    if folder == 'Cameras_Praias_2019':
        # the folder on disk carries a '_Microsoft' suffix that the CSV omits
        folder = 'Cameras_Praias_2019_Microsoft'
    im_path = os.path.join(folder, rel_path, fn)

    # figuring out where a new sequence starts:
    # - the first image, or a change of folder / location, always starts one, so a
    #   sequence can never span two cameras even if their file numbers line up
    # - otherwise, new sequence if a video file breaks the numbering of file names
    num = int(fn.split('_')[1])
    source = (folder, location)
    if source != source_last or num > num_last + 1 or num < num_last:  # could wrap around
        # new sequence
        seq_id += 1
        frame_counter = 1
    num_last = num
    source_last = source

    # The sequence state above is updated even for rows whose image is missing, so a
    # missing file leaves a gap in frame_num rather than shifting later frames.
    if im_path not in images_set:
        missing_images.append(im_path)
        continue

    datetime = row['Date'] + ' ' + row['Time']

    classes = []
    if row['boat'] == True:
        classes.append('boat')

    # species cells are NaN (a float) when unlabeled, hence the isinstance checks
    if isinstance(row['species1'], str) and row['species1'] != '':
        classes.append(row['species1'].lower())

    if isinstance(row['species2'], str) and row['species2'] != '':
        classes.append(row['species2'].lower())

    if len(classes) == 0:
        # no boat and no species label: the row must not report any individuals
        assert row['individuals'] == 0, f'unlabeled image with individuals > 0: {im_path}'
        classes.append('empty')

    embedded.append({
        'file': im_path,
        'class': classes,
        'frame_num': frame_counter,
        'datetime': datetime,
        'count': row['individuals'],
        'analyst': row['analyst'],
        'observation': row['observation'] if isinstance(row['observation'], str) else None,

        'location': location,
        'seq_id': seq_id
    })
    locations.add(location)

19221it [00:03, 5906.19it/s]


In [51]:
# Image rows processed, followed by the last sequence id assigned (= number of sequences).
num_images
seq_id # about 3.46 images per sequence

19221

5552

In [52]:
# number of distinct locations (a location is the first component of RelativePath)
len(locations)

42

In [53]:
# Number of CSV image rows whose file was not found under the container root.
len(missing_images)

0

In [54]:
# Group the flat per-image entries into sequence items using the cct_to_megadb helper.
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to amapa_1819. Please make sure this is correct!
Making a deep copy of docs...


100%|██████████| 19221/19221 [00:00<00:00, 858696.46it/s]

Putting 19221 images into sequences...
Number of sequences: 5552
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'frame_num', 'observation', 'analyst', 'count', 'file', 'class', 'location', 'datetime'}

img_level_properties
{'frame_num', 'observation', 'count', 'file', 'class', 'datetime'}

image-level properties that really should be sequence-level
{'location', 'analyst'}

! Sequence-level property analyst with value FMichalski should be a dataset-level property. Removed from sequences.
Finished processing sequences.
Example sequence items:

{"dataset": "amapa_1819", "seq_id": "1", "location": "C2012P031", "images": [{"file": "Cameras_Praias_2018/C2012P031/Site2012P031-09.30/VID_0103_09300001.JPG", "class": ["people"], "frame_num": 1, "datetime": "30-Sep-2018 10:40:49", "count": 1, "observation": null}, {"file": "Cameras_Praias_2018/C2012P031/Site2012P031-09.30/VID_0104_09300002.JPG", "class": ["people"], "frame_num"




In [56]:
# Spot-check three random sequence items.
sample(sequences, 3)

[OrderedDict([('dataset', 'amapa_1819'),
              ('seq_id', '2069'),
              ('location', 'C2012P181'),
              ('images',
               [{'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0719_12040141.JPG',
                 'class': ['anas platyrhynchos domesticus'],
                 'frame_num': 1,
                 'datetime': '04-Dec-2018 13:53:49',
                 'count': 1,
                 'observation': None},
                {'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0720_12040142.JPG',
                 'class': ['anas platyrhynchos domesticus'],
                 'frame_num': 2,
                 'datetime': '04-Dec-2018 13:53:50',
                 'count': 1,
                 'observation': None},
                {'file': 'Cameras_Praias_2018/C2012P181/Site2012P181-12.04/VID_0721_12040143.JPG',
                 'class': ['empty'],
                 'frame_num': 3,
                 'datetime': '04-Dec-2018 13:53:51',
         

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [57]:
%%time

# Run the MegaDB checks on the sequence items; two "Verified ..." lines are printed on success.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 1.64 s, sys: 0 ns, total: 1.64 s
Wall time: 1.65 s


In [58]:
# Write the sequences to the *temporary* output path.
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(path_to_output_temp, 'w', encoding='utf-8') as f:
    json.dump(sequences, f, indent=1, ensure_ascii=False)

### Step 2b - copy images to flat folder

In [59]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path` and return what `shutil.copyfile` returns.

    Used as the per-item worker when the image copies are farmed out to a thread pool.
    """
    copied_to = copyfile(src_path, dst_path)
    return copied_to

In [60]:
%%time

path_pairs = []
for seq in tqdm(sequences):
    seq_id = seq['seq_id']
    for im in seq['images']:
        
        if 'empty' not in im['class']:
            
            src_path = os.path.join(container_root, im['file'])
            assert os.path.exists(src_path), src_path
            frame = im['frame_num']
            dst_path = os.path.join('/mink_disk_0/camtraps/imerit12b', 
                                    f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
            path_pairs.append((src_path, dst_path))

100%|██████████| 5552/5552 [00:00<00:00, 6431.67it/s]

CPU times: user 90.2 ms, sys: 41.9 ms, total: 132 ms
Wall time: 866 ms





In [62]:
len(path_pairs)  # non-empty images out of total of 19221 (38%)
# Inspect one (source, destination) pair near the end of the list.
path_pairs[-100]

7364

('/mink_disk_0/camtraps/amapa_1819/Cameras_Praias_2019_Microsoft/BF52/SiteBF52-12.06/VID_0155_EK000002.JPG',
 '/mink_disk_0/camtraps/imerit12b/amapa_1819.seq5513.frame155.jpg')

In [63]:
%%time

# Copy with 8 worker threads. starmap unpacks each (src, dst) pair into copy_file's
# two arguments and returns the results in the same order as path_pairs.
with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 8.59 s, sys: 21.5 s, total: 30 s
Wall time: 1min 29s


In [64]:
# One returned destination per copy; should equal len(path_pairs).
len(dst_paths)

7364