In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the CameraTraps repo and the ai4eutils helpers importable from this notebook.
for extra_path in (
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
):
    sys.path.append(extra_path)

In [20]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# rspb_gola_2020

This notebook is a template for how new datasets can be formatted for ingestion into the database.

The ideal dataset has both **location** and **sequence** information, in addition to any species or bounding box labels.

Specify the path of the JSON file that the output of this notebook will be written to. You can then take this file to the .NET app for ingestion into the database.

In [4]:
# Name of the dataset; also used to build the output JSON file names below.
dataset_name = 'rspb_gola_2020'

# Location of the images on the storage container.
container_root = '/mink_disk_0/camtraps/rspb_gola_2020'  
path_prefix = 'gola-labeled-20201116/Gola Darwin 2020 - humans removed'  # as they are on the container

# Local copy of the container contents; image paths used in this notebook are relative to this folder.
downloaded_dir = '/mink_disk_0/camtraps/rspb_gola_2020/Gola Darwin 2020 - humans removed/'  # AzCopied the container to data disk, with one fewer level of directory

# JSON files that the sequence items are written to.
path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' 
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' 

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

The labels are very neat, contained in four CSVs. 
- Some entries in the CSVs are not present in blob (probably the people images).
- The "RelativePath" column is the location. 
- Sequence info can be extracted from the file names
- There isn't a good identifier for each sequence, so the file name truncated to the minute (i.e. with the seconds dropped) is used as the sequence ID. This is an acceptable divider because the camera appears to restart its frame numbering from (1) in each new minute.

Images were AzCopied to the data disk first.

In [5]:
# Recursively list everything under the downloaded folder.
files_list = path_utils.recursive_file_list(downloaded_dir, convert_slashes=False)
len(files_list)

# Keep only the image files, stored as paths relative to downloaded_dir.
images_set = {
    full_path.split(downloaded_dir)[1]
    for full_path in files_list
    if path_utils.is_image_file(full_path)
}
len(images_set)

14331

14282

In [6]:
# Peek at a few relative image paths. A set has no defined order, so the
# entries shown here may differ between runs.
list(images_set)[1000:1010]

['Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D180600/DWCN22/D180600__DWCN22__2020-03-23__00-21-21(3).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115575/DWCN16/D115575__DWCN16__2020-03-13__23-29-44(2).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D180605/DWCN23/D180605__DWCN23__2020-03-04__20-07-43(5).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D170590/DWCN29/D170590__DWCN29__2020-02-27__14-20-35(15).JPG',
 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D115540/DWC N 19/D115540__DWC N 19__2020-05-21__05-19-46(45).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115570/DWCN15/D115570__DWCN15__2020-03-08__06-00-44(1).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D170585/DWCN20/D170585__DWCN20__2020-03-16__18-08-47(5).JPG',
 'Darwin_CamTrap_Sep_2020_Renamed_peopleremoved/D110535/DWC N 17/D110535__DWC N 17__2020-06-21__04-58-49(3).JPG',
 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D190595/DWCN25/D190595__DWCN25__2017-02-09__14-51-55(4).JPG',


In [7]:
# Load the CSV labels from RSPB

# os.listdir() returns entries in arbitrary order. Sort them so that the order
# in which the CSVs are read (and hence the row order of the combined label
# table built from this list) is the same on every run.
csv_paths = sorted(
    os.path.join(downloaded_dir, name)
    for name in os.listdir(downloaded_dir)
    if name.endswith('.csv')
)
len(csv_paths)

4

In [8]:
# Columns of the label CSVs that are loaded.
label_columns = [
    'File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',
    'DeleteFlag', 'Category', 'Event', 'SpeciesGroup', 'SpeciesName',
    'Count', 'Age', 'Sex', 'Obstruction',
]

# Read each CSV into its own frame and report its number of rows.
csv_dfs = []
for csv_path in csv_paths:
    labels_df = pd.read_csv(csv_path, usecols=label_columns)
    print(len(labels_df))
    csv_dfs.append(labels_df)

9357
4252
2730
3949


In [9]:
# Stack the per-file label tables into one frame with a fresh 0..n-1 index.
all_csv = pd.concat(csv_dfs, ignore_index=True)

In [10]:
# Sanity-check the combined table: column names and (rows, columns).
all_csv.columns
all_csv.shape

Index(['File', 'RelativePath', 'Folder', 'Date', 'Time', 'ImageQuality',
       'DeleteFlag', 'Category', 'Event', 'SpeciesGroup', 'SpeciesName',
       'Count', 'Age', 'Sex', 'Obstruction'],
      dtype='object')

(20288, 15)

In [11]:
# Show 3 randomly chosen rows of the combined label table.
all_csv.sample(3)

Unnamed: 0,File,RelativePath,Folder,Date,Time,ImageQuality,DeleteFlag,Category,Event,SpeciesGroup,SpeciesName,Count,Age,Sex,Obstruction
19924,D120605__DWC 07__2020-01-29__09-37-00(1).JPG,D120605\DWC 07,Darwin_CamTrap_Jun_2020_Renamed,29-Jan-2020,09:37:00,Ok,False,Deployment,1.0,,,0,,,
729,D105585__DWCN05__2020-03-12__07-44-23(3).JPG,D105585\DWCN05,Darwin_CamTrap_Feb_2020_Renamed,12-Mar-2020,07:44:23,Ok,False,Animal,1.0,Bovid,Duiker sp,1,,,
4891,D170585__DWCN20__2020-03-21__15-13-52(32).JPG,D170585\DWCN20,Darwin_CamTrap_Feb_2020_Renamed,21-Mar-2020,15:13:52,Ok,False,Animal,3.0,Bovid,Maxwell's duiker,1,,,


In [12]:
# Check how a missing SpeciesName is represented. pd.isna() is used instead of
# np.isnan() because the column holds strings as well as NaN, and np.isnan()
# raises a TypeError when the inspected cell is a string.
pd.isna(all_csv.loc[1, 'SpeciesName'])

True

In [13]:
# List every distinct value of the SpeciesName column, one per line.
for species_name in all_csv['SpeciesName'].unique():
    print(species_name)

nan
Unknown
Duiker sp
Squirrel sp
Mouse sp
Forest giant pouched rat
White-bellied pangolin
Fire-footed rope squirrel
Brush-tailed porcupine
Slender-tailed squirrel
Western tree hyrax
Genet sp
Marsh mongoose
Sooty mangabey
Other (describe in notes)
Lesser spot-nosed monkey
Red river hog
Maxwell's duiker
Bongo
White-breasted guineafowl
Latham's francolin
Campbell's monkey
Bushbuck
African civet
Common cusimanse
Mongoose sp
Black duiker
Western chimpanzee
African giant squirrel
Ornate monitor
Jentink's duiker
Bay duiker
Greater cane-rat (Marsh cane rat)
Ogilby's duiker
Nkulengu Rail
Honey badger
African palm civet
Johnston's genet
African forest buffalo
Ichneumon Mongoose
Timneh parrot
Crested guineafowl
Galago sp.


In [14]:
# List every distinct value of the SpeciesGroup column, one per line.
for species_group in all_csv['SpeciesGroup'].unique():
    print(species_group)

nan
Primate
Bovid
Unidentified
Squirrel
Non-squirrel rodent
Bird
Pangolin
Hyrax
Carnivore
Pig
Other
None
Antelope
Mustelid


In [38]:
# Convert every CSV row whose image exists on disk into a flat per-image entry.
embedded = []        # one dict per image that was found
seq_to_loc = {}      # seq_id -> location (the row's RelativePath)
images_missing = []  # paths listed in the CSVs but absent from images_set
num_images = 0       # number of entries appended to `embedded`

# iterrows() has no length, so pass total= to let tqdm show real progress.
for i_row, row in tqdm(all_csv.iterrows(), total=len(all_csv)):

    # The CSVs use backslashes as path separators; image paths use forward slashes.
    rel_path = row['RelativePath'].replace('\\', '/')

    file = os.path.join(row['Folder'] + '_peopleremoved', rel_path, row['File'])

    if file not in images_set:
        images_missing.append(file)
        continue

    # File names end in "-SS(<frame>).JPG": dropping the last 3 characters before
    # the "(" removes the seconds, so the sequence ID is the name up to the minute.
    seq_id = row['File'].split('(')[0][:-3]
    frame_num = int(row['File'].split('(')[1].split(')')[0])

    datetime_str = row['Date'] + ' ' + row['Time']

    # Class priority: 'empty' for the Empty category, then the species name,
    # then the species group. Missing CSV cells are NaN (not str), hence the
    # isinstance checks.
    # NOTE(review): SpeciesGroup appears to contain the literal string 'None'
    # (it is printed separately from nan in the unique values); a row with that
    # group and no SpeciesName gets the class 'none' - confirm this is intended.
    clss = None
    if isinstance(row['Category'], str) and row['Category'] == 'Empty':
        clss = 'empty'
    elif isinstance(row['SpeciesName'], str):
        clss = row['SpeciesName'].lower()
    elif isinstance(row['SpeciesGroup'], str):
        clss = row['SpeciesGroup'].lower()

    # Check before the string operations below; otherwise an unlabeled row
    # fails with an opaque AttributeError on None instead of this message.
    assert clss is not None, f'No class could be derived for CSV row {i_row} ({file})'

    # Normalize "other (describe in notes)" and drop parenthesized alternatives.
    if clss.startswith('other'):
        clss = 'other'
    if '(' in clss:
        clss = clss.split('(')[0].strip()

    embedded.append({
        'file': file,
        'class': [clss],  # only one species per image it seems
        'species_group': row['SpeciesGroup'].lower() if isinstance(row['SpeciesGroup'], str) else None,
        'frame_num': frame_num,
        'datetime': datetime_str,
        'count': row['Count'],
        'age': row['Age'],
        'sex': row['Sex'],
        'obstruction': row['Obstruction'],
        'location': rel_path,
        'seq_id': seq_id
    })
    seq_to_loc[seq_id] = rel_path
    num_images += 1

20288it [00:03, 6473.26it/s]


In [32]:
# Number of distinct sequence IDs among the images that were found.
len(seq_to_loc)

2509

In [33]:
# Number of distinct locations (unique RelativePath values among the images that were found).
len(set(seq_to_loc.values()))

75

In [34]:
# CSV rows whose image file was not found on disk, plus one example path.
len(images_missing)
images_missing[100]

# Number of CSV rows that were converted into image entries.
num_images

6006

'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D105580/DWCN19/D105580__DWCN19__2020-02-15__10-14-35(1).JPG'

14282

In [39]:
# Group the flat per-image entries into sequence items for this dataset
# (grouping logic lives in cct_to_megadb.process_sequences).
sequences = process_sequences(embedded, dataset_name)

The dataset_name is set to rspb_gola_2020. Please make sure this is correct!
Making a deep copy of docs...


100%|██████████| 14282/14282 [00:00<00:00, 983145.41it/s]

Putting 14282 images into sequences...
Number of sequences: 2509
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'count', 'location', 'datetime', 'class', 'species_group', 'age', 'sex', 'frame_num', 'obstruction', 'file'}

img_level_properties
{'count', 'datetime', 'class', 'species_group', 'age', 'frame_num', 'obstruction', 'file'}

image-level properties that really should be sequence-level
{'sex', 'location'}






Finished processing sequences.
Example sequence items:

{"dataset": "rspb_gola_2020", "seq_id": "D100595__DWCN08__2020-02-18__17-46", "location": "D100595/DWCN08", "images": [{"file": "Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(1).JPG", "class": ["unknown"], "species_group": "primate", "frame_num": 1, "datetime": "18-Feb-2020 17:46:03", "count": 2, "age": null, "obstruction": "Yes"}, {"file": "Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(2).JPG", "class": ["unknown"], "species_group": "primate", "frame_num": 2, "datetime": "18-Feb-2020 17:46:03", "count": 2, "age": null, "obstruction": "Yes"}, {"file": "Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D100595/DWCN08/D100595__DWCN08__2020-02-18__17-46-03(3).JPG", "class": ["unknown"], "species_group": "primate", "frame_num": 3, "datetime": "18-Feb-2020 17:46:03", "count": 2, "age": null, "obstruction": "Yes"}], "sex": null}

{"data

In [40]:
# Eyeball 10 randomly chosen sequence items. No random seed is set in this
# notebook, so the selection differs from run to run.
sample(sequences, 10)

[OrderedDict([('dataset', 'rspb_gola_2020'),
              ('seq_id', 'D115580__DWCN18__2020-03-13__00-49'),
              ('location', 'D115580/DWCN18'),
              ('images',
               [{'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115580/DWCN18/D115580__DWCN18__2020-03-13__00-49-38(1).JPG',
                 'class': ['empty'],
                 'species_group': None,
                 'frame_num': 1,
                 'datetime': '13-Mar-2020 00:49:38',
                 'count': 0,
                 'age': None,
                 'obstruction': None},
                {'file': 'Darwin_CamTrap_Feb_2020_Renamed_peopleremoved/D115580/DWCN18/D115580__DWCN18__2020-03-13__00-49-38(2).JPG',
                 'class': ['empty'],
                 'species_group': None,
                 'frame_num': 2,
                 'datetime': '13-Mar-2020 00:49:38',
                 'count': 0,
                 'age': None,
                 'obstruction': None},
                {'file': 'Darw

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [41]:
%%time

# Validate the sequence items; problems are reported as an error (see the notes above this cell).
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 1.1 s, sys: 3.66 ms, total: 1.11 s
Wall time: 1.11 s


In [42]:
# Write the sequence items to the temporary output file as UTF-8 JSON,
# leaving non-ASCII characters unescaped.
with open(path_to_output_temp, mode='w', encoding='utf-8') as out_file:
    json.dump(sequences, out_file, ensure_ascii=False, indent=1)

### Step 2b - copy images to flat folder

In [43]:
def copy_file(src_path, dst_path):
    """Copy the contents of one file to another path.

    Thin wrapper around shutil.copyfile so that it can be handed to a pool.

    Args:
        src_path: path of the file to copy.
        dst_path: full path (including file name) of the copy.

    Returns:
        Whatever shutil.copyfile returns, i.e. the destination path.
    """
    copied_to = copyfile(src_path, dst_path)
    return copied_to

In [51]:
%%time

# Flat folder that receives a copy of every non-empty image.
flat_dir = '/mink_disk_0/camtraps/imerit12b'

# Build (source, destination) pairs for all images not labeled 'empty'.
path_pairs = []
for seq in tqdm(sequences):
    seq_id = seq['seq_id']
    for im in seq['images']:

        # Empty images are not copied.
        if 'empty' in im['class']:
            continue

        src_path = os.path.join(downloaded_dir, im['file'])
        assert os.path.exists(src_path), src_path

        frame = im['frame_num']
        dst_path = os.path.join(flat_dir, f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

# The flat names are built only from seq_id and frame_num. If two images of a
# sequence shared a frame number they would map to the same destination and
# one copy would silently overwrite the other, so fail loudly instead.
dst_counts = Counter(dst for _, dst in path_pairs)
colliding = [dst for dst, n_uses in dst_counts.items() if n_uses > 1]
assert not colliding, f'{len(colliding)} destination paths are not unique, e.g. {colliding[0]}'

100%|██████████| 2509/2509 [00:00<00:00, 25779.33it/s]

CPU times: user 78 ms, sys: 24.5 ms, total: 102 ms
Wall time: 100 ms





In [52]:
len(path_pairs)  # non-empty images out of total of 14282 (90%)
path_pairs[10000]  # spot-check one (source, destination) pair

12985

('/mink_disk_0/camtraps/rspb_gola_2020/Gola Darwin 2020 - humans removed/Darwin_CamTrap_Dec_2019_Renamed_peopleremoved/D145580/N4/D145580__N4__2019-12-17__09-58-47(3).JPG',
 '/mink_disk_0/camtraps/imerit12b/rspb_gola_2020.seqD145580__N4__2019-12-17__09-58.frame3.jpg')

In [53]:
%%time

# copyfile() does not create missing folders, so make sure every destination
# folder exists before the worker threads start copying.
for dst_dir in {os.path.dirname(dst) for _, dst in path_pairs}:
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

# Copy the files with 8 worker threads; starmap unpacks each (src, dst) pair
# into the two arguments of copy_file and collects its return values.
with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 23.2 s, sys: 48.7 s, total: 1min 11s
Wall time: 34.5 s
