In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the project code importable from this notebook.
extra_import_dirs = [
    '/home/mink/notebooks/CameraTraps',  # append this repo to PYTHONPATH
    '/home/mink/lib/ai4eutils',
]
sys.path.extend(extra_import_dirs)

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# uw_gardner, folder Processed Images, bird images

In [4]:
# Name of the dataset; also used to name the output JSON files.
dataset_name = 'uw_gardner'

# Root folder under which the dataset's images are found.
container_root = '/mink_disk_0/camtraps/uw-gardner/'
# no path prefix - this dataset was added to the datasets table previously

# Both output files live in the same folder.
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

The species and location of each image are provided in a CSV (all bird entries, no humans and almost no vehicles), but there is no sequence information.

Only 3430 images are both in the CSV and in the "Processed Images" folder.

In [5]:
# Label CSV, stored inside the "Processed Images" folder under container_root.
csv_path = os.path.join(container_root, 'Processed Images/Bassing_BirdImages_2021-01-25.csv')

In [6]:
# Load only the columns that are used to build the sequence items.
csv_columns = ['File', 'RelativePath', 'Folder', 'DateTime', 'CameraLocation', 'Count', 'Species']
df = pd.read_csv(csv_path, usecols=csv_columns)

In [7]:
# Number of rows in the CSV, followed by two randomly drawn rows as a spot check.
len(df)
df.sample(2)

39069

Unnamed: 0,File,RelativePath,Folder,DateTime,CameraLocation,Species,Count
24413,RCNX6667.JPG,Camera_1\C116_5.29.19\DCIM\100RECNX,NE6248,2018-11-12 10:47:05,NE6248_1,Turkey,1
24823,RCNX8234.JPG,Camera_1\C116_5.29.19\DCIM\100RECNX,NE6248,2019-05-05 08:58:09,NE6248_1,Turkey,1


In [8]:
# Distinct labels present in the CSV.
df['Species'].unique()  # spp = multiple species

array(['Grouse Spp', 'Common Raven', 'Turkey', 'Bird Spp', 'Raptor Spp'],
      dtype=object)

In [9]:
folder = os.path.join(container_root, 'Processed Images')

# Everything found under the folder, images or not.
paths = path_utils.recursive_file_list(folder)
len(paths)

# Keep image files only, skip hidden files (names starting with '.'),
# and express each path relative to container_root.
paths = sorted(
    p.split(container_root)[1]
    for p in paths
    if path_utils.is_image_file(p) and not os.path.basename(p).startswith('.')
)
len(paths)

247797

247752

In [10]:
# Spot check: entries are relative to container_root.
paths[10]

'Processed Images/NE1483_complete/Camera_32/C27_7.27.18/DCIM/100RECNX/RCNX0011.JPG'

In [11]:
# Image path of every CSV row, relative to container_root so that it is directly
# comparable with the entries of `paths`. The CSV's Folder/RelativePath columns start
# below the 'Processed Images' folder and use Windows separators, so the folder name
# is prepended and backslashes are converted to forward slashes.
files_in_csv = [
    os.path.join('Processed Images', folder_name, relative_dir.replace('\\', '/'), file_name)
    for folder_name, relative_dir, file_name in tqdm(
        zip(df['Folder'], df['RelativePath'], df['File']), total=len(df)
    )
]

39069it [00:04, 8647.15it/s]


In [12]:
# Image paths found on disk, as a set for the set intersection below.
paths_set = set(paths)

In [13]:
# Image paths listed in the CSV, as a set (duplicates collapse).
csv_set = set(files_in_csv)

In [14]:
# Images that are both present on disk and listed in the CSV.
intersection = paths_set & csv_set

In [15]:
# Number of images that are both on disk and in the CSV.
len(intersection)

3430

In [17]:
# Spot check the path format derived from the CSV.
files_in_csv[0]

'Processed Images/NE1483_complete/Camera_32/C80_10.4.18/DCIM/100RECNX/RCNX0611.JPG'

In [18]:
# Spot check the path format found on disk, for comparison with the CSV-derived paths.
paths[0]

'Processed Images/NE1483_complete/Camera_32/C27_7.27.18/DCIM/100RECNX/RCNX0001.JPG'

In [33]:
sequences = []
locations = set()

# The CSV has no sequence information, so every image becomes its own one-frame sequence.
for _, record in tqdm(df.iterrows()):
    relative_dir = record['RelativePath']
    image_file = os.path.join(
        'Processed Images', record['Folder'], relative_dir.replace('\\', '/'), record['File']
    )

    # Only rows whose image is both on disk and in the CSV are turned into sequences.
    if image_file in intersection:
        species = record['Species'].lower()

        # Flatten the image location into an id: path separators become '_', dots become '-'.
        id_parts = [record['Folder'], relative_dir.replace('\\', '_'), record['File'].split('.')[0]]
        flat_id = '_'.join(id_parts).replace('.', '-')
        assert '/' not in flat_id, record

        camera_location = record['CameraLocation']
        locations.add(camera_location)

        image_entry = {
            'file': image_file,
            'frame_num': 1,  # only one image, but easier for ingesting the annotations
            'datetime': record['DateTime'],  # extra field
        }
        sequences.append({
            'dataset': dataset_name,
            'seq_id': f'dummy_{flat_id}',
            'images': [image_entry],
            'location': camera_location,
            'class': [species],
            'count': record['Count'],
        })

39069it [00:04, 8750.13it/s]


In [34]:
# Number of sequence items built, followed by two randomly drawn items as a spot check.
len(sequences)
sample(sequences, 2)

3430

[{'dataset': 'uw_gardner',
  'seq_id': 'dummy_NE4891_Camera_5_C4_9-11-18_DCIM_105RECNX_RCNX0342',
  'images': [{'file': 'Processed Images/NE4891/Camera_5/C4_9.11.18/DCIM/105RECNX/RCNX0342.JPG',
    'frame_num': 1,
    'datetime': '2018-07-16 12:28:10'}],
  'location': 'NE4891_5',
  'class': ['turkey'],
  'count': 1},
 {'dataset': 'uw_gardner',
  'seq_id': 'dummy_NE5740_Camera_15_C36_7-27-18_DCIM_116RECNX_RCNX0457',
  'images': [{'file': 'Processed Images/NE5740/Camera_15/C36_7.27.18/DCIM/116RECNX/RCNX0457.JPG',
    'frame_num': 1,
    'datetime': '2018-06-16 17:01:26'}],
  'location': 'NE5740_15',
  'class': ['common raven'],
  'count': 2}]

In [35]:
# Number of distinct CameraLocation values among the kept rows.
len(locations)

28

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second check (schema conformance) will take some time (about a minute).

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [36]:
%%time

# Validate the sequence items; the two "Verified ..." messages are printed when the checks pass.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 469 ms, sys: 0 ns, total: 469 ms
Wall time: 467 ms


In [37]:
# Serialize before opening the file: opening in 'w' mode truncates it immediately, so
# streaming with json.dump would leave a partial file behind if serialization failed midway.
sequences_json = json.dumps(sequences, indent=1, ensure_ascii=False)

with open(path_to_output_temp, 'w', encoding='utf-8') as f:
    f.write(sequences_json)

### Step 2b - copy non-empties to flat folder

In [38]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path` and return `dst_path`.

    The destination folder is created first if it does not exist, so the copy
    does not fail just because the target folder has not been set up yet.

    Args:
        src_path: path of the existing file to copy.
        dst_path: full path (including file name) of the copy to create.

    Returns:
        `dst_path`, as returned by `shutil.copyfile`.
    """
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:  # empty when dst_path is a bare file name in the working directory
        os.makedirs(dst_dir, exist_ok=True)
    return copyfile(src_path, dst_path)

In [39]:
%%time

path_pairs = []
for seq in tqdm(sequences):
    
    if seq['class'][0] == 'empty':
        continue
    
    seq_id = seq['seq_id']
    
    im = seq['images'][0]
    frame = im['frame_num']
    
    src_path = os.path.join(container_root, im['file'])
    assert os.path.exists(src_path), src_path

    dst_path = os.path.join('/mink_disk_0/camtraps/imerit12d', 
                            f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
    path_pairs.append((src_path, dst_path))

100%|██████████| 3430/3430 [00:00<00:00, 114156.53it/s]

CPU times: user 22.3 ms, sys: 12 ms, total: 34.3 ms
Wall time: 32.6 ms





In [40]:
len(path_pairs)  # number of (src, dst) pairs to copy; equals len(sequences) when no sequence is labeled 'empty'
path_pairs[3000]

3430

('/mink_disk_0/camtraps/uw-gardner/Processed Images/NE7302/Camera_84/C69_2.26.19/DCIM/102RECNX/RCNX1972.JPG',
 '/mink_disk_0/camtraps/imerit12d/uw_gardner.seqdummy_NE7302_Camera_84_C69_2-26-19_DCIM_102RECNX_RCNX1972.frame1.jpg')

In [41]:
%%time

# Copy the files on 8 worker threads; starmap unpacks each (src, dst) pair into
# copy_file's two arguments and collects the return values in order.
with ThreadPool(8) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 2.63 s, sys: 5.46 s, total: 8.09 s
Wall time: 13.9 s


In [42]:
# One returned destination path per copied file.
len(dst_paths)

3430