In [3]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

%load_ext autoreload
%autoreload 2

In [4]:
import sys

# Make the project code importable from this notebook:
# first the CameraTraps repo, then the ai4eutils helpers.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',
    '/home/mink/lib/ai4eutils',
])

In [5]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# wpz_wolverine_labeled

In [1]:
# Dataset identifier; also used to name the output JSON files and the copied images.
dataset_name = 'wpz_wolverine_labeled'

# The images are located under container_root + path_prefix.
container_root = '/mink_disk_0/camtraps/wpz/'
path_prefix = 'wolverine_survey/images/Wolverine Monitoring Photos/'

# Output JSON locations; the `_temp` file receives the full list of sequence items.
path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json'
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

Species labels come from a CSV file; the state (used as the location) is encoded in the file-name prefix.

The file extension is sometimes lower-cased in the CSV (e.g. `.jpg` where the file on disk is `.JPG`).

Empty images are labeled "None".

In [9]:
# CSV of labels; the columns read from it below are ImageID, FileName and Label.
csv_path = os.path.join(container_root, 'wolverine_survey/images/All4StatesIDs.csv')

In [10]:
# Load the label table with pandas' default parsing options.
df = pd.read_csv(csv_path)

In [13]:
len(df)  # number of labeled rows
df.sample(2)  # two random rows, to eyeball the columns

377756

Unnamed: 0,ImageID,FileName,Label
306160,49791,WY_Image_050141.jpg,Magpie
90866,94868,ID_Image_090868.JPG,Human


In [18]:
# Distinct label strings exactly as they appear in the CSV (casing is inconsistent, e.g. 'Unknown' vs 'unknown').
df['Label'].unique()

array(['Human', 'Wolverine', 'None', 'American Marten', 'Red Fox',
       'Coyote', 'Magpie', 'Hawk', 'Snowshoe Hare', "Clark's Nutcracker",
       'Wolf', 'Mountain Lion', 'Gray Jay', 'Mule Deer',
       'Rocky Mountain Elk', "Steller's Jay", 'Unk Mammal',
       'Golden eagle', 'Unk Bird', 'Domestic dog', 'Fisher', 'unknown',
       'White-tailed Deer', 'Moose', 'Red Squirrel', 'Common Raven',
       'Chickadee', 'Woodpecker', 'Flying Squirrel', 'Goshawk', 'Bobcat',
       'skunk', 'Grizzly Bear', 'Black Bear', 'Blue Grouse',
       'Golden-mantled GS', 'Timelapse Photo', 'Canada Lynx',
       'Long-tailed Weasel', 'Mouse', 'Weasel', 'Researcher', '11amTest',
       'Mountain goat', 'Cascade red fox', 'Douglas squirrel',
       'Misc Squirrel', 'American Black Bear', 'grouse',
       'Northern Flying Squirrel', 'American Badger', 'Misc Bird',
       'Misc Small Rodent', 'Misc Chipmunk', 'Badger', 'Time Lapse Photo',
       'Pacific Marten', 'Unknown', 'Raven', 'Elk', 'Golden Eagle',


In [14]:
folder = os.path.join(container_root, path_prefix)

# Every file found under the image folder, before any filtering.
paths = path_utils.recursive_file_list(folder)
len(paths)

# Keep only image files whose base name does not start with '.', expressed
# relative to `folder`, in sorted order.
paths = sorted(
    full_path.split(folder)[1]
    for full_path in paths
    if path_utils.is_image_file(full_path)
    and not os.path.basename(full_path).startswith('.')
)
len(paths)

377756

377756

In [16]:
# Map the part of each path before its first '.' (file name without extension)
# to the full relative path, so CSV entries can be looked up without relying
# on the extension.
fn_to_path = {p.split('.')[0]: p for p in paths}
len(fn_to_path)

377756

In [15]:
# Spot-check one relative path near the end of the sorted list.
paths[-100]

'WY_Image_121637.JPG'

In [17]:
# Spot-check the lookup for a file name seen in the CSV sample.
fn_to_path['WY_Image_050141']

'WY_Image_050141.JPG'

In [19]:
# Build one single-image "sequence" item per CSV row and collect the set of locations.
locations = set()
sequences = []

# iterrows() is a generator, so pass the row count explicitly; otherwise tqdm
# cannot show a percentage or an ETA.
for i_row, row in tqdm(df.iterrows(), total=len(df)):

    df_fn = row['FileName']
    # Look the image up by the part of the name before the first '.', because
    # the extension's case in the CSV may differ from the file on disk.
    fn = fn_to_path[df_fn.split('.')[0]]

    # The file name prefix before the first '_' (e.g. 'WY') is used as the location.
    location = df_fn.split('_')[0]
    locations.add(location)

    # '11amTest' and 'Time Lapse Photo' usually empty but leaving it
    clss = row['Label'].lower()
    if clss == 'none':
        clss = 'empty'
    elif clss == 'researcher':
        clss = 'human'

    sequences.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_{len(sequences)}',  # placeholder id: running index of the item
        'images': [
            {
                'file': fn,
                'frame_num': 1, # only one image, but easier for ingesting the annotations
                'image_id': row['ImageID'] # extra field
            }
        ],
        'location': location,
        'class': [clss]
    })

377756it [00:42, 8810.92it/s]


In [20]:
# Count the sequences whose single class label is 'empty'.
num_empty = sum(seq['class'][0] == 'empty' for seq in sequences)
num_empty

69568

In [21]:
# Fraction of all sequences labeled empty; use the actual count instead of a hard-coded total.
num_empty / len(sequences)

0.18416120458708796

In [23]:
# The locations are the distinct file-name prefixes collected in the loop above.
len(locations)
locations

4

{'ID', 'MT', 'WA', 'WY'}

In [27]:
# Total number of sequence items, then a few consecutive ones for visual inspection.
len(sequences)
sequences[20000:20003]

377756

[{'dataset': 'wpz_wolverine_labeled',
  'seq_id': 'dummy_20000',
  'images': [{'file': 'ID_Image_020002.JPG',
    'frame_num': 1,
    'image_id': 21733}],
  'location': 'ID',
  'class': ['red fox']},
 {'dataset': 'wpz_wolverine_labeled',
  'seq_id': 'dummy_20001',
  'images': [{'file': 'ID_Image_020003.JPG',
    'frame_num': 1,
    'image_id': 21734}],
  'location': 'ID',
  'class': ['red fox']},
 {'dataset': 'wpz_wolverine_labeled',
  'seq_id': 'dummy_20002',
  'images': [{'file': 'ID_Image_020004.JPG',
    'frame_num': 1,
    'image_id': 21735}],
  'location': 'ID',
  'class': ['red fox']}]

## Step 2 - Pass the schema check

Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.

If the format conforms, the following messages will be printed:

```
Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
```

For large datasets, the second step will take some time (~ a minute). 

Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems.

In [25]:
%%time

# Check the sequence items against the MegaDB format; a confirmation is printed for each check passed.
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 44.8 s, sys: 33.4 ms, total: 44.9 s
Wall time: 44.9 s


In [26]:
# Write the full list of sequence items to the temporary JSON file.
with open(path_to_output_temp, mode='w', encoding='utf-8') as temp_json:
    json.dump(sequences, temp_json, indent=1, ensure_ascii=False)

### Step 2b - sample

In [29]:
# Walk the sequences in consecutive batches of 10 images and keep every third
# batch, i.e. about 1/3 of all images, while keeping runs of sequential images together.
sequences_to_include = []

for batch_index, batch_start in enumerate(range(0, len(sequences), 10)):
    if batch_index % 3 == 0:
        sequences_to_include.extend(sequences[batch_start: batch_start + 10])

len(sequences_to_include)

125920

In [32]:
# Drop the sampled sequences labeled 'empty'; only non-empty images are kept.
sequences_to_include = [seq for seq in sequences_to_include if seq['class'][0] != 'empty']
len(sequences_to_include)

102695

In [None]:
# Inspect 40 consecutive entries deep into the sampled list.
sequences_to_include[100000: 100000 + 40]

In [34]:
# Show one randomly chosen sampled sequence (no seed is set, so the pick differs between runs).
sample(sequences_to_include, 1)

[{'dataset': 'wpz_wolverine_labeled',
  'seq_id': 'dummy_228334',
  'images': [{'file': 'WA_Image_025671.JPG',
    'frame_num': 1,
    'image_id': 32583}],
  'location': 'WA',
  'class': ['common raven']}]

### Step 2c - copy images to flat folder

In [35]:
def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path` and return shutil.copyfile's result.

    Thin wrapper around `shutil.copyfile` so it can be handed to a pool's
    `starmap` with (src, dst) argument pairs.
    """
    copied_to = copyfile(src_path, dst_path)
    return copied_to

In [38]:
%%time

# Pair every sampled image with a destination in one flat folder; the new file
# name encodes the dataset, sequence id, frame number and original file name.
path_pairs = []
for seq in tqdm(sequences_to_include):
    image = seq['images'][0]  # each sequence built above holds a single image
    file_name = image['file']  # includes the extension

    src_path = os.path.join(container_root, path_prefix, file_name)
    assert os.path.exists(src_path), src_path

    flat_name = f"{dataset_name}.seq{seq['seq_id']}.frame{image['frame_num']}.file{file_name}"
    dst_path = os.path.join('/mink_disk_0/camtraps/imerit12c', flat_name)
    path_pairs.append((src_path, dst_path))

100%|██████████| 102695/102695 [00:00<00:00, 115059.61it/s]

CPU times: user 744 ms, sys: 169 ms, total: 913 ms
Wall time: 906 ms





In [39]:
len(path_pairs)  # non-empty images among the sampled sequences (102695 of 125920, ~82% in this run)
path_pairs[3000]

102695

('/mink_disk_0/camtraps/wpz/wolverine_survey/images/Wolverine Monitoring Photos/ID_Image_011644.JPG',
 '/mink_disk_0/camtraps/imerit12c/wpz_wolverine_labeled.seqdummy_11642.frame1.fileID_Image_011644.JPG')

In [40]:
%%time

# Copy all (src, dst) pairs using 8 worker threads; starmap unpacks each pair
# into copy_file's two arguments and collects the results in input order.
with ThreadPool(processes=8) as copy_pool:
    dst_paths = copy_pool.starmap(copy_file, path_pairs)

CPU times: user 39.5 s, sys: 1min 19s, total: 1min 59s
Wall time: 4min 31s


In [41]:
# One result per copied file; should equal len(path_pairs).
len(dst_paths)

102695