In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Reload imported modules automatically before running code, so edits to the
# local repos are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
# Make the local checkouts importable for this kernel session.
sys.path.extend([
    '/home/mink/notebooks/CameraTraps',  # this repo
    '/home/mink/lib/ai4eutils',
])

In [4]:
import json
import os
import random
from collections import Counter, defaultdict
from multiprocessing.pool import ThreadPool
from random import sample
from shutil import copyfile

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# awc_202103

In [1]:
# Name of the dataset; also used to build the output file names below.
dataset_name = 'awc_202103'

# The images of this drop live under container_root + path_prefix.
container_root = '/mink_disk_0/camtraps/awc/'
path_prefix = '202103drop/'

# Folder that receives the sequence JSON files for this dataset.
megadb_json_dir = '/home/mink/camtraps/data/megadb_jsons'
path_to_output = f'{megadb_json_dir}/{dataset_name}.json'
path_to_output_temp = f'{megadb_json_dir}/{dataset_name}_temp.json'

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

Folder structure for labeled folders (no location info, guessed sequence):

`Reptiles`

`species - region/IMG_0463.JPG` - sequences can be guessed from where the number in the image name breaks, but back-to-back sequences whose image numbers are consecutive cannot be separated this way.

`MZT reptile pics`

`species / 10_IMG_0779.JPG` - sequences are guessed the same way, from breaks in the image number.


Folder structure for unlabeled folders (has location info but no sequence info), quite empty:

`Bullo Rocky Cams Nov19-Feb20`

location / RCNX0001.JPG


`Treatment_R1`

`FM96 - MG102` (both components look unique, so the pair is treated as the location) / `3.1.20` (a date, although it differs from the date in the image header) / `RCNX0001.JPG`

In [5]:
folder = os.path.join(container_root, path_prefix)

# Everything found under the drop folder, before any filtering.
paths = path_utils.recursive_file_list(folder)
len(paths)

# Keep image files whose name does not start with a dot, and store only the
# part of each path that follows the `folder` prefix.
image_paths = []
for full_path in paths:
    if not path_utils.is_image_file(full_path):
        continue
    if os.path.basename(full_path).startswith('.'):
        continue
    image_paths.append(full_path.split(folder)[1])
image_paths.sort()

paths = image_paths
len(paths)

494380

494332

In [8]:
# Peek at one relative path near each end of the sorted list.
paths[10]
paths[-10]

'Bullo Rocky Cams Nov19-Feb20/HP212/RCNX1001.JPG'

'Treatment_R1/FM99 - MG144/10.2.20/RCNX0061.JPG'

### Labeled folders

In [51]:
# Top-level folders whose sub-folder names carry the species label.
labeled_folders = ('Reptiles', 'MZT reptile pics')

images_labeled = []
species = set()

# State of the sequence-guessing heuristic. A new sequence starts whenever the
# image number in the file name is not a continuation of the previous one, or
# when we move to a different folder.
last_frame_num = 0
last_parent = None  # path components of the folder holding the previous image
seq_count = 0
frame_in_seq = 1

for p in tqdm(paths):
    if not p.startswith(labeled_folders):
        continue

    p_parts = p.split('/')

    # The label folder is named "<species>" or "<species> - <region>".
    clss = p_parts[1].split('-')[0].strip().lower()
    species.add(clss)

    # "IMG_0463.JPG" -> 463, "10_IMG_0779.JPG" -> 779
    fn = p_parts[-1].split('.')[0]
    fn_num = int(fn.split('_')[-1])

    # Image numbers that happen to continue across two folders must not glue
    # images from different species/regions into a single sequence, so a
    # change of folder always breaks the sequence.
    parent = p_parts[:-1]
    if (parent != last_parent
            or fn_num > last_frame_num + 1
            or fn_num < last_frame_num):
        seq_count += 1  # new sequence
        frame_in_seq = 1
    last_frame_num = fn_num
    last_parent = parent

    images_labeled.append({
        'dataset': dataset_name,
        'seq_id': f'labeled_{seq_count}',
        'location': 'unknown',  # the labeled folders carry no location info
        'class': [clss],
        'file': p,
        'frame_num': frame_in_seq
    })
    frame_in_seq += 1

len(images_labeled)

100%|██████████| 494332/494332 [00:00<00:00, 1659085.20it/s]


712

In [52]:
# Distinct class names derived from the labeled folder names.
species

{'ctenophorus scutulatus',
 'reptile sp',
 'tiliqua occipitalis',
 'varanus gouldii',
 'varanus scalaris',
 'varanus sp',
 'varanus tristis',
 'varanus varius'}

In [53]:
# Spot-check a few consecutive entries to see where the guessed sequences break.
images_labeled[10:15]

[{'dataset': 'awc_202103',
  'seq_id': 'labeled_6',
  'location': 'unknown',
  'class': ['varanus sp'],
  'file': 'MZT reptile pics/Varanus sp/13_IMG_0188.JPG',
  'frame_num': 2},
 {'dataset': 'awc_202103',
  'seq_id': 'labeled_6',
  'location': 'unknown',
  'class': ['varanus sp'],
  'file': 'MZT reptile pics/Varanus sp/14_IMG_0189.JPG',
  'frame_num': 3},
 {'dataset': 'awc_202103',
  'seq_id': 'labeled_7',
  'location': 'unknown',
  'class': ['varanus sp'],
  'file': 'MZT reptile pics/Varanus sp/15_IMG_0154.JPG',
  'frame_num': 1},
 {'dataset': 'awc_202103',
  'seq_id': 'labeled_8',
  'location': 'unknown',
  'class': ['varanus sp'],
  'file': 'MZT reptile pics/Varanus sp/1_IMG_0070.JPG',
  'frame_num': 1},
 {'dataset': 'awc_202103',
  'seq_id': 'labeled_8',
  'location': 'unknown',
  'class': ['varanus sp'],
  'file': 'MZT reptile pics/Varanus sp/2_IMG_0071.JPG',
  'frame_num': 2}]

In [54]:
# Group the per-image entries into sequence objects (project helper from
# cct_to_megadb). Per its printed log, it also lifts properties that are
# constant within a sequence (location, class, dataset) to the sequence level.
sequences_labeled = process_sequences(images_labeled, dataset_name)

100%|██████████| 712/712 [00:00<00:00, 747894.93it/s]

The dataset_name is set to awc_202103. Please make sure this is correct!
Making a deep copy of docs...
Putting 712 images into sequences...
Number of sequences: 172
Checking the location field...
Checking which fields in a CCT image entry are sequence-level...

all_img_properties
{'location', 'class', 'file', 'dataset', 'frame_num'}

img_level_properties
{'file', 'frame_num'}

image-level properties that really should be sequence-level
{'location', 'class', 'dataset'}

Finished processing sequences.
Example sequence items:

{"dataset": "awc_202103", "seq_id": "labeled_1", "location": "unknown", "images": [{"file": "MZT reptile pics/Reptile sp/0_IMG_0066.JPG", "frame_num": 1}], "class": ["reptile sp"]}

{"dataset": "awc_202103", "seq_id": "labeled_33", "location": "unknown", "images": [{"file": "MZT reptile pics/Varanus varius/6_IMG_0117.JPG", "frame_num": 1}], "class": ["varanus varius"]}






### Unlabeled folders

In [41]:
sequences_unlabeled = []
locations = set()

# Every image outside the labeled folders becomes its own single-frame
# sequence, because these folders carry no sequence information.
for rel_path in tqdm(paths):
    if not rel_path.startswith(labeled_folders):
        parts = rel_path.split('/')

        # Top-level folder plus the folder below it identify the location.
        location = f'{parts[0]}_{parts[1]}'.replace(' ', '')
        locations.add(location)

        # "RCNX0007.JPG" -> "0007"
        stem = parts[-1].split('.')[0]
        im_id = stem.split('RCNX')[1]

        # The running count of sequences so far is the last part of the ID.
        running_index = len(sequences_unlabeled)
        single_frame = {
            'file': rel_path,
            'frame_num': 1
        }

        sequences_unlabeled.append({
            'dataset': dataset_name,
            'seq_id': f'dummy_{location}_{im_id}_{running_index}',
            'location': location,
            'class': ['__label_unavailable'],
            'images': [single_frame]
        })

len(locations)
len(sequences_unlabeled)

100%|██████████| 494332/494332 [00:03<00:00, 147250.66it/s]


156

493620

### Combined 

In [55]:
# Labeled sequences first, followed by the single-frame unlabeled ones.
sequences = [*sequences_labeled, *sequences_unlabeled]
len(sequences)

493792

## Step 2 - Pass the schema check

In [56]:
%%time

# Validate all sequence items; the helper reports both the schema check and
# the extra requirements not captured by the schema (see its printed output).
sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 58.6 s, sys: 7.73 ms, total: 58.6 s
Wall time: 58.6 s


In [57]:
# Write the combined sequences to the temporary JSON file. Non-ASCII
# characters are written as-is (UTF-8) instead of being escaped.
with open(path_to_output_temp, mode='w', encoding='utf-8') as json_file:
    json.dump(sequences, json_file, ensure_ascii=False)

### Step 2b - sample unlabeled sequences and copy to flat folder

In [58]:
# Draw the subset of unlabeled sequences to send out. A dedicated, seeded
# generator makes the draw repeatable, so the exact same subset can be
# regenerated after a kernel restart.
sample_seed = 42
# random.sample raises ValueError when asked for more items than exist.
sample_size = min(100000, len(sequences_unlabeled))

sample_sequences_unlabeled = random.Random(sample_seed).sample(sequences_unlabeled, sample_size)
locations_in_sample = {seq['location'] for seq in sample_sequences_unlabeled}
len(locations_in_sample)  # number of distinct locations covered by the sample

156

In [59]:
seq_to_send = sequences_labeled + sample_sequences_unlabeled
len(seq_to_send)

# Source images live under the drop folder; copies go into one flat folder.
src_root = os.path.join(container_root, path_prefix)
flat_folder = '/mink_disk_0/camtraps/imerit12g'


def _flat_name(seq_id, frame):
    """Flat file name encoding the dataset, sequence ID and frame number."""
    return f'{dataset_name}.seq{seq_id}.frame{frame}.jpg'


# One (source, destination) pair per image of every sequence to send.
path_pairs = []
for seq in tqdm(seq_to_send):
    seq_id = seq['seq_id']
    path_pairs.extend(
        (
            os.path.join(src_root, im['file']),
            os.path.join(flat_folder, _flat_name(seq_id, im['frame_num'])),
        )
        for im in seq['images']
    )

100172

100%|██████████| 100172/100172 [00:00<00:00, 133857.04it/s]


In [60]:
# Sanity-check the copy plan: number of (source, destination) pairs and a few examples.
len(path_pairs)
path_pairs[:10]
path_pairs[-100]

100712

[('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/0_IMG_0066.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_1.frame1.jpg'),
 ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/1_IMG_0383.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_2.frame1.jpg'),
 ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Reptile sp/2_IMG_0384.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_2.frame2.jpg'),
 ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/0_IMG_0190.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame1.jpg'),
 ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/1_IMG_0191.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame2.jpg'),
 ('/mink_disk_0/camtraps/awc/202103drop/MZT reptile pics/Varanus scalaris/2_IMG_0192.JPG',
  '/mink_disk_0/camtraps/imerit12g/awc_202103.seqlabeled_3.frame3.jpg'),
 ('/mink_disk_0/camtraps/awc/20210

('/mink_disk_0/camtraps/awc/202103drop/Treatment_R1/FM80 - MG109/31.12.19/RCNX0378.JPG',
 '/mink_disk_0/camtraps/imerit12g/awc_202103.seqdummy_Treatment_R1_FM80-MG109_0378_489577.frame1.jpg')

In [62]:
%%time

def copy_file(src_path, dst_path):
    """Copy the file at `src_path` to `dst_path` and return what `copyfile` returns."""
    copied_to = copyfile(src_path, dst_path)
    return copied_to


# Copy with 12 worker threads; starmap unpacks each (src, dst) pair into
# copy_file's two arguments and collects the return values in input order.
with ThreadPool(processes=12) as workers:
    dst_paths = workers.starmap(copy_file, path_pairs)

CPU times: user 1min 54s, sys: 3min 40s, total: 5min 34s
Wall time: 6min 44s


In [63]:
# One return value per copy call, so this should equal len(path_pairs).
len(dst_paths)

100712