In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH
sys.path.append('/home/mink/lib/ai4eutils')

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils  # ai4eutils
import sas_blob_utils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# parkscanada_garrow_201920_trains

In [37]:
dataset_name = 'parkscanada_garrow_201920_trains'

container_root = '/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/'

path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' 
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' 

## Step 0 - Add an entry to the `datasets` table

Done

## Step 1 - Prepare the `sequence` objects to insert into the database

No sequence information available.

In [8]:
api_inputs_dir = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada'

api_inputs = path_utils.recursive_file_list(api_inputs_dir)

In [9]:
api_inputs

['/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20210207/parkscanada-garrow-20210207.chunk001.json',
 '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20210207/parkscanada-garrow-20210207.chunk000.json',
 '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/parkscanada/api_inputs/parkscanada-garrow-20200409/parkscanada-garrow-20200409.chunk000.json']

In [14]:
all_paths = []

for p in api_inputs:
    with open(p) as f:
        li = json.load(f)
        all_paths.extend(li)
len(all_paths)

1816046

In [16]:
all_paths[1000]
all_paths[-1000]

'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_91291images_Feb202020_March212020/100RECNX/RCNX6570.JPG'

'CPRail5_589703_5670051_17_12_2019to29_12_2019/101RECNX/IMG_1660.JPG'

### 2019 folders

In [20]:
sequences_2019 = []
locations_2019 = set()

for p in tqdm(all_paths):
    if p.startswith('garrow-2020.') or not path_utils.is_image_file(p):
        continue
        
    p_parts = p.split('/')
    railway_loc = '2019_' + p_parts[0].split('_')[0]
    locations_2019.add(railway_loc)
    
    sequences_2019.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_2019_{len(sequences_2019)}',
        'location': railway_loc,
        'class': ['train'],
        'images': [{
            'file': p,
            'frame_num': 1,
        }]
    })
    
len(sequences_2019)
len(locations_2019)

100%|██████████| 1816046/1816046 [00:03<00:00, 562853.03it/s] 


218417

5

In [26]:
locations_2019

{'2019_CPRail1',
 '2019_CPRail2',
 '2019_CPRail3',
 '2019_CPRail4',
 '2019_CPRail5'}

In [27]:
sample(sequences_2019, 1)

[{'dataset': 'parkscanada_garrow_201920_trains',
  'seq_id': 'dummy_2019_138479',
  'location': '2019_CPRail3',
  'class': ['train'],
  'images': [{'file': 'CPRail3_593114_5669476_13_11_2019to22_11_2019/101RECNX/IMG_1371.JPG',
    'frame_num': 1}]}]

### 2020 folders

In [23]:
sequences_2020 = []
locations_2020 = set()

for p in tqdm(all_paths):
    if not p.startswith('garrow-2020.') or not path_utils.is_image_file(p):
        continue
        
    p_parts = p.split('/')
    railway_loc = '2020_' + p_parts[2].split('_')[0]
    locations_2020.add(railway_loc)
    
    sequences_2020.append({
        'dataset': dataset_name,
        'seq_id': f'dummy_2020_{len(sequences_2020)}',
        'location': railway_loc,
        'class': ['train'],
        'images': [{
            'file': p,
            'frame_num': 1,
        }]
    })
    
len(sequences_2020)
len(locations_2020)

100%|██████████| 1816046/1816046 [00:15<00:00, 118761.22it/s]


1597625

5

In [24]:
locations_2020

{'2020_CPRail1',
 '2020_CPRail2',
 '2020_CPRail3',
 '2020_CPRail4',
 '2020_CPRail5'}

In [25]:
sample(sequences_2020, 1)

[{'dataset': 'parkscanada_garrow_201920_trains',
  'seq_id': 'dummy_2020_7034',
  'location': '2020_CPRail3',
  'class': ['train'],
  'images': [{'file': 'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_91291images_Feb202020_March212020/101RECNX/RCNX2605.JPG',
    'frame_num': 1}]}]

### Combined

In [28]:
sequences = sequences_2019 + sequences_2020
len(sequences)

1816042

## Step 2 - Pass the schema check

In [29]:
%%time

sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 3min 40s, sys: 103 ms, total: 3min 40s
Wall time: 3min 40s


### Step 2b - Sample

Only taking a random sample of 25k images.

We will also only include this sample in the megadb.

In [30]:
sequences_sample = sample(sequences, 25000)

locations_sampled = set()
for seq in sequences_sample:
    locations_sampled.add(seq['location'])
len(locations_sampled)  # covering all 10 locations

10

In [31]:
len(sequences_sample)

25000

In [32]:
with open(path_to_output_temp, 'w', encoding='utf-8') as f:
    json.dump(sequences_sample, f, ensure_ascii=False)

### Step 2c - Download the sampled images

In [33]:
list_to_download = []

for seq in sequences_sample:
    for im in seq['images']:
        list_to_download.append(im['file'] + '\n')
len(list_to_download)

25000

In [34]:
list_to_download[10000]

'garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_56019images_Sept102020_Oct102020/105RECNX/RCNX2952.JPG\n'

In [35]:
with open('/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/parkscanada_trains_files.txt', 'w') as f:
    f.writelines(list_to_download)

### Step 2d - Copy to flat folder

In [38]:
path_pairs = []

for seq in tqdm(sequences_sample):
    seq_id = seq['seq_id']
    
    for im in seq['images']:
        frame = im['frame_num']
    
        src_path = os.path.join(container_root, im['file'])
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12g', 
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

100%|██████████| 25000/25000 [00:00<00:00, 77210.24it/s]


In [42]:
len(path_pairs)
sample(path_pairs, 5)

25000

[('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/CPRail4_591659_5669241_17_12_2019to28_12_2019/101RECNX/IMG_4496.JPG',
  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2019_182929.frame1.jpg'),
 ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail2_594233_5669466_16881images_June172020_July152020/100RECNX/IMG_8751.JPG',
  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2020_878340.frame1.jpg'),
 ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_59172images_April92020_May82020/103RECNX/RCNX0172.JPG',
  '/mink_disk_0/camtraps/imerit12g/parkscanada_garrow_201920_trains.seqdummy_2020_1357262.frame1.jpg'),
 ('/mink_disk_0/camtraps/parkscanada-garrow/parkscanada-garrow/garrow-2020.12.30/Garrow2021ParksCanada/CPRail3_592916_5669361_64565images_May82020_June172020/100RECNX/RCNX2125.JPG',
  '/mink_

In [43]:
%%time

def copy_file(src_path, dst_path):
    return copyfile(src_path, dst_path)

with ThreadPool(12) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 32.2 s, sys: 32.6 s, total: 1min 4s
Wall time: 29.2 s


In [44]:
len(dst_paths)

25000