In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'  # default is ‘last_expr'

%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH
sys.path.append('/home/mink/lib/ai4eutils')

In [3]:
import json
import os
from collections import Counter, defaultdict
from random import sample
from shutil import copyfile
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
from tqdm import tqdm

import path_utils, sas_blob_utils  # ai4eutils

from data_management.megadb.schema import sequences_schema_check
from data_management.megadb.converters.cct_to_megadb import process_sequences

# idfg_swwlf_2020

In particular, we want to label sequences with low MDv4 detection confidence.

In [13]:
dataset_name = 'idfg_swwlf_2020'

container_root = '/mink_disk_0/camtraps/idfg/'
path_prefix = 'SWWLF2020/'

path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' 
path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' 

In [5]:
all_csv_path = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/idfg_SWWLF2020_all.csv'

with open(all_csv_path) as f:
    all_csv = pd.read_csv(f)

In [6]:
len(all_csv)

11589732

In [7]:
all_csv.sample(2)

Unnamed: 0,image_path,max_confidence,detections,max_conf_animal,max_conf_person,max_conf_group,max_conf_vehicle
4172076,SWWLF2020\R3_mccall_01\IDFG2307\SWWLF2020_IDFG...,0.0,,,,,
8914935,SWWLF2020\R6_02\IDFG2704\SWWLF2020_IDFG2704_20...,0.0,,,,,


In [8]:
all_csv.iloc[918957]['image_path']

'SWWLF2020\\R1_01\\IDFG1476\\SWWLF2020_IDFG1476_20200705_140000_TL_0b.JPG'

In [28]:
seq_dict = defaultdict(list)

seq_count = 0
cur_frame_num = 1

for i_row, row in all_csv.iterrows():
    
    if i_row % 1000000 == 0:
        print(i_row)
    
    im_path = row['image_path'].replace('\\', '/').split(path_prefix)[1]
    b_parts = os.path.basename(im_path).split('_')
    
    frame = b_parts[-1].split('.')[0]
    if frame.endswith('b'):
        continue;  # skip the "b" images - they seem to be very close to the previous frame
    else:
        frame_num = int(frame)
    
    if frame_num < cur_frame_num:
        # new seq
        seq_count += 1

    seq_dict[seq_count].append({
        'file': im_path,
        'frame_num': frame_num,
        'datetime': b_parts[2],
        'seq_id': seq_count,
        'location': '_'.join(b_parts[:2]),
        'max_conf': row['max_confidence']
    })
    
    cur_frame_num = frame_num

0
1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000


In [30]:
len(seq_dict)

1428238

In [35]:
total_im = 0
for seq in seq_dict.values():
    total_im += len(seq)
total_im

11013726

### CSV with false negatives

In [18]:
csv_path = '/mink_disk_0/camtraps/megadetectorv5_annotation_prep/annotation_prep/swwlf2020_false_neg_animal.csv'

with open(csv_path) as f:
    false_negatives_df = pd.read_csv(f)

In [19]:
len(false_negatives_df)
false_negatives_df.sample(1)

8294

Unnamed: 0.1,Unnamed: 0,File,Human,Wildlife,Livestock,Pet_pack_horse,Vehicle,Empty,YoungPresent,Species,Count,max_conf_animal
3022,3023,SWWLF2020_IDFG0943_20200810_110742_MD_2.JPG,True,False,False,True,False,False,False,human,0.0,0.291


In [22]:
false_negatives_df['Species'].unique()

array(['wolf', 'turkey', 'bobcat', 'dog_domestic', 'human',
       'whitetaileddeer', 'cattle_cow', 'elk', 'moose', 'bear_black',
       'bird', 'squirrel', 'rabbit_hare', 'other_comments', 'horse',
       'muledeer', 'deer_speciesunknown', 'unknown', 'fox_red',
       'ground_squirrel', 'none', 'badger', 'rodent', 'coyote',
       'goat_domestic', 'mountain_lion', 'cat_domestic', 'skunk_striped',
       'llama', 'sheep_domestic', 'marmot', 'goat_mountain', 'pronghorn',
       'porcupine'], dtype=object)

We want to identify the sequences from which these false negative images are from, and check if the sequence overall is low-confidence.

If it's a time-triggered image, we should get it annotated.

In [24]:
false_negs_species = {}
false_negs_count = {}

for i_row, row in false_negatives_df.iterrows():
    false_negs_species[row['File']] = row['Species']
    false_negs_count[row['File']] = int(row['Count']) if not np.isnan(row['Count']) else 0

### Filter to sequences with species label we get from the false negatives list

In [36]:
from copy import deepcopy

In [50]:
sequences = []
num_images = 0

for seq_id, seq in tqdm(seq_dict.items()):
    if len(seq) == 0:
        continue
    
    species = None
    location = seq[0]['location']
    date = seq[0]['datetime']  # only a date, no timestamp
    
    for im in seq:
        basename = os.path.basename(im['file']) 
        if basename in false_negs_species:
            species = false_negs_species[basename]
            count = false_negs_count[basename]
    
    if species is None:  # no label, exclude
        continue
        
    images = deepcopy(seq)
    
    for im in images:
        del im['seq_id'],
        del im['location']
        
    sequences.append({
        'dataset': dataset_name,
        'seq_id': f'md_missed_{seq_id}',
        'location': location,
        'class': [species],
        'datetime': date,
        'images': images
    })
    num_images += len(images)

    
len(sequences) 

100%|██████████| 1428239/1428239 [00:15<00:00, 89847.53it/s]


3847

In [51]:
num_images

216572

In [57]:
sample(sequences, 1)

[{'dataset': 'idfg_swwlf_2020',
  'seq_id': 'md_missed_219772',
  'location': 'SWWLF2020_IDFG1763',
  'class': ['elk'],
  'datetime': '20200823',
  'images': [{'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203309_MD_1.JPG',
    'frame_num': 1,
    'datetime': '20200823',
    'max_conf': 0.7709999999999999},
   {'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203309_MD_2.JPG',
    'frame_num': 2,
    'datetime': '20200823',
    'max_conf': 0.212},
   {'file': 'R2_02/IDFG1763/SWWLF2020_IDFG1763_20200823_203310_MD_3.JPG',
    'frame_num': 3,
    'datetime': '20200823',
    'max_conf': 0.0}]}]

In [58]:
# some frame numbers are not unique within the sequence; drop such frames
num_dropped = 0

for seq in sequences:
    frame_numbers = []
    frames = []
    
    for im in seq['images']:
        if im['frame_num'] in frame_numbers:
            num_dropped += 1
            continue
        else:
            frame_numbers.append(im['frame_num'])
        frames.append(im)
    seq['images'] = frames

In [85]:
%%time

sequences_schema_check.sequences_schema_check(sequences)

Verified that the sequence items meet requirements not captured by the schema.
Verified that the sequence items conform to the schema.
CPU times: user 853 ms, sys: 0 ns, total: 853 ms
Wall time: 852 ms


In [86]:
len(sequences)

3847

In [87]:
num_images = sum([len(seq['images']) for seq in sequences])
num_images

12559

In [88]:
locations = Counter([seq['location'] for seq in sequences])

In [89]:
len(locations)  # most only have one or two sequences at the location

517

In [66]:
with open(path_to_output_temp, 'w', encoding='utf-8') as f:
    json.dump(sequences, f, indent=1, ensure_ascii=False)

In [103]:
im_to_send = []
for seq in sequences:
    for im in seq['images']:
        if im['file'] == 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG':
            print(seq)
        
        im_to_send.append(im)
        if im['file'] == 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG':
            print(im_to_send[-1])
    # im_to_send.extend(seq['images'])
len(im_to_send)

{'dataset': 'idfg_swwlf_2020', 'seq_id': 'md_missed_6234', 'location': 'SWWLF2020_IDFG0808', 'class': ['human'], 'datetime': '20200801', 'images': [{'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG', 'frame_num': 1, 'datetime': '20200801', 'max_conf': 1.0}, {'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161956_MD_2.JPG', 'frame_num': 2, 'datetime': '20200801', 'max_conf': 1.0}, {'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161956_MD_3.JPG', 'frame_num': 3, 'datetime': '20200801', 'max_conf': 0.888}]}
{'file': 'R1_01/IDFG0808/SWWLF2020_IDFG0808_20200801_161954_MD_1.JPG', 'frame_num': 1, 'datetime': '20200801', 'max_conf': 1.0}


12559

In [105]:
'SWWLF2020/R1_02/IDFG1501/SWWLF2020_IDFG1501_20200704_151530_MD_1.JPG\n' in list_to_download

True

In [106]:
list_to_download = [os.path.join(path_prefix, im['file']) + '\n' for im in im_to_send]

with open(f'/mink_disk_0/camtraps/megadetectorv5_annotation_prep/batch_12_lists/{dataset_name}_files.txt', 'w') as f:
    f.writelines(list_to_download)

In [107]:
list_to_download[-50]

'SWWLF2020/R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG\n'

## Rename and copy to imerit12g folder

In [108]:
%%time

path_pairs = []

not_there = []

for seq in sequences:
    seq_id = seq['seq_id']
    for im in seq['images']:
        src_path = os.path.join(container_root, path_prefix, im['file'])
        if not os.path.exists(src_path):
            not_there.append((im, seq_id))
        
        frame = im['frame_num']
        dst_path = os.path.join('/mink_disk_0/camtraps/imerit12g', 
                                f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')
        path_pairs.append((src_path, dst_path))

path_pairs[-50]

CPU times: user 83.8 ms, sys: 32.5 ms, total: 116 ms
Wall time: 115 ms


('/mink_disk_0/camtraps/idfg/SWWLF2020/R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG',
 '/mink_disk_0/camtraps/imerit12g/idfg_swwlf_2020.seqmd_missed_1410386.frame3.jpg')

In [109]:
im_to_send[-50]

{'file': 'R7_02/IDFG2749/SWWLF2020_IDFG2749_20200902_195724_MD_3.JPG',
 'frame_num': 3,
 'datetime': '20200902',
 'max_conf': 1.0}

In [110]:
len(path_pairs)

12559

In [112]:
len(not_there)

315

In [113]:
%%time

def copy_file(src_path, dst_path):
    if os.path.exists(src_path):
        return copyfile(src_path, dst_path)

with ThreadPool(16) as pool:
    dst_paths = pool.starmap(copy_file, path_pairs)

CPU times: user 29.4 s, sys: 37.1 s, total: 1min 6s
Wall time: 30.3 s
