# Create a subset of NACTI training data

In [1]:
import json
import math
import os
import re
from shutil import copyfile

import numpy as np
import pandas as pd

Import the data

In [2]:
# Path to the NACTI metadata JSON; later cells read its 'categories',
# 'images' and 'annotations' entries.
nacti_json = '/data/nacti/nacti_metadata.json'

# Parse the whole metadata file into a Python object straight from the handle.
with open(nacti_json) as metadata_file:
    data = json.load(metadata_file)

Create mappings for category id to category name and image id to image filename

In [3]:
# Map each category id to its common name, with every space and hyphen
# replaced by an underscore (e.g. usable as a directory name).
category_map = {}
for category in data['categories']:
    category_map[category['id']] = re.sub(' |-', '_', category['common name'])

In [4]:
# Look-up table from image id to the image's file name as recorded in the metadata.
id_path_map = dict((image['id'], image['file_name']) for image in data['images'])

Create a list of species important to our analysis

In [5]:
# Category names to keep. They are compared against the normalized names in
# category_map, so they must be spelled with underscores instead of spaces
# or hyphens.
# NOTE(review): 'human' and 'dog' do not show up in the per-category counts
# displayed further down in this notebook -- confirm whether the metadata uses
# different names for them (e.g. 'domestic_dog' is already listed separately).
species_list = [
    'american_black_bear', 
    'bobcat', 
    'coyote', 
    'empty', 
    'vehicle', 
    'cougar', 
    'human', 
    'dog', 
    'white_tailed_deer',
    'wolf',
    'moose',
    'gray_fox',
    'elk',
    'domestic_dog',
    'wild_turkey',
    'red_fox',
    'domestic_cow'
]

In [17]:
# Sanity check: how many category names are in the keep-list.
len(species_list)

17

### Create a dataframe of labeled data

In [6]:
# Build one row per annotation from the metadata.
df = pd.DataFrame(data['annotations'])

# Resolve each image id to its file path and each category id to its
# normalized category name. Series.map with a dict is a direct look-up;
# ids that are missing from the dict come back as NaN and are rejected below.
df['file_path'] = df['image_id'].map(id_path_map)
df['category_name'] = df['category_id'].map(category_map)
for column in ('file_path', 'category_name'):
    n_missing = int(df[column].isna().sum())
    if n_missing:
        raise KeyError(
            '{} annotation(s) have no {} in the metadata look-up'.format(n_missing, column)
        )

# Get the current subfolder ('sub' followed by three digits) that the file is
# located in; the first match in each path is used. The pattern is a raw
# string so that '\d' is not treated as an (invalid) string escape.
df['subfolder'] = df['file_path'].str.extract(r'(sub\d{3})', expand=False)
n_unmatched = int(df['subfolder'].isna().sum())
if n_unmatched:
    raise ValueError(
        '{} file path(s) do not contain a subNNN folder component'.format(n_unmatched)
    )

df.head()

Unnamed: 0,category_id,id,image_id,file_path,category_name,subfolder
0,10,b56989c2-d98a-11e8-969f-000d3a71ec1a,2010_Unit150_Ivan097_img0001.jpg,sub001/2010_Unit150_Ivan097_img0001.jpg,red_deer,sub001
1,10,b56989c3-d98a-11e8-969f-000d3a71ec1a,2010_Unit150_Ivan097_img0002.jpg,sub001/2010_Unit150_Ivan097_img0002.jpg,red_deer,sub001
2,10,b56989c4-d98a-11e8-969f-000d3a71ec1a,2010_Unit150_Ivan097_img0003.jpg,sub001/2010_Unit150_Ivan097_img0003.jpg,red_deer,sub001
3,10,b56989c5-d98a-11e8-969f-000d3a71ec1a,2010_Unit150_Ivan097_img0004.jpg,sub001/2010_Unit150_Ivan097_img0004.jpg,red_deer,sub001
4,10,b56989c6-d98a-11e8-969f-000d3a71ec1a,2010_Unit150_Ivan097_img0005.jpg,sub001/2010_Unit150_Ivan097_img0005.jpg,red_deer,sub001


Get only the species that are necessary for our analysis

In [7]:
# Keep only the annotations whose category is one of the species of interest.
is_wanted_species = df['category_name'].isin(species_list)
subset = df.loc[is_wanted_species]

# Disabled: optional extra filter that would keep only categories with more
# than 100 unique annotation ids.
# sample_count = (subset.groupby(['category_name'])['id'].nunique() > 100).to_dict()
# subset = subset[subset['category_name'].apply(lambda x: sample_count[x])]

In [8]:
# Number of distinct annotation ids per category in the filtered subset.
subset.groupby(['category_name'])['id'].nunique()

category_name
american_black_bear      88478
bobcat                   25443
cougar                   14756
coyote                   21946
domestic_cow           2019012
domestic_dog               752
elk                      22143
empty                   460135
gray_fox                 10230
moose                     9964
red_fox                   1723
vehicle                  26015
white_tailed_deer        13598
wild_turkey               4366
wolf                       474
Name: id, dtype: int64

### Split the data

In [10]:
def apply_train_test_dev_labels(value, splits):
    """Return an ordered list of split labels for ``value`` samples.

    The leading positions are labeled ``'train'``, the following positions
    ``'val'`` and every remaining position ``'test'``. Position ``i`` is
    'train' while ``i < value * splits[0]`` and 'val' while
    ``i < value * (splits[0] + splits[1])``.

    Parameters
    ----------
    value : int
        Number of labels to generate. Zero or negative gives an empty list.
    splits : sequence of float
        Split fractions. Only the first two entries (train, val) are read;
        whatever is left after those two blocks is labeled 'test'.

    Returns
    -------
    list of str
        ``value`` labels, each one of 'train', 'val' or 'test'.
    """
    if value <= 0:
        return []

    def _cutoff(fraction):
        # The number of positions i with i < value * fraction is the ceiling
        # of that product. Rounding first strips binary floating-point noise
        # (e.g. 10 * (0.1 + 0.2) == 3.0000000000000004), which would otherwise
        # push one extra sample across the boundary.
        return math.ceil(round(value * fraction, 6))

    # Clamp so the blocks never overlap or run past the end of the list.
    train_end = min(max(_cutoff(splits[0]), 0), value)
    val_end = min(max(_cutoff(splits[0] + splits[1]), train_end), value)

    return (
        ['train'] * train_end
        + ['val'] * (val_end - train_end)
        + ['test'] * (value - val_end)
    )

In [11]:
max_val = 100000            # maximum number of rows kept per category
splits = (0.7, 0.15, 0.15)  # train / val fractions; the remainder becomes test
seed = 42                   # fixed seed so the shuffle (and the split) is reproducible
list_ = []

for category, grp in subset.groupby('category_name'):
    # Shuffle the category and cap it at max_val rows in one step. Sampling
    # returns a new frame, so adding the 'set' column below never writes
    # into a slice of the original data.
    samp_size = min(len(grp), max_val)
    grp = grp.sample(n=samp_size, random_state=seed).reset_index(drop=True)
    # Rows are already in random order, so labeling by position is a random split.
    grp['set'] = apply_train_test_dev_labels(samp_size, splits)
    list_.append(grp)

shuff_df = pd.concat(list_).reset_index(drop=True)
# Destination layout: /data/training_data/<set>/<category_name>/<image_id>
shuff_df['new_path'] = '/data/training_data/' + shuff_df['set'] + '/' + shuff_df['category_name'] + '/'  + shuff_df['image_id']
# Source location of the image inside the NACTI download.
shuff_df['old_path'] = '/data/nacti/' + shuff_df['file_path']
# Directory that must exist before the image can be copied to new_path.
shuff_df['dir_name'] = shuff_df['new_path'].map(os.path.dirname)

In [12]:
# Number of distinct annotation ids for every (split, category) pair, to check the split sizes.
shuff_df.groupby(['set', 'category_name'])['id'].nunique()

set    category_name      
test   american_black_bear    13271
       bobcat                  3816
       cougar                  2213
       coyote                  3291
       domestic_cow           15000
       domestic_dog             112
       elk                     3321
       empty                  15000
       gray_fox                1534
       moose                   1494
       red_fox                  258
       vehicle                 3902
       white_tailed_deer       2039
       wild_turkey              654
       wolf                      71
train  american_black_bear    61935
       bobcat                 17811
       cougar                 10330
       coyote                 15363
       domestic_cow           70000
       domestic_dog             527
       elk                    15501
       empty                  70000
       gray_fox                7161
       moose                   6975
       red_fox                 1207
       vehicle                18211
 

In [13]:
# Preview the source -> destination path pairing for the selected rows.
shuff_df[['category_name', 'old_path', 'new_path']]

Unnamed: 0,category_name,old_path,new_path
0,american_black_bear,/data/nacti/sub343/WA-Site1_11628.JPG,/data/training_data/train/american_black_bear/...
1,american_black_bear,/data/nacti/sub343/WA-Site2_00550.JPG,/data/training_data/train/american_black_bear/...
2,american_black_bear,/data/nacti/sub348/WA-Site5_01340.JPG,/data/training_data/train/american_black_bear/...
3,american_black_bear,/data/nacti/sub039/CA-04_05_04_2016_CA-04_0013...,/data/training_data/train/american_black_bear/...
4,american_black_bear,/data/nacti/sub041/CA-04_10_31_2016_CA-04_0025...,/data/training_data/train/american_black_bear/...
5,american_black_bear,/data/nacti/sub346/WA-Site2_31369.JPG,/data/training_data/train/american_black_bear/...
6,american_black_bear,/data/nacti/sub037/CA-03_06_07_2016_CA-03_0021...,/data/training_data/train/american_black_bear/...
7,american_black_bear,/data/nacti/sub347/WA-Site3_04901.JPG,/data/training_data/train/american_black_bear/...
8,american_black_bear,/data/nacti/sub350/WA-Site9_01445.JPG,/data/training_data/train/american_black_bear/...
9,american_black_bear,/data/nacti/sub104/CA-42_12_02_2015_CA-42_0012...,/data/training_data/train/american_black_bear/...


In [14]:
# Fraction of all selected rows that ended up in each split.
shuff_df['set'].value_counts()/len(shuff_df)

train    0.700017
val      0.150000
test     0.149984
Name: set, dtype: float64

In [15]:
# Create every destination directory (one per split/category pair).
# exist_ok=True keeps this cell re-runnable: directories left over from a
# previous run no longer raise FileExistsError.
for dir_name in shuff_df['dir_name'].unique():
    os.makedirs(dir_name, exist_ok=True)

In [16]:
# Copy each selected image from its source location to its destination.
# Iterating over the two path columns directly avoids building a Series
# object per row, which is what iterrows() does.
for old_path, new_path in zip(shuff_df['old_path'], shuff_df['new_path']):
    copyfile(old_path, new_path)