# Wildlife Image Recognition - Data Collection Notebook

This project focuses on building a convolutional neural network with transfer learning to identify dangerous animals in the Mountain West. To start, we need to collect the data from the [LILA website](https://lila.science/datasets/nacti) using code they've provided. Then we can clean it up to prepare for modeling.

In [None]:
import pandas as pd
import numpy as np
import warnings
import json
import urllib.request
import tempfile
import zipfile
import os

from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from urllib.parse import urlparse


warnings.filterwarnings('ignore')

The following code was provided by LILA to collect data from their site. It is commented out so that the data collection process is not accidentally started again. It collects photos of the designated species to include in our dataset.

In [None]:
# Code written by LILA to help users access their image data. 
# # Find this code here: https://github.com/microsoft/CameraTraps/blob/master/data_management/download_lila_subset.py
# #
# #
# # Example of how to download a list of files from LILA, e.g. all the files
# # in a data set corresponding to a particular species.
# #

# #%% Constants and imports


# # LILA camera trap master metadata file
# metadata_url = 'http://lila.science/wp-content/uploads/2020/03/lila_sas_urls.txt'

# # In this example, we're using the NACTI data set
# datasets_of_interest = ['NACTI']

# # All lower-case; we'll convert category names to lower-case when comparing
# species_of_interest = ['ursus americanus', 'odocoileus hemionus', 'canis latrans','puma concolor', 'alces alces', 'cervus canadensis',
#                        'procyon lotor', 'unidentified chipmunk','tamiasciurus hudsonicus', 'empty']

# # We'll write images, metadata downloads, and temporary files here
# output_dir = r'c:\temp\lila_downloads_by_species_final'
# os.makedirs(output_dir,exist_ok=True)

# # We will demonstrate two approaches to downloading, one that loops over files
# # and downloads directly in Python, another that uses AzCopy.
# #
# # AzCopy will generally be more performant and supports resuming if the 
# # transfers are interrupted.  It assumes that azcopy is on the system path.
# use_azcopy_for_download = False

# overwrite_files = False

# # Number of concurrent download threads (when not using AzCopy) (AzCopy does its
# # own magical parallelism)
# n_download_threads = 50


# #%% Support functions

# def download_url(url, destination_filename=None, force_download=False, verbose=True):
#     """
#     Download a URL (defaulting to a temporary file)
#     """
    
#     if destination_filename is None:
#         temp_dir = os.path.join(tempfile.gettempdir(),'lila')
#         os.makedirs(temp_dir,exist_ok=True)
#         url_as_filename = url.replace('://', '_').replace('.', '_').replace('/', '_')
#         destination_filename = \
#             os.path.join(temp_dir,url_as_filename)
            
#     if (not force_download) and (os.path.isfile(destination_filename)):
#         print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url)))
#         return destination_filename
    
#     if verbose:
#         print('Downloading file {} to {}'.format(os.path.basename(url),destination_filename),end='')
    
#     os.makedirs(os.path.dirname(destination_filename),exist_ok=True)
#     urllib.request.urlretrieve(url, destination_filename)  
#     assert(os.path.isfile(destination_filename))
    
#     if verbose:
#         nBytes = os.path.getsize(destination_filename)    
#         print('...done, {} bytes.'.format(nBytes))
        
#     return destination_filename


# def download_relative_filename(url, output_base, verbose=False):
#     """
#     Download a URL to output_base, preserving relative path
#     """
    
#     p = urlparse(url)
#     # remove the leading '/'
#     assert p.path.startswith('/'); relative_filename = p.path[1:]
#     destination_filename = os.path.join(output_base,relative_filename)
#     download_url(url, destination_filename, verbose=verbose)
    

# def unzip_file(input_file, output_folder=None):
#     """
#     Unzip a zipfile to the specified output folder, defaulting to the same location as
#     the input file    
#     """
    
#     if output_folder is None:
#         output_folder = os.path.dirname(input_file)
        
#     with zipfile.ZipFile(input_file, 'r') as zf:
#         zf.extractall(output_folder)


# #%% Download and parse the metadata file

# # Put the master metadata file in the same folder where we're putting images
# p = urlparse(metadata_url)
# metadata_filename = os.path.join(output_dir,os.path.basename(p.path))
# download_url(metadata_url, metadata_filename)

# # Read lines from the master metadata file
# with open(metadata_filename,'r') as f:
#     metadata_lines = f.readlines()
# metadata_lines = [s.strip() for s in metadata_lines]

# # Parse those lines into a table
# metadata_table = {}

# for s in metadata_lines:
    
#     if len(s) == 0 or s[0] == '#':
#         continue
    
#     # Each line in this file is name/sas_url/json_url
#     tokens = s.split(',')
#     assert len(tokens)==3
#     url_mapping = {'sas_url':tokens[1],'json_url':tokens[2]}
#     metadata_table[tokens[0]] = url_mapping
    
#     assert 'https' not in tokens[0]
#     assert 'https' in url_mapping['sas_url']
#     assert 'https' in url_mapping['json_url']


# #%% Download and extract metadata for the datasets we're interested in

# for ds_name in datasets_of_interest:
    
#     assert ds_name in metadata_table
#     json_url = metadata_table[ds_name]['json_url']
    
#     p = urlparse(json_url)
#     json_filename = os.path.join(output_dir,os.path.basename(p.path))
#     download_url(json_url, json_filename)
    
#     # Unzip if necessary
#     if json_filename.endswith('.zip'):
        
#         with zipfile.ZipFile(json_filename,'r') as z:
#             files = z.namelist()
#         assert len(files) == 1
#         unzipped_json_filename = os.path.join(output_dir,files[0])
#         if not os.path.isfile(unzipped_json_filename):
#             unzip_file(json_filename,output_dir)        
#         else:
#             print('{} already unzipped'.format(unzipped_json_filename))
#         json_filename = unzipped_json_filename
    
#     metadata_table[ds_name]['json_filename'] = json_filename
    
# # ...for each dataset of interest


# #%% List of files we're going to download (for all data sets)

# # Flat list of URLs, for use with direct Python downloads
# urls_to_download = []

# # # For use with azcopy
# downloads_by_dataset = {}

# for ds_name in datasets_of_interest:
    
#     json_filename = metadata_table[ds_name]['json_filename']
#     sas_url = metadata_table[ds_name]['sas_url']
    
#     base_url = sas_url.split('?')[0]    
#     assert not base_url.endswith('/')
    
#     sas_token = sas_url.split('?')[1]
#     assert not sas_token.startswith('?')
    
#     ## Open the metadata file
    
#     with open(json_filename, 'r') as f:
#         data = json.load(f)
    
#     categories = data['categories']
#     for c in categories:
#         c['name'] = c['name'].lower()
#     category_id_to_name = {c['id']:c['name'] for c in categories}
#     annotations = data['annotations']
#     images = data['images']


# #     ## Build a list of image files (relative path names) that match the target species

#     category_ids = []
    
#     for species_name in species_of_interest:
#         matching_categories = list(filter(lambda x: x['name'] == species_name, categories))
#         if len(matching_categories) == 0:
#             continue
#         assert len(matching_categories) == 1
#         category = matching_categories[0]
#         category_id = category['id']
#         category_ids.append(category_id)
    
#     print('Found {} matching categories for data set {}:'.format(len(category_ids),ds_name))
    
#     if len(category_ids) == 0:
#         continue
    
#     for i_category,category_id in enumerate(category_ids):
#         print(category_id_to_name[category_id],end='')
#         if i_category != len(category_ids) -1:
#             print(',',end='')
#     print('')
    
#     # Retrieve all the images that match that category
#     image_ids_of_interest = set([ann['image_id'] for ann in annotations if ann['category_id'] in category_ids])
    
#     print('Selected {} of {} images for dataset {}'.format(len(image_ids_of_interest),len(images),ds_name))
    
#     # Retrieve image file names
#     filenames = [im['file_name'] for im in images if im['id'] in image_ids_of_interest]
#     assert len(filenames) == len(image_ids_of_interest)
    
#     # Convert to URLs
#     for fn in filenames:        
#         url = base_url + '/' + fn
#         urls_to_download.append(url)

#     downloads_by_dataset[ds_name] = {'sas_url':sas_url,'filenames':filenames}
    
# # ...for each dataset

#     print('Found {} images to download'.format(len(urls_to_download)))

#     # Loop over files
#     print('Downloading images for {0} without azcopy'.format(species_of_interest))
    
#     if n_download_threads <= 1:
    
#         for url in tqdm(urls_to_download):        
#             download_relative_filename(url,output_dir,verbose=True)
        
#     else:
    
#         pool = ThreadPool(n_download_threads)        
#         tqdm(pool.imap(lambda s: download_relative_filename(s,output_dir,verbose=False), urls_to_download), total=len(urls_to_download))
    
# print('Done!')

Now that we've collected the data, we can organize and clean it. This will include combining dataframes, designating our target variable, and formatting the data appropriately for modeling.

In [None]:
import json

# JSON is UTF-8 by specification; naming the encoding keeps the load from
# depending on the platform's default text encoding.
with open('species_download_metadata.json', encoding='utf-8') as json_data:
    # Parsed metadata; later cells read its 'images' entry.
    data = json.load(json_data)

In [None]:
# Build a DataFrame from the value stored under the 'images' key of the loaded JSON
# (presumably a list of per-image records -- confirm against the file)
image_data = pd.DataFrame(data['images'])
# Display (rows, columns) as a quick size check
image_data.shape

In [None]:
# Load the metadata table; it is merged with image_data on the `id` column further down
metadata = pd.read_csv('nacti_metadata.csv')

In [None]:
# Category names we want included in our modelling dataset
species_of_interest = [
    'ursus americanus',
    'odocoileus hemionus',
    'canis latrans',
    'puma concolor',
    'alces alces',
    'cervus canadensis',
    'procyon lotor',
    'unidentified chipmunk',
    'tamiasciurus hudsonicus',
    'empty',
]

# Join each metadata row to its image record through the shared `id` column
full_dataset = pd.merge(metadata, image_data, on='id')

# Sanity check: shape of the subset restricted to the species of interest.
# The result is only displayed; full_dataset itself stays unfiltered.
species_mask = full_dataset.name.isin(species_of_interest)
full_dataset[species_mask].shape


In [None]:
# Split species into dangerous - our target variable - and neutral
species_dangerous = ['ursus americanus', 'canis latrans', 'puma concolor', 'alces alces']
species_neutral = ['odocoileus hemionus', 'cervus canadensis', 'procyon lotor',
                   'unidentified chipmunk', 'tamiasciurus hudsonicus']
empty = ['empty']

# Fixed seed so the sampled dataset is identical every time the notebook runs
RANDOM_STATE = 42

# Here we'll rebalance our dataset a bit: every dangerous-animal row is kept
dangerous_data = full_dataset[full_dataset.name.isin(species_dangerous)]

# Take 1/3 of the neutral animal data ~ 50k
# (sample() raises ValueError if fewer than 50,000 neutral rows are available)
neutral_data = full_dataset[full_dataset.name.isin(species_neutral)].sample(
    n=50000, random_state=RANDOM_STATE
)

# Take 1/15 of the empty data ~ 30k
empty_data = full_dataset[full_dataset.name.isin(empty)].sample(
    n=30000, random_state=RANDOM_STATE
)

# Report the size of each group before combining
print(dangerous_data.shape,
      neutral_data.shape,
      empty_data.shape)

# Concat our target variable data, neutral and empty into one dataset,
# then draw the final 50k-row modelling sample from the combined rows
dfs = [dangerous_data, neutral_data, empty_data]
animal_data = pd.concat(dfs).sample(n=50000, random_state=RANDOM_STATE)


In [None]:
#create a binary column as the target variable
def is_dangerous(row):
    """Return 1 when ``row`` is one of the dangerous common names, else 0.

    Despite the parameter name, ``row`` is compared directly against the
    names below, so it is expected to be a single value (the notebook
    applies this function to the ``common_name`` column). Matching is exact
    and case-sensitive; any other value, including missing ones, yields 0.
    """
    dangerous_names = ('moose', 'american black bear', 'coyote', 'cougar')
    return 1 if row in dangerous_names else 0
    
# Label every row by applying is_dangerous to its common name (1 = dangerous, 0 = not).
# NOTE(review): the sampling cell selects dangerous species by scientific `name`,
# while this label is derived from `common_name` -- confirm the two lists correspond.
animal_data['dangerous'] = animal_data.common_name.apply(is_dangerous)


In [None]:
# Fraction of sampled rows labelled dangerous (class-balance check)
int((animal_data.dangerous == 1).sum()) / len(animal_data)


In [None]:
#animal_data.to_csv('smaller_animal_data.csv')

In [None]:
# Replace the shuffled index left by sampling with a fresh 0..n-1 index;
# reset_index() with its defaults keeps the previous index as a column.
animal_data = animal_data.reset_index()


In [None]:
import matplotlib.pyplot as plt
from PIL import Image

# Below we're sampling some of the images to get an understanding of the dataset.
# The lookup is kept as the last statement of the cell so the notebook displays
# the matching rows; with the imports after it, the result was silently discarded.
animal_data[animal_data.filename.str.contains('part3/sub324/CA-09_0003554')]

In [None]:
# reset_index() with its defaults keeps the previous index as a column.
# NOTE(review): animal_data was already reset in an earlier cell, so this adds a
# second index column -- confirm that is intended.
animal_df1 = animal_data.reset_index()
# Look up a single image by file name. regex=False matches the text literally;
# with the default regex matching the '.' before 'jpg' would match any character.
animal_df1[animal_df1.filename.str.contains(
    'part1/sub108/CA-45_11_02_2015_CA-45_0012269.jpg', regex=False)]

In [None]:
# Open one sample image from the unzipped download to inspect it
bear = Image.open('species_photos/nacti-unzipped/part2/sub212/FL-19_08_04_2016_FL-19_0060889.JPG')
# PIL reports size as (width, height) in pixels
bear.size

In [None]:
# Load the dataset that also carries a `pixels` column.
# NOTE(review): this file is not written anywhere in the visible notebook --
# presumably it is produced by a separate step; confirm its origin.
combined = pd.read_csv('combined_with_pixels.csv')
# Preview the first rows
combined.head()

In [None]:
# Keep only the rows whose `pixels` value is present
combined1 = combined.dropna(axis='index', subset=['pixels'])

In [None]:
# Print the column summary (dtypes and non-null counts) of the cleaned frame
combined1.info()

In [None]:
# Proportion of rows labelled dangerous in the cleaned dataset
# (the per-species proportions follow in the next cells)
int((combined1.dangerous == 1).sum()) / len(combined1)

In [None]:
# Share of rows whose common name contains 'bear' (missing names count as no match)
int(combined1.common_name.str.contains('bear', na=False).sum()) / len(combined1)

In [None]:
# Share of rows whose common name contains 'moose' (missing names count as no match)
int(combined1.common_name.str.contains('moose', na=False).sum()) / len(combined1)

In [None]:
# Share of rows whose common name contains 'coyote' (missing names count as no match)
int(combined1.common_name.str.contains('coyote', na=False).sum()) / len(combined1)

In [None]:
# Share of rows whose common name contains 'cougar' (missing names count as no match)
int(combined1.common_name.str.contains('cougar', na=False).sum()) / len(combined1)

In [None]:
#export this to csv so it can be used in our modeling notebook
#combined1.to_csv('final_5000_nonulls.csv')