## Notebook Setup and Required Packages

In [1]:
# Data Handling
import pandas as pd
import numpy as np

# IO - getting files and images from MongoDB and S3
from pymongo import MongoClient
from kaggle_secrets import UserSecretsClient
import requests

from concurrent.futures import ThreadPoolExecutor

from pathlib import Path
from PIL import Image
from io import BytesIO

import os
import re
import shutil
import json
import time

# Install speciesnet and related megadetector libraries
!pip install -Uqq speciesnet megadetector-utils

from IPython.display import display
from IPython.display import JSON

from speciesnet import SpeciesNet
import kagglehub



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m791.3/791.3 kB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m111.4 MB/s[0m eta [3

In [2]:
# Run a quick check to see if the GPU is being used
!python -m speciesnet.scripts.gpu_test

*** Running Torch tests ***

Torch version: 2.6.0+cu124
CUDA available (according to PyTorch): True
CUDA version (according to PyTorch): 12.4
CuDNN version (according to PyTorch): 90100
Found 1 CUDA devices:
0: Tesla P100-PCIE-16GB


In [3]:
# Configuration for Multithreading and Batching
num_batches = 10
max_threads = 8
output_root = Path("output")

# Prepare folders
output_root.mkdir(exist_ok=True)
images_root = Path("images")
images_root.mkdir(exist_ok=True)

## Access The URIs from S3 through MongoDB

In [4]:
# Get the stored mongo uri secret
user_secrets = UserSecretsClient()
mongo_uri = user_secrets.get_secret("MONGO_URI")

# Connect to the MongoDB client
client = MongoClient(mongo_uri)
 
# Access the database and collection
db = client['test']
collection = db['cameratrapmedias'] 
 
# Query the collection to retrieve records with image URLs, metadata, and the first index of 'relativePath'
data = list(collection.aggregate([
    {
        '$project': {
            '_id': 0,
            'publicURL': 1,
            'timestamp': 1,
            'folderName': { '$arrayElemAt': ['$relativePath', 1] },
            'fileName': 1
        }
    },
    # { '$limit': 150 }
]))
 
# Convert the data to a pandas DataFrame for exploration
df = pd.DataFrame(data)

# Export the small array to a CSV file for preview
df.to_csv('ur_test_medias.csv', index=False)

## We are going to create a column that creates a file name to save the image

In [5]:
# This function will format the final string
def make_filename(s):
    # s = s.lower()
    s = re.sub(r'[^\w\s.-]', '', s) # remove special characters except dash or underscore or period
    s = re.sub(r'\s+', '_', s) # replace whitespace with underscore
    return s

# Combine the relative path second (folder name) + fileName
df['imageName'] = df['folderName'] + '--' + df['fileName']
df['imageName'] = df['imageName'].apply(make_filename)

print(df.head())

            timestamp                                          publicURL  \
0 2024-01-27 13:33:15  https://urbanriverrangers.s3.amazonaws.com/ima...   
1 2024-01-24 18:56:50  https://urbanriverrangers.s3.amazonaws.com/ima...   
2 2024-01-24 19:01:54  https://urbanriverrangers.s3.amazonaws.com/ima...   
3 2024-01-24 19:03:05  https://urbanriverrangers.s3.amazonaws.com/ima...   
4 2024-01-24 19:04:19  https://urbanriverrangers.s3.amazonaws.com/ima...   

       fileName                               folderName  \
0  SYFW0060.JPG                   2024-01-30_prologis_02   
1  SYFW0001.JPG  2024-01-30_Learnin_platform_camera_test   
2  SYFW0002.JPG  2024-01-30_Learnin_platform_camera_test   
3  SYFW0004.JPG  2024-01-30_Learnin_platform_camera_test   
4  SYFW0006.JPG  2024-01-30_Learnin_platform_camera_test   

                                           imageName  
0               2024-01-30_prologis_02--SYFW0060.JPG  
1  2024-01-30_Learnin_platform_camera_test--SYFW0...  
2  2024-01-30_Lea

Now that we have a connection to the MongoDB server and access to the URLs, let's use the download images.

## Download Images

In [6]:
%%time
# Create a directory to save the images
output_root.mkdir(exist_ok=True)
path = Path('images')
path.mkdir(exist_ok=True)

# Create a tool for resizing so cropping top and bottom can happen while keeping the aspect ratio
def resize_to_height(image, target_height=256):
    og_width, og_height = image.size
    new_width = int(og_width * (target_height / og_height))
    return image.resize((new_width, target_height))

# Create a tool for downloading and processing images
def process_row(row, dest_folder):
    url = row['publicURL']
    filename = row['imageName']
    # Download the image
    dest = dest_folder/filename

    try:
        # Download image to memory
        response = requests.get(url)
        response.raise_for_status()

        # Open and process the image
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image = resize_to_height(image, target_height=256)
        image.save(dest, format="JPEG", quality=85)
        
    except Exception as e:
        print(f"failed to process{filename}: {e}")

# Download and display some images where at least an animal was found - ex rat
df_test = df[44410:44910] # 500 images with some known animal detections
df_big_chunk = df[0:10000] # first 10000 images

# Process Batches
for batch_idx, df_chunk in enumerate(np.array_split(df_big_chunk, num_batches)): # change to df_test to split a test batch size
    batch_folder = images_root / f'batch_{batch_idx}'
    batch_folder.mkdir(exist_ok=True)

    print(f'Processing batch {batch_idx + 1} / {num_batches} with {len(df_chunk)} images...')

    start = time.time()
    
    with ThreadPoolExecutor(max_workers=max_threads) as executor:
        executor.map(lambda row: process_row(row, batch_folder), [row for _, row in df_chunk.iterrows()])

    end = time.time()
    print(f"Batch {batch_idx+1} took {end - start:.2f} seconds.")
        
print(f'{len(df_test)} Images Downloaded and Resized')

Processing batch 1 / 10 with 1000 images...


  return bound(*args, **kwds)


Batch 1 took 147.00 seconds.
Processing batch 2 / 10 with 1000 images...
Batch 2 took 138.80 seconds.
Processing batch 3 / 10 with 1000 images...
Batch 3 took 98.17 seconds.
Processing batch 4 / 10 with 1000 images...
Batch 4 took 95.22 seconds.
Processing batch 5 / 10 with 1000 images...
Batch 5 took 101.32 seconds.
Processing batch 6 / 10 with 1000 images...
Batch 6 took 107.54 seconds.
Processing batch 7 / 10 with 1000 images...
Batch 7 took 105.00 seconds.
Processing batch 8 / 10 with 1000 images...
Batch 8 took 93.59 seconds.
Processing batch 9 / 10 with 1000 images...
Batch 9 took 103.89 seconds.
Processing batch 10 / 10 with 1000 images...
Batch 10 took 104.37 seconds.
500 Images Downloaded and Resized
CPU times: user 39min 14s, sys: 4min 30s, total: 43min 44s
Wall time: 18min 14s


In [7]:
# Uncomment and run this if the images need to be redone
# !rm images -r
# !rm docs.zip
%lsmagic

Available line magics:
%alias  %alias_magic  %autoawait  %autocall  %automagic  %autosave  %bookmark  %cat  %cd  %clear  %colors  %conda  %config  %connect_info  %cp  %debug  %dhist  %dirs  %doctest_mode  %ed  %edit  %env  %gui  %hist  %history  %killbgscripts  %ldir  %less  %lf  %lk  %ll  %load  %load_ext  %loadpy  %logoff  %logon  %logstart  %logstate  %logstop  %ls  %lsmagic  %lx  %macro  %magic  %man  %matplotlib  %mkdir  %more  %mv  %notebook  %page  %pastebin  %pdb  %pdef  %pdoc  %pfile  %pinfo  %pinfo2  %pip  %popd  %pprint  %precision  %prun  %psearch  %psource  %pushd  %pwd  %pycat  %pylab  %qtconsole  %quickref  %recall  %rehashx  %reload_ext  %rep  %rerun  %reset  %reset_selective  %rm  %rmdir  %run  %save  %sc  %set_env  %store  %sx  %system  %tb  %time  %timeit  %unalias  %unload_ext  %who  %who_ls  %whos  %xdel  %xmode

Available cell magics:
%%!  %%HTML  %%SVG  %%bash  %%capture  %%debug  %%file  %%html  %%javascript  %%js  %%latex  %%markdown  %%perl  %%prun  %%pypy  %%

## Running Species Net on the Full Dataset
Now that we have the max number of images downloaded (19.5GB) let's run speciesnet

Note there might be a better way of doing this using bytes downloaded from s3 - but I haven't figured that part out yet.

### We're going to try a multithreading chunks approach

In [8]:
def print_predictions(predictions_dict: dict) -> None:
    print("Predictions:")
    for prediction in predictions_dict["predictions"][0:1]:
        print(prediction["filepath"], "=>", prediction["prediction"])

### Download Model

In [9]:
# Choose the folder we're going to download the model to
model_path = '/content/models'
os.makedirs(model_path, exist_ok=True)

# Download the model (it will go to a folder like /kaggle/input/...)
download_path = kagglehub.model_download('google/speciesnet/PyTorch/v4.0.1a',
                                          force_download=True)

print('Model downloaded to temporary folder: {}'.format(download_path))

# List the contents of the downloaded directory to identify the actual files/subdirectories
model_files = os.listdir(download_path)

# Copy the contents of the model file to our destination folder
for item_name in model_files:
    source_path = os.path.join(download_path, item_name)
    destination_path = os.path.join(model_path, item_name)
    if os.path.isfile(source_path):
        shutil.copy2(source_path, destination_path)
    elif os.path.isdir(source_path):
        shutil.copytree(source_path, destination_path, dirs_exist_ok=True)

print('{} files copied to: {}'.format(len(model_files),model_path))

Model downloaded to temporary folder: /kaggle/input/speciesnet/pytorch/v4.0.1a/1
6 files copied to: /content/models


In [10]:
# Pick the model we want to use (4.0.1a)
model = SpeciesNet(model_path)

print('Model Loaded')

Model Loaded


In [11]:
# Let's format a request string as a list of dicts (aka JSON string format)
def create_instances(batch_folder):
    image_paths = [f'{batch_folder}/{f}' for f in os.listdir(batch_folder) if f.lower().endswith('.jpg')]

    instances = []
    for image_path in image_paths:
        instances.append({
            'filepath': image_path,
            'latitude': 41.906782,
            'longitude': -87.651927
        })

    # Check that it's saved correctly by verifying the first
    print(instances[0])

    return instances


for batch_index in range(len(os.listdir(images_root))):
    instances = create_instances(f'{images_root}/batch_{batch_index}')

    # make the predictions and get a sense of how long it would take
    %time predictions_dict = model.predict(instances_dict={"instances": instances})

    print_predictions(predictions_dict) # show the first prediction of each batch

    # Save the dict to the batch folder
    with open(f'{images_root}/batch_{batch_index}/predictions_dict_{batch_index}.json', 'w') as f:
        json.dump(predictions_dict, f, indent=2)

    print(f'predictions_dict_{batch_index}.json saved to {images_root}/batch_{batch_index}')

{'filepath': 'images/batch_0/2024-02-01_16-41-42--SYEW0598.JPG', 'latitude': 41.906782, 'longitude': -87.651927}
CPU times: user 3min 51s, sys: 2 s, total: 3min 53s
Wall time: 1min 53s
Predictions:
images/batch_0/2024-02-01_16-41-42--SYEW0598.JPG => f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank
predictions_dict_0.json saved to images/batch_0
{'filepath': 'images/batch_1/2024-02-01_Bubbly_003--SYFW0254.JPG', 'latitude': 41.906782, 'longitude': -87.651927}
CPU times: user 3min 56s, sys: 2.46 s, total: 3min 58s
Wall time: 1min 56s
Predictions:
images/batch_1/2024-02-01_Bubbly_003--SYFW0254.JPG => f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank
predictions_dict_1.json saved to images/batch_1
{'filepath': 'images/batch_2/2024-02-01_Bubbly_spypoint_garden--HDPH0710.JPG', 'latitude': 41.906782, 'longitude': -87.651927}
CPU times: user 3min 15s, sys: 1.73 s, total: 3min 17s
Wall time: 1min 36s
Predictions:
images/batch_2/2024-02-01_Bubbly_spypoint_garden--HDPH0710.JPG => f1856211-cfb7-4a5b-9

In [12]:
# !rm images/batch_*/predictions*

## Let's save the predictions dict json file

In [13]:
# Create a docs folder for previewing the images
output_path = '/kaggle/working/output/docs'
os.makedirs(output_path, exist_ok=True)

!python -m megadetector.postprocessing.postprocess_batch_results /kaggle/working/output/predictions_dict_master.json /kaggle/working/output/docs

Loading results from /kaggle/working/output/predictions_dict_master.json
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/megadetector/postprocessing/postprocess_batch_results.py", line 2058, in <module>
    main()
  File "/usr/local/lib/python3.11/dist-packages/megadetector/postprocessing/postprocess_batch_results.py", line 2052, in main
    ppresults = process_batch_results(options)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/megadetector/postprocessing/postprocess_batch_results.py", line 1026, in process_batch_results
    detections_df, other_fields = load_api_results(
                                  ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/megadetector/postprocessing/load_api_results.py", line 54, in load_api_results
    detection_results = load_md_or_spec

## Let's zip the folder so we can easily download it

In [14]:
shutil.make_archive('/kaggle/working/output/docs', 'zip', '/kaggle/working/output/docs')

# and finally clean up the tree that made it this far
shutil.rmtree('/kaggle/working/output/docs')  # Deletes the folder
