In [1]:
# Make sure we've got the latest fastbook version
! pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.8/719.8 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.1/124.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m0:00:01[0m0:01[0mm
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m30.4

In [None]:
# Connect to MongoDB and retrieve image URLs and metadata
from pymongo import MongoClient
import pandas as pd
import os

# Read the MongoDB connection string from the Kaggle secret labelled "MONGO_URI"
# so the credential itself never appears in the notebook source.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
mongo_uri = user_secrets.get_secret("MONGO_URI")

In [9]:
# Open a client for the MongoDB deployment referenced by the stored URI
client = MongoClient(mongo_uri)

# Select the database, then the collection holding the camera-trap media records
db = client['test']
collection = db['cameratrapmedias']

# Fields returned for every record. 'folderName' is taken from the element at
# index 1 (i.e. the second entry) of the 'relativePath' array.
projected_fields = {
    '_id': 0,
    'publicURL': 1,
    'timestamp': 1,
    'folderName': {'$arrayElemAt': ['$relativePath', 1]},
    'fileName': 1,
}

# Single-stage pipeline; append a {'$limit': N} stage here to sample fewer
# records while testing.
pipeline = [{'$project': projected_fields}]

# Run the aggregation and pull the whole cursor into memory
data = list(collection.aggregate(pipeline))

# Load the records into a pandas DataFrame for exploration
df = pd.DataFrame(data)

# Write the retrieved records to a CSV file for preview
df.to_csv('ur_test_medias.csv', index=False)

# Show the first few rows of the DataFrame
print(df.head())

            timestamp  \
0 2024-01-27 13:33:15   
1 2024-01-24 18:56:50   
2 2024-01-24 19:01:54   
3 2024-01-24 19:03:05   
4 2024-01-24 19:04:19   

                                                                                                                   publicURL  \
0                   https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_prologis_02/DCIM/100MEDIA/SYFW0060.JPG   
1  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0001.JPG   
2  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0002.JPG   
3  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0004.JPG   
4  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0006.JPG   

       fileName                               folderName  
0  SYFW0060.JPG      

In [21]:
# We are going to create a column that creates a file name to save the image
import re

# This function will format the final string
def make_filename(s):
    """Turn a string into a name that is safe to use as a file name.

    Every character that is not a word character (letter, digit, underscore),
    whitespace, a period or a dash is dropped; each remaining run of
    whitespace is then replaced by a single underscore. Case is preserved.
    """
    without_specials = re.sub(r'[^\w\s.-]', '', s)
    return re.sub(r'\s+', '_', without_specials)

# Build the on-disk name as "<folderName>--<fileName>" (folderName is the second
# entry of the relative path), then sanitise it with make_filename
combined_names = df['folderName'] + '--' + df['fileName']
df['imageName'] = combined_names.apply(make_filename)

print(df.head())

            timestamp  \
0 2024-01-27 13:33:15   
1 2024-01-24 18:56:50   
2 2024-01-24 19:01:54   
3 2024-01-24 19:03:05   
4 2024-01-24 19:04:19   

                                                                                                                   publicURL  \
0                   https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_prologis_02/DCIM/100MEDIA/SYFW0060.JPG   
1  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0001.JPG   
2  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0002.JPG   
3  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0004.JPG   
4  https://urbanriverrangers.s3.amazonaws.com/images/2024/2024-01-30_Learnin_platform_camera_test/DCIM/100MEDIA/SYFW0006.JPG   

       fileName                               folderName  \
0  SYFW0060.JPG     

Now that we have a connection to the MongoDB server and access to the URLs, let's use the `fastbook` library to simplify image handling and prepare for model inference.

In [44]:
# Download the images referenced in the publicURL column into a local folder
from fastbook import *
from fastai.vision.widgets import *

# Create a directory to save the images
path = Path('images')
path.mkdir(exist_ok=True)

# Small slice for test runs: rows where at least one animal was found (e.g. a rat)
df_test = df[44410:44420]

# Flip to True to download only the 10-image test slice instead of the full dataset
USE_TEST_SUBSET = False
df_download = df_test if USE_TEST_SUBSET else df

failed_downloads = []  # (url, exception) pairs, reported after the loop
for _, row in df_download.iterrows():
    url = row['publicURL']
    filename = row['imageName']
    dest = path/filename
    # Re-running the cell must not fetch files that are already on disk;
    # delete the images folder to force a full refresh.
    if dest.exists():
        continue
    try:
        # No need to show a download bar for each file
        download_url(f'{url}?width=480&height=480', dest, show_progress=False)
    except Exception as err:
        # One bad URL should not abort the whole run: remember it, drop any
        # partially written file so a re-run retries it, and carry on.
        failed_downloads.append((url, err))
        dest.unlink(missing_ok=True)

if failed_downloads:
    print(f'{len(failed_downloads)} of {len(df_download)} downloads failed; first failure: {failed_downloads[0]}')


In [43]:
# Uncomment and run this if the images need to be redone
# !rm images -r

## Running SpeciesNet on the Full Dataset
Now that we have downloaded the maximum number of images (19.5 GB), let's run SpeciesNet.

Note: there might be a better way of doing this by working directly with the bytes downloaded from S3, but I haven't figured that part out yet.


In [23]:
! pip install -Uqq speciesnet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.3/43.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.7/93.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m956.3/956.3 kB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [40]:
# Run a quick check to see if the GPU is being used: this launches the
# speciesnet.scripts.gpu_test module in a separate Python process and shows its report
!python -m speciesnet.scripts.gpu_test

*** Running Torch tests ***

Torch version: 2.6.0+cu124
CUDA available (according to PyTorch): True
CUDA version (according to PyTorch): 12.4
CuDNN version (according to PyTorch): 90100
Found 1 CUDA devices:
0: Tesla P100-PCIE-16GB


In [24]:
from IPython.display import display
from IPython.display import JSON

from speciesnet import DEFAULT_MODEL
from speciesnet import draw_bboxes
from speciesnet import load_rgb_image
from speciesnet import SpeciesNet
from speciesnet import SUPPORTED_MODELS

In [53]:
def print_predictions(predictions_dict: dict, limit: "int | None" = 5) -> None:
    """Print the file path and final prediction of the first few results.

    Args:
        predictions_dict: Mapping with a "predictions" key holding a list of
            per-image dicts; each entry must provide "filepath" and
            "prediction".
        limit: Maximum number of entries to print. Defaults to 5, which is
            the previously hard-coded value; pass ``None`` to print them all.
    """
    print("Predictions:")
    for prediction in predictions_dict["predictions"][:limit]:
        print(prediction["filepath"], "=>", prediction["prediction"])

### Download Model

In [26]:
import kagglehub
import os
import shutil

# Local folder that will hold the model files
model_path = '/content/models'
os.makedirs(model_path, exist_ok=True)

# Fetch the model through kagglehub (on Kaggle it lands in a folder like /kaggle/input/...)
download_path = kagglehub.model_download(
    'google/speciesnet/PyTorch/v4.0.1a',
    force_download=True,
)

print('Model downloaded to temporary folder: {}'.format(download_path))

# Top-level entries of the download folder (files and sub-directories alike)
model_files = os.listdir(download_path)

# Mirror each entry into model_path: directories are copied recursively
# (merging into an existing target), plain files are copied with their metadata.
for entry in model_files:
    src = os.path.join(download_path, entry)
    dst = os.path.join(model_path, entry)
    if os.path.isdir(src):
        shutil.copytree(src, dst, dirs_exist_ok=True)
    elif os.path.isfile(src):
        shutil.copy2(src, dst)

print('{} files copied to: {}'.format(len(model_files),model_path))

Model downloaded to temporary folder: /kaggle/input/speciesnet/pytorch/v4.0.1a/1
6 files copied to: /content/models


In [27]:
# Instantiate SpeciesNet from the local folder the downloaded model files
# (v4.0.1a) were copied into
model = SpeciesNet(model_path)

In [38]:
# Build the request payload: a list with one dict per downloaded .jpg image
# (the same shape as a JSON array of objects)
jpg_names = [name for name in os.listdir(path) if name.lower().endswith('.jpg')]
image_paths = [f'{path}/{name}' for name in jpg_names]

# Every image gets the same fixed latitude/longitude
instances = [
    {'filepath': image_path, 'latitude': 41.906782, 'longitude': -87.651927}
    for image_path in image_paths
]

# Sanity check: show the first entry
print(instances[0])

{'filepath': 'images/2024-05-25_WM_Boardwalk_G--SYFW1871.JPG', 'latitude': 41.906782, 'longitude': -87.651927}


In [None]:
# Run the model over every instance; the request wraps the list under "instances"
request = {"instances": instances}
predictions_dict = model.predict(instances_dict=request)

In [56]:
# Spot-check: print only the first 5 predictions (path => prediction)
print_predictions(predictions_dict)
# Uncomment to render the complete result with IPython's JSON display instead
# display(JSON(predictions_dict))

Predictions:
images/2024-05-25_WM_Boardwalk_G--SYFW1871.JPG => b1352069-a39c-4a84-a949-60044271c0c1;aves;;;;;bird
images/2024-05-25_WM_Boardwalk_G--SYFW1862.JPG => f2d233e3-80e3-433d-9687-e29ecc7a467a;mammalia;;;;;mammal
images/2024-05-25_WM_Boardwalk_G--SYFW1864.JPG => 1f689929-883d-4dae-958c-3d57ab5b6c16;;;;;;animal
images/2024-05-25_WM_Boardwalk_G--SYFW1870.JPG => f1856211-cfb7-4a5b-9158-c0f72fd09ee6;;;;;;blank
images/2024-05-25_WM_Boardwalk_G--SYFW1860.JPG => 1f689929-883d-4dae-958c-3d57ab5b6c16;;;;;;animal


<IPython.core.display.JSON object>

In [67]:
# Persist the predictions inside the images folder as indented JSON
import json

serialized = json.dumps(predictions_dict, indent=2)
with open('images/predictions_dict.json','w') as f:
    f.write(serialized)

In [68]:
# Using the megadectector visualization utils
!pip install megadetector-utils -Uqq

In [72]:
# Create a docs folder for previewing the images
output_path = '/kaggle/working/docs'
os.makedirs(output_path, exist_ok=True)

!python -m megadetector.postprocessing.postprocess_batch_results /kaggle/working/images/predictions_dict.json docs

Loading results from /kaggle/working/images/predictions_dict.json
This appears to be a SpeciesNet output file, converting to MD format
Writing temporary results to /tmp/megadetector_temp_files/e71cfa24-342a-11f0-922c-0242ac130202.json
Converting results to dataframe
Finished loading MegaDetector results for 10 images from /kaggle/working/images/predictions_dict.json
Choosing default confidence threshold of 0.2 based on MD version
Assigning images to rendering categories
100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 9976.94it/s]
Finished loading and preprocessing 10 rows from detector output, predicted 9 positives.
100%|███████████████████████████████████████████| 10/10 [00:04<00:00,  2.37it/s]
Rendered 10 images (of 10) in 4.22 seconds (0.42 seconds per image)
Generating classification category report
This appears to be a SpeciesNet output file, converting to MD format
Writing temporary results to /tmp/megadetector_temp_files/e9a23a7a-342a-11f0-922c-0242ac130202.j

In [74]:
# Bundle the docs folder into a zip archive next to it so it can be downloaded in one go
import shutil

docs_dir = '/kaggle/working/docs'
shutil.make_archive(docs_dir, 'zip', docs_dir)

# The unzipped folder is no longer needed once the archive exists, so delete it
shutil.rmtree(docs_dir)
