# Example Notebook: Running 100 Images Through MegaDetector Only
https://pypi.org/project/megadetector/  

Running MegaDetector on its own is not any faster than running the full SpeciesNet ensemble.  
This notebook is a proof of concept for the question: "what if we ran MegaDetector (MD) only?"

In [1]:
# Silence Python warnings so the cell outputs further down stay readable
import warnings

warnings.simplefilter('ignore')
print("=== Warnings Ignored ===")



In [2]:
%%time
# Single dependency library
!pip install --quiet megadetector > /dev/null 2>&1 # Put thy annoyances elsewhere
print("=== MegaDetector Installed ===")

=== MegaDetector Installed ===
CPU times: user 1.37 s, sys: 396 ms, total: 1.76 s
Wall time: 2min 6s


In [3]:
# Data Handling
import pandas as pd
import numpy as np

# IO - getting files and images from MongoDB and S3
from pymongo import MongoClient
from kaggle_secrets import UserSecretsClient
from pathlib import Path
from tqdm import tqdm

import os
import re
import shutil
import json
import time
import requests

# Image preprocessing and multithreading
import cv2
from concurrent.futures import ThreadPoolExecutor, as_completed

# Megadetector
from megadetector.utils import url_utils, path_utils
from megadetector.visualization import visualization_utils as vis_utils
from megadetector.detection.run_detector_batch import load_and_run_detector_batch, write_results_to_file

# Pretty Views
from IPython.display import display
from IPython.display import JSON

print("=== Libraries Loaded ===")

=== Libraries Loaded ===


In [4]:
# Get the stored mongo uri secret
user_secrets = UserSecretsClient()
mongo_uri = user_secrets.get_secret("MONGO_URI")

# Connect to the MongoDB client
client = MongoClient(mongo_uri)

# Access the database and collection
db = client['test']
collection = db['cameratrapmedias']

# Pull only documents that carry both fields used downstream: publicURL is
# what gets downloaded and mediaID becomes the output filename.
cursor = collection.find(
    {'publicURL': {'$exists': True}, 'mediaID': {'$exists': True}},
    {'_id': 0, 'publicURL': 1, 'mediaID': 1},
).limit(100)

# Naming the columns explicitly keeps the frame well-formed even when the
# query returns nothing (otherwise drop_duplicates raises KeyError on 'mediaID').
df = pd.DataFrame(list(cursor), columns=['publicURL', 'mediaID'])

# A field can exist and still be null; such rows cannot be downloaded or named.
df = df.dropna(subset=['publicURL', 'mediaID'])

# deduplicate the publicURLs by using the mediaID
df = df.drop_duplicates(subset=['mediaID'], keep='first')

# Return to rows (one dict per image: {'publicURL': ..., 'mediaID': ...})
rows = df.to_dict(orient='records')
print(f"Loaded {len(rows)} image URLs")

Loaded 100 image URLs


In [5]:
%%time
# Height, in pixels, that every downloaded image is resized to
TARGET_HEIGHT = 256

# Resized images are all saved into an 'images' folder in the working directory
image_dir = Path('images')
image_dir.mkdir(exist_ok=True, parents=True)

# Resize Target
def resize_keep_aspect_ratio(image, target_height):
    """Scale ``image`` to ``target_height`` rows while preserving its aspect ratio.

    Args:
        image: Array whose first two dimensions are (height, width); only
            ``image.shape[:2]`` is read here before handing it to ``cv2.resize``.
        target_height: Height of the output, in pixels.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    h, w = image.shape[:2]

    # int() truncates, so a very tall and narrow image could yield a width of 0,
    # which cv2.resize rejects; clamp to at least one pixel.
    new_width = max(1, int(w * (target_height / h)))

    # OpenCV recommends INTER_AREA when shrinking; otherwise keep cv2.resize's
    # default (INTER_LINEAR) so enlarging behaves exactly as before.
    interpolation = cv2.INTER_AREA if target_height < h else cv2.INTER_LINEAR

    resized = cv2.resize(image, (new_width, target_height), interpolation=interpolation)
    return resized

# --- Worker Function ---

def download_resize_save(row):
    """Download one image, resize it to ``TARGET_HEIGHT`` and save it as a JPEG.

    The output file is ``image_dir / "<mediaID>.jpg"``. If that file already
    exists the download is skipped.

    Args:
        row: Mapping with a 'publicURL' key (where to download from) and a
            'mediaID' key (used as the output file stem).

    Returns:
        A ``(url, success, message)`` tuple. Errors raised while downloading,
        decoding, resizing or writing are caught, printed and reported as a
        failed tuple rather than propagated.
    """
    url = row['publicURL']
    filename = f"{row['mediaID']}.jpg"
    out_path = image_dir / filename

    if out_path.exists():
        return (url, True, "Already exists")

    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()

        img_array = np.frombuffer(resp.content, np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)

        # imdecode reports an undecodable payload by returning None
        if img is None:
            return (url, False, "Failed to decode")

        # Resize
        img = resize_keep_aspect_ratio(img, TARGET_HEIGHT)

        # imwrite reports failure through its return value, not an exception;
        # without this check a failed write would be counted as a success.
        if not cv2.imwrite(str(out_path), img):
            # Remove any partial file so a later run does not see "Already exists"
            if out_path.exists():
                out_path.unlink()
            return (url, False, "Failed to write")

        return (url, True, "Success")

    except Exception as e:
        # Best-effort worker: one bad image must not abort the whole batch
        print(f"❌ Error processing {url}: {e}")
        return (url, False, str(e))

# --- Run Parallel ---

# Thread-pool size. Tune based on Kaggle resources, maybe 8-16
MAX_WORKERS = 16

results = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # One download task per row, keyed by its future
    futures = {executor.submit(download_resize_save, row): row for row in rows}
    # Gather outcomes in completion order while tqdm shows progress
    results.extend(
        done.result() for done in tqdm(as_completed(futures), total=len(futures))
    )

# --- Summary ---

# Each outcome is a (url, success, message) tuple; index 1 is the success flag
success_count = len([outcome for outcome in results if outcome[1]])
fail_count = len(results) - success_count

print(f"✅ Success: {success_count}")
print(f"❌ Failed: {fail_count}")

100%|██████████| 100/100 [00:08<00:00, 11.51it/s]

✅ Success: 100
❌ Failed: 0
CPU times: user 16.1 s, sys: 7.05 s, total: 23.2 s
Wall time: 8.73 s





In [6]:
# Run MegaDetector's batch detector as a command-line module rather than through the Python API.
# WANDB_MODE is set to "disabled" before launching the command
# (presumably to keep Weights & Biases logging off in the child process; confirm).
os.environ["WANDB_MODE"] = "disabled"

# The three quoted positional arguments appear to be, in order: the model
# weights file, the input image directory, and the output JSON file.
# NOTE(review): the input directory is an absolute path, while the download cell
# saves to the relative path 'images'; they only match when the notebook's
# working directory is /kaggle/working.
# NOTE(review): --checkpoint_frequency 10000 is far above the 100 images used
# here, so presumably no intermediate checkpoint is written; confirm.
# The two commented-out flags after the command are options for reusing earlier
# results or resuming from the checkpoint file.
!python -m megadetector.detection.run_detector_batch \
    "/kaggle/input/megadetector/pytorch/mdv5a/1/md_v5a.0.0.pt" \
    "/kaggle/working/images" \
    "/kaggle/working/test_output.json" \
    --output_relative_filenames \
    --threshold 0.1 \
    --checkpoint_frequency 10000 \
    --checkpoint_path "/kaggle/working/md_checkpoint.json" \
    --quiet
    # --previous_results_file "/kaggle/working/test_output.json"
    # --resume_from_checkpoint /kaggle/working/md_checkpoint.json  # Uncomment if resuming

100 image files found in the input directory
The checkpoint file will be written to /kaggle/working/md_checkpoint.json
PyTorch reports 1 available CUDA devices
GPU available: True
2025-06-23 15:31:17.250557: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750692677.445662     126 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750692677.499418     126 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading PT detector with compatibility mode classic
Fusing layers... 
Model summary: 733 layers, 140054656 parameters, 0 gradients, 208.8 GFLOPs
Loaded model in 26.84 seconds
100%|█████████████████████████████████████████| 100/100 [00:10<00:00,  