##### Setup

In [1]:
import pandas as pd
import requests
from tqdm import tqdm
import time

##### Preview IUCN data

In [2]:
# Load cleaned IUCN data
species_list = pd.read_parquet("../clean/cleaned_iucn_mammals.parquet", columns = ["sci_name"])
display(species_list.info())
species_list = sorted(species_list["sci_name"].dropna().unique())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13178 entries, 0 to 13177
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sci_name  13178 non-null  object
dtypes: object(1)
memory usage: 103.1+ KB


None

In [4]:
print(len(species_list))
print(species_list)

5928
['Abditomys Latidens', 'Abeomelomys Sevia', 'Abrawayaomys Chebezi', 'Abrawayaomys Ruschii', 'Abrocoma Bennettii', 'Abrocoma Boliviensis', 'Abrocoma Budini', 'Abrocoma Cinerea', 'Abrocoma Famatina', 'Abrocoma Shistacea']


##### Get Batches

In [3]:
import os
import json
import random

# Constants
API_ENDPOINT = "https://api.inaturalist.org/v1/observations"
BATCH_SIZE = 1000
RETRY_WAIT = 60  # seconds to wait on 429
GLOBAL_THROTTLE_INTERVAL = 1  # entries
GLOBAL_THROTTLE_WAIT = 0.75  # seconds
OUTPUT_DIR = "../clean/inat_batches/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

species_list_path = "../clean/cleaned_iucn_mammals.parquet"
species_df = pd.read_parquet(species_list_path, columns=["sci_name"])
species_list = sorted(species_df["sci_name"].str.capitalize().dropna().unique())

def fetch_species_data(species):
    try:
        response = requests.get(API_ENDPOINT, params={
            "taxon_name": species,
            "iconic_taxa[]": "Mammalia",
            "per_page": 1,
            "has[]": "photos"
        })

        # Rate limited
        if response.status_code == 429:
            return "RATE_LIMIT"

        response.raise_for_status()
        data = response.json()

        results = data.get("results", [])
        if not results:
            return "MISSING_SPECIES"

        photos = results[0].get("photos", [])
        if not photos:
            return "MISSING_PHOTO"
        else:
            return photos[0].get("url", None)

    except requests.exceptions.RequestException:
        return "ERROR"
    except json.decoder.JSONDecodeError:
        return "JSON_ERROR"

def process_species_batch(start_idx=0, end_idx=BATCH_SIZE, batch_id=0):
    batch_species = species_list[start_idx:end_idx]
    missing_species = []
    species_missing_photos = []
    species_with_photos = []

    for i, species in enumerate(tqdm(batch_species, desc=f"Batch {batch_id}")):
        # Global throttle
        if i % GLOBAL_THROTTLE_INTERVAL == 0 and i != 0:
            time.sleep(GLOBAL_THROTTLE_WAIT + random.uniform(0, 0.25))

        result = fetch_species_data(species)

        # Retry on rate limit
        retries = 0
        while result == "RATE_LIMIT" and retries < 3:
            print(f"Rate limited. Sleeping {RETRY_WAIT}s before retrying...")
            time.sleep(RETRY_WAIT)
            result = fetch_species_data(species)
            retries += 1

        if result == "MISSING_SPECIES":
            missing_species.append(species)
        elif result == "MISSING_PHOTO":
            species_missing_photos.append(species)
        elif result and isinstance(result, str) and result.startswith("http"):
            species_with_photos.append((species, result))
        else:
            missing_species.append(species)

    # Save batch results
    pd.DataFrame(missing_species, columns=["sci_name"]).to_csv(f"{OUTPUT_DIR}missing_species_batch{batch_id}.csv", index=False)
    pd.DataFrame(species_missing_photos, columns=["sci_name"]).to_csv(f"{OUTPUT_DIR}missing_photos_batch{batch_id}.csv", index=False)
    pd.DataFrame(species_with_photos, columns=["sci_name", "photo_url"]).to_csv(f"{OUTPUT_DIR}species_with_photos_batch{batch_id}.csv", index=False)

    return f"Batch {batch_id} complete."

# Utility to process all batches if needed
def run_all_batches():
    total_batches = len(species_list) // BATCH_SIZE + (len(species_list) % BATCH_SIZE > 0)
    for b in range(total_batches):
        start = b * BATCH_SIZE
        end = min((b + 1) * BATCH_SIZE, len(species_list))
        process_species_batch(start, end, b)

# Utility to merge batch outputs
def merge_all_batches():
    from glob import glob

    def merge_csvs(pattern, output_name):
        files = sorted(glob(f"{OUTPUT_DIR}{pattern}"))
        dfs = [pd.read_csv(f) for f in files]
        final_df = pd.concat(dfs, ignore_index=True)
        final_df.drop_duplicates(inplace=True)
        final_df.to_csv(f"{OUTPUT_DIR}{output_name}.csv", index=False)
        return final_df

    df1 = merge_csvs("missing_species_batch*.csv", "all_missing_species")
    df2 = merge_csvs("missing_photos_batch*.csv", "all_missing_photos")
    df3 = merge_csvs("species_with_photos_batch*.csv", "all_species_with_photos")

    return df1, df2, df3


In [None]:
run_all_batches()

In [4]:
process_species_batch(start_idx=3000, end_idx=4000, batch_id=3)

Batch 3: 100%|██████████████████████████████| 1000/1000 [22:32<00:00,  1.35s/it]


'Batch 3 complete.'

In [5]:
process_species_batch(start_idx=4000, end_idx=5000, batch_id=4)

Batch 4: 100%|██████████████████████████████| 1000/1000 [22:08<00:00,  1.33s/it]


'Batch 4 complete.'

In [6]:
process_species_batch(start_idx=5000, end_idx=6000, batch_id=5)

Batch 5: 100%|████████████████████████████████| 928/928 [19:33<00:00,  1.26s/it]


'Batch 5 complete.'

##### Inspect results

In [7]:
df_missing_species, df_missing_photos, df_with_photos = merge_all_batches()

In [8]:
df_missing_species.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1640 entries, 0 to 1639
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sci_name  1640 non-null   object
dtypes: object(1)
memory usage: 12.9+ KB


In [9]:
df_missing_photos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sci_name  0 non-null      object
dtypes: object(1)
memory usage: 132.0+ bytes


In [10]:
df_with_photos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4288 entries, 0 to 4287
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sci_name   4288 non-null   object
 1   photo_url  4288 non-null   object
dtypes: object(2)
memory usage: 67.1+ KB


In [20]:
print(df_missing_species.shape[0]/len(species_list))
print(df_missing_species.shape[0] + df_with_photos.shape[0])

0.2766531713900135
5928


In [22]:
display(pd.DataFrame({"Category":["missing species", "species with no photos", "species with photos", "Total"],
                      "Counts" : [df_missing_species.shape[0], df_missing_photos.shape[0], df_with_photos.shape[0], len(species_list)]
                     }))
print(f"{df_missing_species.shape[0]/len(species_list)}")

Unnamed: 0,Category,Counts
0,missing species,1640
1,species with no photos,0
2,species with photos,4288
3,Total,5928


0.2766531713900135


##### Preview found images

In [13]:
from IPython.display import HTML

def render_images(df, n=5):
    html = ""
    for _, row in df.sample(n).iterrows():
        species = row[0]
        url = row[1]
        html += f"<p><b>{species}</b><br><img src='{url}' width='200'></p>"
    return HTML(html)

# Example
render_images(pd.read_csv("../clean/inat_batches/all_species_with_photos.csv"))


  species = row[0]
  url = row[1]
