# Introduction
As of 2025-06-12, the volunteers at rangers.urbanrivers have added 59,351 observations.


In [None]:
# Data Handling
import pandas as pd

# IO - getting files and images
from kaggle_secrets import UserSecretsClient
import requests
import json
import os
import urllib.parse


In [None]:
%%time
# Scientific names of the species whose observations will be requested.
species_list = [
    "Canis latrans",
    "Canis familiaris",
    "Felis catus",
    "Castor canadensis",
    "Ondatra zibethicus",
    "Sylvilagus floridanus",
    "Sciurus carolinensis",
    "Procyon lotor",
    "Lontra canadensis",
    "Didelphis virginiana",
    "Anas platyrhynchos",
    "Branta canadensis",
    "Trachemys scripta elegans",
    "Chelydra serpentina",
    "Chrysemys picta",
    "Apalone spinifera",
    "Columba livia",
    "Sturnus vulgaris",
    "Agelaius phoeniceus",
    "Passer domesticus",
    "Turdus migratorius",
    "Corvus brachyrhynchos",
    "Ardea herodias",
    "Nycticorax nycticorax",
    "Astur cooperii",
    "Actitis macularius",
    "Aix sponsa",
    "Ardea alba",
    "Cardinalis cardinalis",
    "Cyprinus carpio",
]

# Percent-encode each name (a space becomes %20) and join the results with
# literal commas so the whole list travels as one query-string value.
species_encoded = ",".join(map(urllib.parse.quote, species_list))

# Get URLs for organising download links.
# The observations endpoint is read from the Kaggle secret named "OBS_BASE"
# rather than being hard-coded in the notebook; fetch_all_obs() appends the
# query string to this value.
user_secrets = UserSecretsClient()
obs_url = user_secrets.get_secret("OBS_BASE")

def fetch_all_obs(batch_size=1000, timeout=60):
    """Download every page of image observations for the configured species.

    Pages are requested from the module-level ``obs_url`` using the
    module-level ``species_encoded`` filter, starting at page 1, until a page
    comes back with no ``"images"`` entries or a request returns a non-200
    status code.

    Args:
        batch_size: Number of records requested per page (sent as ``limit``).
        timeout: Seconds to wait on each request before ``requests`` raises.
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        list: The ``"images"`` entries of every successfully fetched page, in
        page order. If a page returns a non-200 status, the failure is
        printed and the entries collected so far are returned.

    Raises:
        requests.exceptions.RequestException: On connection errors or when a
            request exceeds ``timeout``.
    """
    all_images = []
    page = 1

    # A single session reuses the underlying connection across pages.
    with requests.Session() as session:
        while True:
            # species_encoded is already percent-encoded, so the query string
            # is assembled by hand rather than passed through ``params=``.
            url = f"{obs_url}?species={species_encoded}&limit={batch_size}&page={page}"

            response = session.get(url, timeout=timeout)
            if response.status_code != 200:
                print(f"Request failed at page {page} with status code {response.status_code}")
                break

            data = response.json()
            images = data.get("images", [])
            if not images:
                # No more images left
                break

            all_images.extend(images)
            print(f"Page {page}: Retrieved {len(images)} image observations")
            page += 1

    return all_images


# Fetch JSON for all images
try:
    print("===== Starting JSON Fetch =====")
    obs_json = fetch_all_obs()
except Exception as e:
    print(f"Problem with fetch: {e}")
    # Re-raise after logging: the following cells need obs_json, and
    # swallowing the error here would leave it unbound (or stale from an
    # earlier run), so the failure would surface later and further from
    # its real cause.
    raise
else:
    print("===== Completed JSON Fetch =====")


In [None]:
%%time
# Process the JSON
def process_obs_json(obs):
    """Flatten observation records into one row per (image, species) pair.

    Args:
        obs: Iterable of dicts. The keys read from each dict are
            ``"mediaID"``, ``"publicURL"`` and ``"speciesConsensus"``; the
            latter is iterated as a sequence of dicts from which
            ``"scientificName"`` and ``"observationCount"`` are read.
            Missing keys yield ``None`` values; a missing or null
            ``"speciesConsensus"`` contributes no rows.

    Returns:
        pandas.DataFrame: Columns ``mediaID``, ``publicURL``,
        ``scientificName`` and ``observationCount``. The columns are present
        even when there are no rows, so downstream column lookups do not
        fail on an empty fetch.
    """
    columns = ["mediaID", "publicURL", "scientificName", "observationCount"]
    records = []
    for ob in obs:
        media_id = ob.get("mediaID")
        public_url = ob.get("publicURL")
        # ``or []`` also covers an explicit null value, which
        # ``.get(key, [])`` would hand back as None and fail to iterate.
        for species in ob.get("speciesConsensus") or []:
            records.append({
                "mediaID": media_id,
                "publicURL": public_url,
                "scientificName": species.get("scientificName"),
                "observationCount": species.get("observationCount"),
            })

    return pd.DataFrame(records, columns=columns)

# Flatten the fetched observations into one row per (image, species) pair.
df = process_obs_json(obs_json)

# Preview with unlimited column width so long URLs are shown in full.
with pd.option_context('display.width', 0, 'display.max_colwidth', None):
    display(df.head())

# index=False: the row index carries no information here, and writing it
# would add an unnamed, always-unique column to the file. On reload that
# column makes every row distinct, so drop_duplicates() removes nothing.
df.to_csv('/kaggle/working/initial_processed_data.csv', index=False)

print(len(df))

# Start here to avoid re-querying the API

In [None]:
# Reload the cached observations written by the processing cell so the API
# does not have to be queried again. The path must match the file name used
# when the cache was saved.
df = pd.read_csv("/kaggle/working/initial_processed_data.csv")
# If the cache was saved together with its row index, pandas reloads that
# index as an "Unnamed: 0" column whose unique values would keep
# drop_duplicates() from removing anything. Drop it when present.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# The raw pull appears to repeat each image/observation row, so collapse
# exact duplicate rows and renumber the index from zero.
df2 = df.drop_duplicates(ignore_index=True)

with pd.option_context('display.width', 0, 'display.max_colwidth', None):
    display(df2.head())

# Row total, followed by the number of rows per species.
print(f'Total rows: {len(df2)}\n')
print(df2['scientificName'].value_counts())

In [None]:
# Filter to species with at least 3 votes (observationCount >= 3).
# .copy() gives df3 its own data, so the fill below is a plain column
# assignment rather than a write into a filtered slice of df2 (which makes
# pandas emit SettingWithCopyWarning).
df3 = df2[df2['observationCount'] >= 3].copy()
# Rows without a scientific name are given the explicit label "blank".
df3['scientificName'] = df3['scientificName'].fillna("blank")
# Highest observation counts first, with a fresh 0..n-1 index.
df3 = df3.sort_values(by='observationCount', ascending=False).reset_index(drop=True)
with pd.option_context('display.width', 0, 'display.max_colwidth', None):
    display(df3.head())

print(f'Total rows: {len(df3)}\n')
print(df3['scientificName'].value_counts())

In [None]:
# This might be a multi-label classification problem, so collapse each image
# (mediaID / publicURL pair) to a single row whose scientificName is the
# sorted, de-duplicated set of its labels joined with ';'.
df3_grouped = (
    df3.groupby(['mediaID', 'publicURL'])['scientificName']
    .agg(lambda names: ';'.join(sorted(set(names))))
    .reset_index()
)

# Helper for keeping images labelled only "blank" while excluding images
# whose label merely contains "blank" alongside other names.
def is_only_blank(label_str):
    """Return True if the label, ignoring surrounding whitespace, is exactly "blank"."""
    stripped = label_str.strip()
    return stripped == "blank"

# Filter: keep images whose combined label has no ";blank" in it, plus those
# labelled only "blank"; images that mix "blank" with a species are dropped.
# Rationale (original author): volunteers clicking through a run of blank
# frames sometimes zone out, miss the animal and tag the image blank anyway.
label_col = df3_grouped['scientificName']
mixed_with_blank = label_col.str.contains(';blank')
blank_only = label_col.apply(is_only_blank)
df3_grouped = df3_grouped[~mixed_with_blank | blank_only]

# Show the results
with pd.option_context('display.width', 0, 'display.max_colwidth', None):
    display(df3_grouped.head())

print(f'Total rows: {len(df3_grouped)}\n')
print(df3_grouped['scientificName'].value_counts())

In [None]:
# Save the filtered df as a starting point for image requests.
# NOTE(review): to_csv writes the DataFrame index by default, and the index
# is not reset after the blank filter, so the file gains a leading unnamed
# column of non-contiguous row labels. Confirm whether readers of this file
# expect that column before changing it.
df3_grouped.to_csv("/kaggle/working/labeled_species_urls.csv")