In [11]:
import os
import pandas as pd

from sodapy import Socrata

import requests

import time

from urllib.parse import urlparse
import re

from dotenv import load_dotenv

Dataset: https://{domain}/resource/{identifier}.json | The dataset as a json

Count: https://{domain}/views.json?count=True | The count of datasets in the domain

Schema: https://{domain}/views/{identifier}.json | The schema of the given dataset (identifier)

Metadata: https://{domain}/views.json?limit=200&page=1 | The metadata of the datasets in the domain (list of objects representing the datasets)

In [12]:
class Domain:
    def __init__(self, domain):
        self.url = f"https://{domain}/views.json"

    def city_datasets_count(self):
        params = {"count": True}
        response = requests.get(self.url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()
        return data.get("count", -1) # Error (no count) represented by -1 rather than 0
    
    def city_datasets_ids(self):
        ids = []
        page = 1
        while True:
            params = {"limit": 200, "page": page}
            data = requests.get(self.url, params=params).json()
            if not data:  # no more pages
                break
            # extract just the ids
            for item in data:
                ids.append(item["id"])
            page += 1  # move to next page
        return ids

    def write_dataset_ids_to_file(self, filepath=None):
        # Build filename automatically if not provided
        if filepath is None:
            domain = urlparse(self.url).hostname or "unknown_domain"
            safe_domain = re.sub(r"[^a-zA-Z0-9._-]", "_", domain)
            filepath = f"{safe_domain}_ids.txt"

        ids = self.city_datasets_ids()

        with open(filepath, "w", encoding="utf-8") as f:
            for dataset_id in ids:
                f.write(dataset_id + "\n")

        return filepath

In [None]:
nyc_domain = "data.cityofnewyork.us"

domain = Domain(nyc_domain)

print(domain.city_datasets_count()) # currently 2994 returned from getting the count for data.cityofnewyork.us
print(domain.write_dataset_ids_to_file()) # length should be same as count

'data.cityofnewyork.us_ids.txt'

In [3]:
# Tokens are optional (`None` can be used instead), though requests will be rate limited.
#
# If you choose to use a token, run the following command on the terminal (or add it to your .bashrc)
# $ export SODAPY_APPTOKEN=<token>
load_dotenv()
socrata_token = os.getenv("NYC_SOCRATA_TOKEN")

In [None]:
# Dataset: https://{domain}/resource/{identifier}.json
# Metadata: https://{domain}/views/{identifier}.json
nyc_dataset_identifier = "fhrw-4uyv" # 311 data
# nyc_client = Socrata(nyc_domain, username=os.getenv("NYC_SOCRATA_USERNAME"), password=os.getenv("NYC_SOCRATA_PASSWORD"), app_token=os.getenv("NYC_SOCRATA_TOKEN"), timeout=60)
nyc_client = Socrata(nyc_domain, socrata_token, timeout=600) # Often takes longer than default 10 sec timeout to load
nyc_results = nyc_client.get(nyc_dataset_identifier)
nyc_df = pd.DataFrame.from_dict(nyc_results)
print("Domain: {domain:}\nSession: {session:}\nURI Prefix: {uri_prefix:}".format(**nyc_client.__dict__))
print(nyc_df.shape)

Domain: data.cityofnewyork.us
Session: <requests.sessions.Session object at 0x00000205E3DEF460>
URI Prefix: https://
(1000, 50)


In [None]:
# Experimenting with smaller NOLA data
nola_domain = "data.nola.gov"
nola_dataset_identifier = "2mq3-p3xc" # ID from previous study that no longer works ("You don't have the security clearance to view this page.")
# nola_client = Socrata(nola_domain, username=os.getenv("nola_SOCRATA_USERNAME"), password=os.getenv("nola_SOCRATA_PASSWORD"), app_token=os.getenv("nola_SOCRATA_TOKEN"), timeout=60)
nola_client = Socrata(nola_domain, socrata_token)
nola_results = nola_client.get(nola_dataset_identifier)
nola_df = pd.DataFrame.from_dict(nola_results)
print("Domain: {domain:}\nSession: {session:}\nURI Prefix: {uri_prefix:}".format(**nola_client.__dict__))
print(nola_df.shape)

HTTPError: 403 Client Error: Forbidden.
	You must be logged in to access this resource

In [None]:
# extract tree-related complaints
# Example on sodapy github that did not work because Chattanooga data changed
tree_related = pd.concat(
    [
        nyc_df.complaint_type.str.contains(r"[T|t]ree").value_counts(),
    ],
    axis=1,
    keys=["nyc"],
)
tree_related.div(tree_related.sum()).round(2)

In [None]:
nyc_df.head()

Unnamed: 0,unique_key,created_date,agency,agency_name,complaint_type,descriptor,location_type,incident_zip,incident_address,street_name,...,closed_date,resolution_description,resolution_action_updated_date,vehicle_type,facility_type,bridge_highway_name,bridge_highway_segment,taxi_company_borough,bridge_highway_direction,road_ramp
0,66932951,2025-11-24T02:06:15.000,NYPD,New York City Police Department,Noise - Residential,Loud Music/Party,Residential Building/House,10306,1742 RICHMOND ROAD,RICHMOND ROAD,...,,,,,,,,,,
1,66927366,2025-11-24T02:05:57.000,NYPD,New York City Police Department,Noise - Residential,Loud Television,Residential Building/House,10002,280 MADISON STREET,MADISON STREET,...,,,,,,,,,,
2,66927332,2025-11-24T02:05:27.000,NYPD,New York City Police Department,Noise - Commercial,Loud Music/Party,Store/Commercial,11103,42-12 BROADWAY,BROADWAY,...,,,,,,,,,,
3,66927308,2025-11-24T02:04:27.000,NYPD,New York City Police Department,Illegal Parking,Commercial Overnight Parking,Street/Sidewalk,10470,343 EAST 238 STREET,EAST 238 STREET,...,,,,,,,,,,
4,66931824,2025-11-24T02:03:41.000,NYPD,New York City Police Department,Noise - Street/Sidewalk,Loud Talking,Street/Sidewalk,11385,2025 WOODBINE STREET,WOODBINE STREET,...,,,,,,,,,,


In [None]:
SCROLL_FILE = "last_scroll_id.txt"
DOMAIN_FILE = "socrata_domains.txt"

def load_state():
    """Load scroll ID + seen domains if resuming."""
    # Load scroll ID
    if os.path.exists(SCROLL_FILE):
        with open(SCROLL_FILE, "r") as f:
            scroll_id = f.read().strip()
            if scroll_id == "":
                scroll_id = "*"   # fallback
    else:
        scroll_id = "*"

    # Load seen domains set
    seen = set()
    if os.path.exists(DOMAIN_FILE):
        with open(DOMAIN_FILE, "r") as f:
            for line in f:
                seen.add(line.strip())

    return scroll_id, seen


def save_scroll_id(scroll_id):
    """Write latest scroll ID to disk so we can resume."""
    with open(SCROLL_FILE, "w") as f:
        f.write(scroll_id)


def get_all_domains_resume():
    url = "https://api.us.socrata.com/api/catalog" # Seems to be a catalog of all things accessible by the api
    limit = 1000 # Limit 1000 because it is small enough to avoid timeouts. 10000 gets timed out. Optimal would probably be between

    # Load previous state
    scroll_id, seen = load_state()

    print(f"Starting with scroll_id={scroll_id}, {len(seen)} domains already saved.")

    # Open output file in append mode
    with open(DOMAIN_FILE, "a") as f_out:

        while True:
            print(f"Fetching scroll_id={scroll_id} ...")

            params = {"scroll_id": scroll_id, "limit": limit}

            try:
                resp = requests.get(url, params=params, timeout=10)
                resp.raise_for_status()
            except Exception as e:
                print(f"Error: {e}, retrying in 5 seconds...")
                time.sleep(5)
                continue

            data = resp.json()
            results = data.get("results", [])

            if not results:
                print("Deep scroll completed or no more results.")
                break

            # Process results
            for item in results:
                metadata = item.get("metadata", {})
                domain = metadata.get("domain")

                if domain and domain not in seen:
                    f_out.write(domain + "\n")
                    f_out.flush()  # ensure immediate write
                    seen.add(domain)

            # Update scroll ID for next request
            next_scroll = results[-1].get("resource").get("id") # id of previous resource can be used to get next scroll
            if not next_scroll:
                print("Finished scrolling dataset.")
                break

            scroll_id = next_scroll
            save_scroll_id(scroll_id)  # persist checkpoint

            time.sleep(0.2) # Only to avoid timeouts, may not be necessary

    print(f"\nCompleted with {len(seen)} total domains.")
    return seen

# Getting all domains to find cities, as the original study was about the state of urban data across US cities, not just NY
domains = get_all_domains_resume() # 555 domains discovered amongst ~220k things in the catalog

Starting with scroll_id=*, 118 domains already saved.
Fetching scroll_id=* ...
Fetching scroll_id=26is-s4fm ...
Fetching scroll_id=2auq-ndkr ...
Fetching scroll_id=2fh6-vrts ...
Fetching scroll_id=2k8a-dz2p ...
Fetching scroll_id=2qsi-qheg ...
Fetching scroll_id=2vhj-s442 ...
Fetching scroll_id=322a-riji ...
Fetching scroll_id=36ib-rtmu ...
Fetching scroll_id=3b78-mfyi ...
Fetching scroll_id=3fxc-nque ...
Fetching scroll_id=3kbj-ypat ...
Fetching scroll_id=3qys-jk4f ...
Fetching scroll_id=3vft-99rh ...
Fetching scroll_id=4292-pktu ...
Fetching scroll_id=46zs-4ngp ...
Fetching scroll_id=4bn5-jdm8 ...
Fetching scroll_id=4g6s-ak9g ...
Fetching scroll_id=4m2v-hzec ...
Fetching scroll_id=4rka-uupg ...
Fetching scroll_id=4w6e-7nqk ...
Fetching scroll_id=52ny-36z2 ...
Fetching scroll_id=576d-v5m3 ...
Fetching scroll_id=5bn2-vnxz ...
Fetching scroll_id=5g3x-yfbg ...
Fetching scroll_id=5kp7-t9c8 ...
Fetching scroll_id=5rei-mff9 ...
Fetching scroll_id=5w2u-reag ...
Fetching scroll_id=62vi-89fw .