# Data Retrieval Demo

This notebook demonstrates how to retrieve data from various environmental APIs:
- USGS Water Services (groundwater, surface water)
- EPA Water Quality Portal
- NOAA Climate Data
- Custom REST APIs

In [None]:
import pandas as pd
import sys
from pathlib import Path
from datetime import datetime, timedelta

sys.path.insert(0, str(Path.cwd().parent.parent))

from scripts.data_retrieval import USGSWaterServices, EPAWaterQuality, NOAAClimate, RESTClient
from scripts.data_retrieval import (
    generate_usgs_sites, generate_groundwater_levels, 
    generate_water_quality_data, generate_sample_dataset
)

# Configuration: set OFFLINE_MODE to True to use generated sample data instead of
# live API calls. Useful for offline development or when API services are unavailable.
# The flag is checked by the groundwater-sites cell below; cells that do not check it
# still call the live APIs.
OFFLINE_MODE = False

## USGS Water Services

The USGS provides extensive water data through its Water Services API. No API key is required.

In [None]:
# Create the USGS client; the USGS cells further down reuse this instance.
usgs = USGSWaterServices()

# Print each entry of the client's PARAM_CODES table
print("Common USGS parameter codes:")
for entry in usgs.PARAM_CODES.items():
    name, code = entry
    print(f"  {name}: {code}")

In [None]:
# Load groundwater sites: live from USGS unless OFFLINE_MODE is set, with a
# synthetic-data fallback when the live request raises.
if not OFFLINE_MODE:
    # Query all groundwater sites in the Upper Colorado Basin
    site_query = {
        "huc": "14",  # Upper Colorado HUC
        "site_type": "GW",  # Groundwater
    }
    try:
        gw_sites = usgs.get_sites(**site_query)
        print(f"Found {len(gw_sites)} groundwater sites")
    except Exception as e:
        # Report the failure, then continue with generated sites so later cells still run
        print(f"API request failed: {e}")
        print("Falling back to sample data...")
        gw_sites = generate_usgs_sites(n_sites=100, state="CO")
else:
    # Offline: use synthetic sample data instead of calling the API
    gw_sites = generate_usgs_sites(n_sites=100, state="CO")
    print(f"Generated {len(gw_sites)} sample groundwater sites")

gw_sites.head()

In [None]:
# Daily discharge values for a single site over the first half of 2024.
# Site 09380000 is Colorado River at Lees Ferry, AZ - a key monitoring location
# NOTE(review): this cell does not check OFFLINE_MODE, so it always calls the live API.
discharge_request = dict(
    sites=["09380000"],
    parameter_codes=["00060"],  # Discharge
    start_date=datetime(2024, 1, 1),
    end_date=datetime(2024, 6, 30),
)
discharge = usgs.get_daily_values(**discharge_request)
print(f"Retrieved {len(discharge)} daily discharge measurements")
discharge.head()

In [None]:
# Real-time (instantaneous) values for the same Lees Ferry site
# NOTE(review): this cell does not check OFFLINE_MODE, so it always calls the live API.
realtime_request = dict(
    sites=["09380000"],
    parameter_codes=["00060", "00065"],  # Discharge and gage height
    period="P7D",  # Past 7 days
)
realtime = usgs.get_instantaneous_values(**realtime_request)
print(f"Retrieved {len(realtime)} instantaneous measurements")
realtime.head()

## EPA Water Quality Portal

The Water Quality Portal aggregates data from EPA, USGS, and state agencies.

In [None]:
# Create the Water Quality Portal client; later EPA cells reuse this instance.
epa = EPAWaterQuality()

# Look up the characteristics (what can be measured) that match "arsenic"
arsenic_chars = epa.search_characteristics("arsenic")
print("Arsenic-related characteristics:")

# Show at most the first ten matches
first_matches = arsenic_chars[:10]
for char in first_matches:
    print(f"  - {char}")

In [None]:
# Well monitoring stations in Colorado
# NOTE(review): this cell does not check OFFLINE_MODE, so it always calls the live API.
station_filters = {
    "state_code": "CO",
    "site_type": "Well",
}
stations = epa.get_stations(**station_filters)
print(f"Found {len(stations)} well monitoring stations in Colorado")
stations.head()

In [None]:
# pH results for the Upper Colorado Basin, calendar year 2023
# Note: This can take a while for large queries
# NOTE(review): this cell does not check OFFLINE_MODE, so it always calls the live API.
ph_request = dict(
    huc="14",  # Upper Colorado
    characteristic_name="pH",
    start_date=datetime(2023, 1, 1),
    end_date=datetime(2023, 12, 31),
)
ph_data = epa.get_results(**ph_request)
print(f"Retrieved {len(ph_data)} pH measurements")
ph_data.head()

## NOAA Climate Data

Historical weather and climate data. **Requires a free API token.**

Get your token at: https://www.ncdc.noaa.gov/cdo-web/token

### Setup Instructions:
1. Visit https://www.ncdc.noaa.gov/cdo-web/token
2. Enter your email and request a token
3. Set the token as the `NOAA_API_TOKEN` environment variable, or store it in `config/api_config.yml` under `noaa.api_token`

**Note:** The code in the cells below is commented out because it requires an API token.
Uncomment it after setting up your token.

In [None]:
# Set your NOAA API token. Both commented-out options below store it in the
# NOAA_API_TOKEN environment variable.
# Avoid saving a real token in this notebook; replace the placeholder only in a
# local, uncommitted copy, or set the variable before starting Jupyter.
import os
# os.environ['NOAA_API_TOKEN'] = 'your_token_here'

# Or load from config (reads the noaa.api_token entry of the YAML file)
# import yaml
# with open('../config/api_config.yml') as f:
#     config = yaml.safe_load(f)
#     os.environ['NOAA_API_TOKEN'] = config['noaa']['api_token']

In [None]:
# Example (uncomment if you have a token)
# The station list is named `noaa_stations` so that uncommenting this cell does not
# overwrite the `stations` variable assigned in the EPA section.
# noaa = NOAAClimate()

# # Get weather stations in Colorado
# noaa_stations = noaa.get_stations(location_id="FIPS:08")
# print(f"Found {len(noaa_stations)} weather stations in Colorado")

# # Get precipitation data
# precip = noaa.get_data(
#     data_type_ids=["PRCP"],
#     location_id="FIPS:08",
#     start_date=datetime(2024, 1, 1),
#     end_date=datetime(2024, 3, 31),
# )
# print(f"Retrieved {len(precip)} precipitation records")

## Custom REST APIs

Use the generic RESTClient for any REST API.

In [None]:
# Point the generic client at a public test API
client = RESTClient("https://jsonplaceholder.typicode.com")

# Simple GET request against the /posts endpoint
posts_endpoint = "/posts"
posts = client.get_dataframe(posts_endpoint)
print(f"Retrieved {len(posts)} posts")
posts.head()

In [None]:
# Example: a thin client for a hypothetical state water API
class MyStateWaterAPI(RESTClient):
    """Illustrates subclassing RESTClient to wrap one particular API.

    The base URL is a placeholder for a hypothetical service; adapt the
    endpoints and data keys to the API you are actually targeting.
    """

    def __init__(self, api_key: str):
        """Configure the base client with this API's URL, key header, and request delay."""
        client_settings = dict(
            base_url="https://api.example-water.gov/v1",
            api_key=api_key,
            api_key_header="X-API-Key",
            api_key_prefix="",
            rate_limit_delay=0.5,  # Be nice to the API
        )
        super().__init__(**client_settings)

    def get_wells(self, county=None):
        """Request the /wells endpoint, adding a county filter when one is given."""
        params = {}
        if county:
            # A falsy county (None or "") means no filter is sent
            params["county"] = county
        return self.get_dataframe("/wells", params, data_key="wells")

    def get_measurements(self, well_id, start, end):
        """Request one well's measurements, passing start and end as query parameters."""
        endpoint = f"/wells/{well_id}/measurements"
        date_range = {"start": start, "end": end}
        return self.get_dataframe(
            endpoint,
            params=date_range,
            data_key="measurements"
        )

print("Custom API client pattern defined - adapt for your specific data sources")

## Combining Multiple Data Sources

Often you'll want to merge data from multiple APIs.

In [None]:
# Example: fetch Colorado stream locations from both USGS and EPA
# This shows the pattern - actual merge would depend on your specific needs
# NOTE(review): this cell does not check OFFLINE_MODE, so both requests go to the live APIs.

# USGS stream sites
usgs_site_filters = {
    "state_code": "CO",
    "site_type": "ST",  # Streams
}
usgs_sites = usgs.get_sites(**usgs_site_filters)

# EPA stream stations
epa_station_filters = {
    "state_code": "CO",
    "site_type": "Stream",
}
epa_stations = epa.get_stations(**epa_station_filters)

# Report how many locations each source returned
print(f"USGS stream sites: {len(usgs_sites)}")
print(f"EPA stream stations: {len(epa_stations)}")

# You could merge these based on coordinates or site identifiers
# to create a unified dataset