
# 1.2: Data Acquisition



**Objective:** To programmatically and reproducibly download all necessary raw data for the project, including Digital Elevation Models (DEMs), climate data, and soil properties. This notebook is designed to be idempotent, meaning it can be run multiple times without re-downloading existing files.



**Gold-Standard Practices Implemented:**

- **Configuration-Driven:** All paths, data sources, and parameters are loaded from the central `config.yml` file.
- **System-Aware Parallelism:** The number of concurrent download workers is derived from the machine's physical CPU core count, which serves as a per-machine cap on parallel (I/O-bound) download requests.
- **Idempotent & Resilient:** The script checks for existing files and skips them. Downloads are wrapped in a retry mechanism to handle transient network errors.
- **Atomic Writes:** Files are downloaded to a temporary `.part` file and are only renamed upon successful completion, preventing data corruption from failed downloads.
- **Provenance Tracking:** A JSON sidecar file is created for each downloaded artifact, recording its source, parameters, and download time.
- **Cloud-Native Discovery:** DEM and climate data are discovered using the modern STAC (SpatioTemporal Asset Catalog) standard, the best practice for working with large geospatial archives.
- **Daymet Zarr Access:** Daily climate records are streamed from the Planetary Computer Daymet Zarr asset, allowing robust subsetting without issuing thousands of individual HTTP requests.
- **Direct SDA Integration:** Soil tabular data are queried from USDA Soil Data Access using the `pygnatsgofetch` helper, ensuring we capture the full component and horizon tables required for WEPP parameterization.


In [1]:

# === 1. Configuration & Setup ===

# --- Core Libraries ---
from __future__ import annotations

import logging
import math
import os
import re
import sys
import threading
from collections.abc import Sequence
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path

import requests
import yaml

# --- Project-Specific Modules ---
# Add project's src directory to path to allow imports
def find_project_root(marker='config.yml'):
    """Locate the project root by searching upward for a marker entry.

    Starting at the current working directory, every directory on the way up
    to and including the filesystem root is checked for `marker`.

    Args:
        marker: Name of the file (or directory) that identifies the project root.

    Returns:
        The resolved `Path` of the nearest directory that contains `marker`.

    Raises:
        FileNotFoundError: If no directory on the path to the filesystem root
            contains `marker`.
    """
    start = Path.cwd().resolve()
    # `parents` ends with the filesystem root, so the root itself is checked too
    # (a `while path.parent != path` loop stops one level short of it).
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Project root with marker '{marker}' not found.")

# Resolved once at import time from the current working directory; raises
# FileNotFoundError if no directory above the CWD contains `config.yml`.
PROJECT_ROOT = find_project_root()
# Inserted at position 0 so that modules under `<root>/src` are found before
# any same-named module elsewhere on sys.path.
sys.path.insert(0, str(PROJECT_ROOT / 'src'))

# Now that src is in the path, we can import project modules
from utils import get_key_from_sw_corner, get_bbox_from_key, write_provenance, setup_colored_logging
from download_utils import execute_downloads
import pygnatsgofetch as gnatsgo

# --- Geospatial & System Libraries ---
import fsspec
import geopandas as gpd
import pandas as pd
import psutil
import pyproj
import rioxarray
import xarray as xr
from pystac_client import Client
import planetary_computer
from shapely import union_all
from shapely.geometry import box, mapping

# --- Visualization & Utility Libraries ---
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D
from tqdm.auto import tqdm
from tenacity import retry, stop_after_attempt, wait_exponential

# --- Gold-Standard Logging Setup ---
setup_colored_logging()
log = logging.getLogger("1.2_data_acquisition")

# --- Configuration Loading ---
# The file is opened explicitly as UTF-8 so that parsing does not depend on the
# platform's default locale encoding. `safe_load` only builds plain Python
# objects, so the config cannot trigger arbitrary object construction.
CONFIG_PATH = PROJECT_ROOT / "config.yml"
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# --- Path Configuration (from config) ---
# Every path in `config['paths']` is joined onto PROJECT_ROOT, so the config
# values are expected to be relative to the project root.
# Input paths for processed study areas
STUDY_AREAS_GPKG = PROJECT_ROOT / config['paths']['study_areas']
EXTERNAL_HOLDOUT_GPKG = PROJECT_ROOT / config['paths']['external_holdout']

# Output paths for raw data downloads
DEM_OUT_DIR = PROJECT_ROOT / config['paths']['dem_dir']
CLIMATE_OUT_DIR = PROJECT_ROOT / config['paths']['climate_dir']
SOILS_OUT_DIR = PROJECT_ROOT / config['paths']['soils_dir']

# Output path for report figures generated by this notebook
REPORTS_DIR = PROJECT_ROOT / config['paths']['outputs_dir'] / 'data_acquisition'

# Create all output directories if they don't exist
# (parents=True / exist_ok=True make this safe to re-run.)
for p in [DEM_OUT_DIR, CLIMATE_OUT_DIR, SOILS_OUT_DIR, REPORTS_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# --- Data Source & Parameter Configuration (from config) ---
# The STAC endpoint is read from the DEM section but is also reused by the
# Daymet acquisition step further down.
PC_STAC_URL = config['data_sources']['dem']['stac_url']
DEM_COLLECTION = config['data_sources']['dem']['collection']
# NOTE: despite the PRISM_* names, these two values drive the Daymet download
# below (variable list and [start, end] date range).
PRISM_DAILY_VARS = config['data_sources']['prism']['daily_variables']
PRISM_DATE_RANGE = config['data_sources']['prism']['date_range']
GNATSGO_JOB_NAME = config['data_sources']['gnatsgo']['job_name']
WGS84_CRS = config['parameters']['wgs84_crs']

# --- System-Aware Parallelism ---
# psutil.cpu_count(logical=False) returns None when the physical core count
# cannot be determined (this happens on some platforms and virtualised hosts),
# which would make max(1, None) raise TypeError. Fall back to the logical core
# count, and finally to a single worker.
PHYSICAL_CORES = psutil.cpu_count(logical=False) or os.cpu_count() or 1
MAX_CONCURRENT_DOWNLOADS = max(1, PHYSICAL_CORES)

# --- Log Setup Summary ---
# One log record per setting; the labels are pre-padded so the values line up.
log.info("--- Configuration Summary ---")
for _label, _value in (
    ("Project Root:       ", PROJECT_ROOT),
    ("Input CV Areas:     ", STUDY_AREAS_GPKG),
    ("Output DEMs:        ", DEM_OUT_DIR),
    ("Output Climate:     ", CLIMATE_OUT_DIR),
    ("Output Soils:       ", SOILS_OUT_DIR),
    ("Output Reports:     ", REPORTS_DIR),
    ("Concurrent Workers: ", MAX_CONCURRENT_DOWNLOADS),
):
    log.info(f"{_label}{_value}")
log.info("Setup complete.")


[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - --- Configuration Summary ---[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Project Root:       /workspace[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Input CV Areas:     /workspace/data/processed/study_areas.gpkg[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Output DEMs:        /workspace/data/raw/dem_30m[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Output Climate:     /workspace/data/raw/climate_prism[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Output Soils:       /workspace/data/raw/soils_gnatsgo[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Output Reports:     /workspace/outputs/data_acquisition[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Concurrent Workers: 10[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Setup complete.[0m


In [2]:
# === 2. Create Master Area of Interest (AOI) ===

# --- GOLD-STANDARD: Dynamically set the PROJ data directory environment variables ---
# PROJ 9.1+ looks up its resource files via PROJ_DATA; PROJ_LIB is the legacy
# name, which newer releases only honour as a deprecated fallback. Both are set
# so the data directory is found regardless of the PROJ version in use.
try:
    pyproj_datadir = pyproj.datadir.get_data_dir()
    os.environ['PROJ_DATA'] = pyproj_datadir
    os.environ['PROJ_LIB'] = pyproj_datadir
    log.info(f"PROJ_LIB environment variable set to: {pyproj_datadir}")
except Exception as e:
    # Deliberately best-effort: a missing data directory is reported but must
    # not stop the notebook, since PROJ may still locate its data on its own.
    log.warning(f"Could not set PROJ_LIB automatically. This may cause CRS errors. {e}")

def create_master_aoi() -> tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """Build the master Area of Interest (AOI) from all study-area polygons.

    Reads the `cv_provinces` layer of STUDY_AREAS_GPKG and the
    `external_holdout` layer of EXTERNAL_HOLDOUT_GPKG, dissolves all polygons
    into a single geometry and returns it in two coordinate systems.

    Returns:
        A tuple ``(master_aoi_wgs84, master_aoi_native)`` of single-row
        GeoDataFrames: the dissolved AOI reprojected to ``WGS84_CRS`` and the
        same geometry in the CRS of the merged input layers.

    Raises:
        FileNotFoundError: If either input GeoPackage does not exist.
    """
    log.info("Loading and merging all study area boundaries to create master AOI.")

    missing_inputs = [p for p in (STUDY_AREAS_GPKG, EXTERNAL_HOLDOUT_GPKG) if not p.exists()]
    if missing_inputs:
        missing_list = ", ".join(str(p) for p in missing_inputs)
        raise FileNotFoundError(
            f"Input files not found. Please run Notebook 1.1 first. Missing: {missing_list}"
        )

    gdf_cv = gpd.read_file(STUDY_AREAS_GPKG, layer='cv_provinces')
    gdf_holdout = gpd.read_file(EXTERNAL_HOLDOUT_GPKG, layer='external_holdout')

    # Coordinates from layers in different CRSs must not be mixed: bring the
    # hold-out layer into the CRS of the CV layer before concatenating.
    if (
        gdf_cv.crs is not None
        and gdf_holdout.crs is not None
        and gdf_holdout.crs != gdf_cv.crs
    ):
        log.warning(
            f"External holdout CRS ({gdf_holdout.crs}) differs from CV areas CRS ({gdf_cv.crs}); "
            "reprojecting holdout before merging."
        )
        gdf_holdout = gdf_holdout.to_crs(gdf_cv.crs)

    gdf_all = pd.concat([gdf_cv, gdf_holdout], ignore_index=True)
    log.info(f"Loaded a total of {len(gdf_all)} polygons for the master AOI.")

    # Dissolve every polygon into one (multi)polygon, keeping the native CRS.
    master_aoi_geom = union_all(gdf_all.geometry)
    master_aodf = gpd.GeoDataFrame(geometry=[master_aoi_geom], crs=gdf_all.crs)
    master_aoi_wgs84 = master_aodf.to_crs(WGS84_CRS)

    log.info(f"✅ Master AOI created and reprojected to {WGS84_CRS}.")
    return master_aoi_wgs84, master_aodf

# --- Execute and Visualize ---
# Both results stay at module level: the WGS84 AOI feeds the DEM tiling and
# Daymet steps, while the native-CRS AOI is passed to the gNATSGO step.
master_aoi_wgs84, master_aodf = create_master_aoi()
log.info("Visualizing the master Area of Interest...")
# Left as the cell's final expression so the notebook renders the returned map.
master_aoi_wgs84.explore()


[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - PROJ_LIB environment variable set to: /opt/conda/envs/app/share/proj[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Loading and merging all study area boundaries to create master AOI.[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Loaded a total of 6 polygons for the master AOI.[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - ✅ Master AOI created and reprojected to EPSG:4326.[0m
[38;20m2025-10-04 00:17:41 - 1.2_data_acquisition - INFO - Visualizing the master Area of Interest...[0m


### Justification for Spatial Intersection Method

To ensure efficiency and minimize unnecessary data downloads, we will not simply download all DEM tiles that fall within the rectangular bounding box of our Area of Interest (AOI). For irregularly shaped study areas like ours, this naive approach would result in downloading numerous tiles that do not actually contain any of our regions.

Instead, we employ a more precise spatial intersection method. This involves:
1.  Generating a complete grid of all possible 1x1 degree tile footprints that cover the AOI's bounding box.
2.  Performing a spatial join (`sjoin`) between these tile footprints and the actual multipolygon of our master AOI.

This operation selects only the tiles that truly intersect our study areas, creating a minimal and precise list of required data. This is a key optimization that enhances the scalability and reproducibility of our workflow.


In [None]:
# === 3. Identify Required DEM Tiles via Spatial Analysis ===

def identify_dem_tiles(aoi_gdf: gpd.GeoDataFrame) -> tuple[gpd.GeoDataFrame, list[str]]:
    """Find the minimal set of 1x1 degree DEM tiles that intersect the AOI.

    A grid of unit-square tiles is laid over the AOI's bounding box (integer
    south-west corners) and spatially joined against the AOI geometry.

    Args:
        aoi_gdf: AOI whose coordinates are used directly as tile-grid
            coordinates; the tile grid is tagged with the same CRS.

    Returns:
        ``(candidate_tiles_gdf, required_dem_keys)``: all grid tiles of the
        bounding box (columns ``key`` and geometry), and the sorted, unique
        keys of the tiles that intersect the AOI.
    """
    log.info("Performing spatial intersection to find required DEM tile keys...")
    west, south, east, north = aoi_gdf.total_bounds

    # South-west corners of every tile in the bounding box, enumerated with
    # longitude as the outer loop and latitude as the inner loop.
    sw_corners = [
        (lon, lat)
        for lon in range(math.floor(west), math.ceil(east))
        for lat in range(math.floor(south), math.ceil(north))
    ]
    footprints = [box(lon, lat, lon + 1, lat + 1) for lon, lat in sw_corners]
    keys = [get_key_from_sw_corner(lon, lat) for lon, lat in sw_corners]

    candidate_tiles_gdf = gpd.GeoDataFrame({"key": keys}, geometry=footprints, crs=aoi_gdf.crs)

    # The inner join keeps only tiles touching the AOI; a tile can appear once
    # per matching AOI row, hence the de-duplication.
    hits = gpd.sjoin(candidate_tiles_gdf, aoi_gdf, how="inner", predicate="intersects")
    required_dem_keys = sorted(hits["key"].unique())

    n_candidates = len(candidate_tiles_gdf)
    n_required = len(required_dem_keys)
    log.info(f"Identified {n_required} required tiles (vs. {n_candidates} in bounding box).")
    log.info(f"Skipping {n_candidates - n_required} unnecessary downloads.")

    return candidate_tiles_gdf, required_dem_keys

# --- Execute and store results ---
# `candidate_tiles_gdf` is reused by the coverage maps below; `required_dem_keys`
# drives the pre-flight and post-flight checks.
candidate_tiles_gdf, required_dem_keys = identify_dem_tiles(master_aoi_wgs84)


In [None]:
# === 4. DEM Pre-flight Check ===

def check_dem_coverage(required_keys: list[str], dem_dir: Path) -> tuple[list[str], set[str]]:
    """Compare the required DEM tile keys with the ``*.tif`` files in `dem_dir`.

    A file counts as a tile when its name contains a key of the form
    ``n|s`` + two digits + ``w|e`` + three digits (lower case).

    Args:
        required_keys: Tile keys that must be available.
        dem_dir: Directory that is scanned (non-recursively) for ``*.tif`` files.

    Returns:
        ``(missing_keys, on_disk_keys)``: the sorted required keys with no
        matching file, and the set of every key found on disk (including keys
        that are not required).
    """
    log.info(f"Scanning {dem_dir} for existing DEM tiles...")

    key_pattern = re.compile(r'(n|s)\d{2}(w|e)\d{3}')
    on_disk_keys = set()
    for tif_path in dem_dir.glob("*.tif"):
        found = key_pattern.search(tif_path.name)
        if found:
            on_disk_keys.add(found.group(0))

    missing_keys = sorted(set(required_keys) - on_disk_keys)

    log.info("--- DEM Coverage Summary ---")
    log.info(f"   Tiles Required: {len(required_keys)}")
    log.info(f"    Tiles On Disk: {len(on_disk_keys)}")
    log.info(f"    Tiles Missing: {len(missing_keys)}")

    if missing_keys:
        log.info(f"Found {len(missing_keys)} tiles to download.")
    else:
        log.info("✅ All required DEM tiles are already on disk. No download needed.")

    return missing_keys, on_disk_keys

# --- Execute and store results ---
# `missing_dem_keys` decides whether the download cell below does any work.
missing_dem_keys, on_disk_dem_keys = check_dem_coverage(required_dem_keys, DEM_OUT_DIR)


In [None]:

# === 5. STAC-Based Discovery & Parallel Download Functions ===

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    # Re-raise the last underlying exception instead of tenacity's RetryError
    # wrapper, so callers log the actual failure reason.
    reraise=True,
)
def _fetch_stac_items(catalog, collection, bbox, datetime_range):
    """Run one STAC search and return its items, retrying on failure.

    Args:
        catalog: STAC client exposing ``search(...)``.
        collection: ID of the collection to search.
        bbox: Bounding box passed through to the search.
        datetime_range: Datetime filter passed through to the search
            (``None`` applies no temporal filter).

    Returns:
        A list with all items matched by the search.

    Raises:
        Exception: The error of the final attempt, after up to three tries
            with exponential back-off (2-10 s).
    """
    search = catalog.search(collections=[collection], bbox=bbox, datetime=datetime_range)
    return list(search.item_collection())

def _inset_bbox(bbox, margin: float = 0.01):
    """Shrink a ``(minx, miny, maxx, maxy)`` box inward by `margin` per side.

    Returns `bbox` unchanged when it is not a sequence of four numbers or when
    it is too small to be shrunk.
    """
    try:
        minx, miny, maxx, maxy = (float(v) for v in bbox)
    except (TypeError, ValueError):
        return bbox
    if (maxx - minx) <= 2 * margin or (maxy - miny) <= 2 * margin:
        return bbox
    return [minx + margin, miny + margin, maxx - margin, maxy - margin]


def build_dem_download_jobs_stac(keys_to_find: list[str]) -> tuple[list[dict], list[str]]:
    """Look up one DEM asset per tile key on the STAC API and build download jobs.

    URLs are NOT pre-signed; signing happens just-in-time in the download worker.

    Args:
        keys_to_find: Tile keys to resolve.

    Returns:
        ``(jobs, not_found_keys)``. Each job is a dict with ``url`` (unsigned
        asset href), ``out_path``, ``key`` and ``source_info`` (provenance).
        ``not_found_keys`` holds keys without a search hit as well as keys
        whose lookup raised an error (the error is logged, not propagated).
    """
    jobs, not_found_keys = [], []
    log.info(f"Connecting to STAC Catalog at: {PC_STAC_URL}")
    catalog = Client.open(PC_STAC_URL)

    log.info(f"Querying STAC for {len(keys_to_find)} missing DEM tiles from collection '{DEM_COLLECTION}'...")
    for key in tqdm(keys_to_find, desc="STAC Query", unit="key"):
        try:
            # A STAC bbox filter matches every item that *intersects* the box,
            # so searching with the exact tile bounds also returns neighbours
            # that merely share an edge or corner -- and the first hit could
            # then be the wrong tile. Searching with a slightly smaller box
            # restricts the hits to items covering the tile's interior.
            bbox = _inset_bbox(get_bbox_from_key(key))
            items = _fetch_stac_items(catalog, DEM_COLLECTION, bbox, datetime_range=None)

            if not items:
                not_found_keys.append(key)
                continue

            # NOTE(review): if the collection holds several products for the
            # same footprint (e.g. different resolutions), the first hit is
            # still arbitrary -- confirm against the configured collection.
            item = items[0]
            asset_key = 'data' if 'data' in item.assets else 'cog'
            asset_url = item.assets[asset_key].href

            jobs.append({
                "url": asset_url,  # Store the original, unsigned URL
                "out_path": DEM_OUT_DIR / f"USGS_1_{key}.tif",
                "key": key,
                "source_info": {
                    "stac_url": PC_STAC_URL,
                    "collection": DEM_COLLECTION,
                    "item_id": item.id,
                    "asset_key": asset_key,
                    "original_href": asset_url
                }
            })
        except Exception as e:
            # Best-effort per key: one failing lookup must not abort the batch.
            log.error(f"Error querying STAC for key {key}: {e}")
            not_found_keys.append(key)

    return jobs, not_found_keys

# Marker in the notebook log that this definitions-only cell has been run.
log.info("STAC-based DEM discovery functions defined.")


In [None]:
# === 6. Execute DEM Download Workflow ===

# Only runs inside a Jupyter kernel and when the pre-flight check found gaps.
# NOTE(review): `display` below is not imported in this file -- presumably the
# IPython builtin, which is why this cell is guarded by the ipykernel check.
if 'ipykernel' in sys.modules and missing_dem_keys:
    dem_download_jobs, dem_unavailable_keys = build_dem_download_jobs_stac(missing_dem_keys)

    if dem_download_jobs:
        # Preview of the first planned downloads (tile key -> unsigned URL).
        df_plan = pd.DataFrame([{"id": j["key"], "url": j["url"]} for j in dem_download_jobs])
        log.info("--- DEM Download Plan ---")
        display(df_plan.head())

        # Map layers, bottom to top: full candidate grid, AOI outline, tiles to fetch.
        log.info("Generating DEM download coverage map...")
        fig, ax = plt.subplots(figsize=(12, 8))
        master_aoi_wgs84.boundary.plot(ax=ax, color="k", linewidth=1.5, zorder=2)
        candidate_tiles_gdf.plot(ax=ax, facecolor="none", edgecolor='gray', linewidth=0.2, zorder=1)

        keys_to_download = [j['key'] for j in dem_download_jobs]
        missing_geoms = candidate_tiles_gdf[candidate_tiles_gdf['key'].isin(keys_to_download)].geometry

        if not missing_geoms.empty:
            gpd.GeoDataFrame(geometry=missing_geoms).plot(ax=ax, facecolor="orange", edgecolor="darkred", alpha=0.6, zorder=3)

        ax.set_title(f"Coverage Map: {len(dem_download_jobs)} DEM Tiles to Download (Orange)", fontsize=16)
        # Proxy artists: the legend entries are built by hand rather than taken
        # from the plotted layers.
        ax.legend(handles=[
            Line2D([0], [0], color="k", lw=1.5, label="AOI Boundary"),
            Rectangle((0, 0), 1, 1, facecolor="orange", alpha=0.6, label="Tile to Download")
        ], loc='best')

        # The figure is saved before plt.show() is called.
        figure_path = REPORTS_DIR / "dem_download_plan.png"
        fig.savefig(figure_path, dpi=300, bbox_inches='tight')
        log.info(f"Saved download plan map to {figure_path}")
        plt.show()

        execute_downloads(dem_download_jobs, "DEM Tiles", MAX_CONCURRENT_DOWNLOADS)
    else:
        log.info("Probe complete. No downloadable files found for the missing DEM keys.")

    # Keys without a STAC hit (or whose lookup failed) are reported, not retried.
    if dem_unavailable_keys:
        log.warning(f"Could not find STAC assets for {len(dem_unavailable_keys)} keys: {dem_unavailable_keys}")
else:
    log.info("Skipping DEM download execution (no missing keys or not in an interactive session).")


In [None]:
# === 7. DEM Post-flight Check ===

def verify_dem_coverage(required_keys: list[str], candidates: gpd.GeoDataFrame, aoi: gpd.GeoDataFrame, dem_dir: Path):
    """Report which required DEM tiles are on disk and draw the coverage map.

    Tile keys are extracted from the ``*.tif`` file names with the same
    pattern as the pre-flight check (``n|s`` + 2 digits + ``w|e`` + 3 digits),
    so both checks agree even when a file name carries extra suffixes. Only
    tiles that are actually required count as "acquired".

    Args:
        required_keys: Tile keys that should be present.
        candidates: Tile grid with a ``key`` column and tile geometries.
        aoi: AOI whose boundary is drawn on the map.
        dem_dir: Directory scanned (non-recursively) for ``*.tif`` files.

    Side effects:
        Logs a summary, saves ``dem_post_download_coverage.png`` to
        REPORTS_DIR and shows the figure.
    """
    log.info("Running post-flight verification for DEM data...")

    key_pattern = re.compile(r'(n|s)\d{2}(w|e)\d{3}')
    matches = (key_pattern.search(p.name) for p in dem_dir.glob("*.tif"))
    on_disk_keys_after = {m.group(0) for m in matches if m}

    required = set(required_keys)
    # Tiles left over from other runs/AOIs must not inflate the acquired count.
    acquired_keys = required & on_disk_keys_after
    missing_after = sorted(required - on_disk_keys_after)

    log.info("---- DEM Coverage (Post-Download) ----")
    log.info(f"  Tiles Required: {len(required_keys)}")
    log.info(f"   Tiles On Disk: {len(on_disk_keys_after)}")
    log.info(f"   Still Missing: {len(missing_after)} (likely unavailable from source)")

    if not missing_after:
        log.info("✅ All required DEM tiles are now on disk.")

    log.info("Generating final DEM coverage map...")
    fig, ax = plt.subplots(figsize=(12, 8))
    aoi.boundary.plot(ax=ax, color="k", linewidth=1.5, zorder=3)

    acquired_geoms = candidates[candidates['key'].isin(acquired_keys)].geometry
    if not acquired_geoms.empty:
        gpd.GeoDataFrame(geometry=acquired_geoms).plot(ax=ax, facecolor="green", edgecolor="white", alpha=0.7, zorder=2)

    unavailable_geoms = candidates[candidates['key'].isin(missing_after)].geometry
    if not unavailable_geoms.empty:
        gpd.GeoDataFrame(geometry=unavailable_geoms).plot(ax=ax, facecolor="orange", edgecolor="darkred", alpha=0.6, zorder=1)

    ax.set_title(f"Post-Download DEM Coverage: {len(acquired_keys)} / {len(required_keys)} Tiles Acquired", fontsize=16)
    # Proxy artists: the legend is built by hand rather than from the layers.
    ax.legend(handles=[
        Line2D([0], [0], color="k", lw=1.5, label="AOI Boundary"),
        Rectangle((0, 0), 1, 1, facecolor="green", alpha=0.7, label=f"Acquired ({len(acquired_keys)})"),
        Rectangle((0, 0), 1, 1, facecolor="orange", alpha=0.6, label=f"Unavailable ({len(missing_after)})")
    ], loc='best')

    figure_path = REPORTS_DIR / "dem_post_download_coverage.png"
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
    log.info(f"Saved final coverage map to {figure_path}")
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)

# --- Execute Verification ---
# Runs unconditionally, i.e. also when the download cell above was skipped.
verify_dem_coverage(required_dem_keys, candidate_tiles_gdf, master_aoi_wgs84, DEM_OUT_DIR)


In [None]:

# === 8. Acquire Daymet Daily Climate Data ===

# STAC collection ID looked up on the catalog at PC_STAC_URL.
DAYMET_COLLECTION = "daymet-daily-na"
# Asset roles that mark an asset as a Zarr store (used when no asset is keyed "zarr").
DAYMET_ZARR_ROLES = ("zarr",)
# Requested variable name -> variable name in the Daymet dataset.
# "ppt" is accepted as an alias for Daymet's "prcp"; names missing from this
# map are skipped with a warning by _normalize_daymet_variables.
DAYMET_VARIABLE_MAP = {
    "ppt": "prcp",
    "prcp": "prcp",
    "tmin": "tmin",
    "tmax": "tmax",
}

def _aoi_bounds_4326(aoi_gdf: gpd.GeoDataFrame) -> tuple[float, float, float, float]:
    """Return the AOI's total bounds as ``(minx, miny, maxx, maxy)`` floats.

    The AOI is reprojected to ``WGS84_CRS`` first; the values are converted to
    built-in ``float``.
    """
    bounds = aoi_gdf.to_crs(WGS84_CRS).total_bounds
    west, south, east, north = bounds
    return float(west), float(south), float(east), float(north)

def _to_time_slice(start_str: str, end_str: str) -> slice:
    """Build a label slice covering ``start_str`` .. ``end_str`` inclusively.

    Both bounds are parsed with ``pandas.to_datetime``. An end bound without a
    time-of-day component (i.e. one that parses to midnight) is extended to the
    last nanosecond of that day. Otherwise a range ending on "2020-12-31"
    would stop at 00:00 and silently drop records stamped later that day
    (e.g. daily values time-stamped at noon).

    Args:
        start_str: First date/time to include.
        end_str: Last date/time to include; a plain date includes the whole day.

    Returns:
        ``slice(start, end)`` of ``pandas.Timestamp`` bounds.
    """
    start = pd.to_datetime(start_str)
    end = pd.to_datetime(end_str)
    if end == end.normalize():
        end = end + pd.Timedelta(days=1) - pd.Timedelta(1, unit="ns")
    return slice(start, end)

def _coord_names(ds: xr.Dataset) -> tuple[str, str]:
    """Return the names of the dataset's latitude-like and longitude-like coordinates.

    ``lat``/``lon`` are preferred; ``y``/``x`` are the respective fallbacks.

    NOTE(review): a name is accepted as soon as it appears in ``ds.coords``;
    this does not check that it is a one-dimensional, sliceable axis --
    confirm before using the result with ``.sel``.

    Raises:
        ValueError: If either coordinate cannot be found.
    """
    available = ds.coords
    lat = next((name for name in ("lat", "y") if name in available), None)
    lon = next((name for name in ("lon", "x") if name in available), None)
    if lat is None or lon is None:
        raise ValueError(f"Could not locate latitude/longitude coordinates in Daymet dataset. Found: {list(available)}")
    return lat, lon

def _resolve_daymet_asset_href(collection) -> str | None:
    """Return the href of the collection's Zarr asset, or ``None`` if there is none.

    An asset keyed ``"zarr"`` wins. Otherwise the first asset (in the order of
    ``collection.assets``) whose href ends in ``.zarr`` or whose roles include
    one of ``DAYMET_ZARR_ROLES`` is used.
    """
    direct = collection.assets.get("zarr")
    if direct is not None:
        return direct.href

    def _looks_like_zarr(candidate) -> bool:
        if candidate.href.endswith(".zarr"):
            return True
        # `roles` may be None on an asset; treat that as "no roles".
        return any(role in DAYMET_ZARR_ROLES for role in tuple(candidate.roles or ()))

    return next(
        (candidate.href for candidate in collection.assets.values() if _looks_like_zarr(candidate)),
        None,
    )

def _normalize_daymet_variables(requested: Sequence[str]) -> dict[str, str]:
    """Map requested variable names to Daymet dataset variable names.

    Names without an entry in ``DAYMET_VARIABLE_MAP`` are dropped and a
    warning is logged for each of them.

    Returns:
        ``{requested_name: dataset_name}`` in request order.
    """
    names = list(requested)

    # Report every unsupported name first, in request order ...
    for name in names:
        if DAYMET_VARIABLE_MAP.get(name) is None:
            log.warning(f"Requested climate variable '{name}' is not available in the Daymet collection and will be skipped.")

    # ... then keep only the supported ones.
    return {
        name: DAYMET_VARIABLE_MAP.get(name)
        for name in names
        if DAYMET_VARIABLE_MAP.get(name) is not None
    }

def _daymet_grid_crs(ds: xr.Dataset, data_var_names: Sequence[str]):
    """Return the CRS described by a data variable's CF grid-mapping variable.

    The first of `data_var_names` that exists in `ds` and names a grid-mapping
    variable decides. Returns ``None`` when no usable grid mapping is found.
    """
    for name in data_var_names:
        if name not in ds.data_vars:
            continue
        variable = ds[name]
        # Depending on decoding options, the reference lives in attrs or encoding.
        gm_name = variable.attrs.get("grid_mapping") or variable.encoding.get("grid_mapping")
        if gm_name and gm_name in ds.variables:
            try:
                return pyproj.CRS.from_cf(ds[gm_name].attrs)
            except pyproj.exceptions.CRSError:
                return None
    return None


def _axis_slice(axis, lower: float, upper: float) -> slice:
    """Return a label slice from `lower` to `upper` that follows the axis direction.

    Label-based slicing on a descending axis needs ``slice(upper, lower)``.
    """
    values = axis.values
    descending = values.size > 1 and values[0] > values[-1]
    return slice(upper, lower) if descending else slice(lower, upper)


def acquire_daymet_daily_via_zarr(
    aoi_gdf: gpd.GeoDataFrame,
    variables: Sequence[str] | None = None,
    clip_polygon: bool = False,
) -> None:
    """Download daily Daymet climate data from the Planetary Computer Zarr cube.

    The cube is subset to the configured date range (``PRISM_DATE_RANGE``) and
    to the AOI's bounding box, then written as one NetCDF file per variable and
    year under ``CLIMATE_OUT_DIR / "daily" / <variable>``. Existing files are
    skipped; each new file is written to a ``.part`` file first and gets a
    provenance sidecar.

    The spatial subset is taken in the dataset's own grid coordinates. When
    ``lat``/``lon`` are not dimensions of the dataset (as on a projected grid
    with 2-D lat/lon auxiliary coordinates), the ``x``/``y`` axes are sliced
    instead, with the AOI reprojected into the CRS given by the dataset's CF
    grid-mapping variable.

    Args:
        aoi_gdf: Area of interest (any CRS).
        variables: Variable names to fetch; defaults to ``PRISM_DAILY_VARS``.
        clip_polygon: If True, additionally mask the data to the AOI polygon
            instead of keeping the full bounding box.

    Returns:
        None. Problems that prevent the acquisition are logged and end the
        function early.
    """
    requested = tuple(variables) if variables is not None else tuple(PRISM_DAILY_VARS)
    var_map = _normalize_daymet_variables(requested)
    if not var_map:
        log.error("No valid climate variables were requested. Aborting Daymet acquisition.")
        return

    log.info("--- Starting Daymet daily climate acquisition via Planetary Computer ---")
    log.info(f"Variables requested: {list(var_map.keys())}")

    catalog = Client.open(PC_STAC_URL)
    collection = catalog.get_collection(DAYMET_COLLECTION)
    if collection is None:
        log.error(f"Unable to locate collection '{DAYMET_COLLECTION}' on the Planetary Computer.")
        return

    asset_href = _resolve_daymet_asset_href(collection)
    if asset_href is None:
        log.error("No Zarr asset was found for the Daymet collection; cannot proceed with climate acquisition.")
        return

    # NOTE(review): this access path is unverified. Signing appends a query
    # string to the href, and the mapper builds chunk URLs relative to it;
    # the Planetary Computer examples instead open the asset with its
    # `xarray:storage_options` / `xarray:open_kwargs` -- confirm this works.
    signed_href = planetary_computer.sign(asset_href)
    mapper = fsspec.get_mapper(signed_href, anon=True)

    try:
        ds = xr.open_zarr(mapper, consolidated=True)
    except TypeError:
        ds = xr.open_zarr(mapper)

    lat_name, lon_name = _coord_names(ds)

    time_slice = _to_time_slice(PRISM_DATE_RANGE[0], PRISM_DATE_RANGE[1])
    ds_time = ds.sel(time=time_slice)

    # --- Choose the axes to slice and the CRS they are expressed in ---
    # `.sel` only works on dimension coordinates. If lat/lon are 2-D auxiliary
    # coordinates, the grid's own x/y axes have to be used instead.
    if lat_name in ds_time.dims and lon_name in ds_time.dims:
        y_name, x_name = lat_name, lon_name
    elif "y" in ds_time.dims and "x" in ds_time.dims:
        y_name, x_name = "y", "x"
    else:
        log.error(
            f"Daymet dataset has no sliceable spatial axes (dims: {list(ds_time.dims)}); "
            "cannot subset to the AOI."
        )
        return

    grid_crs = None
    if (y_name, x_name) != ("lat", "lon"):
        grid_crs = _daymet_grid_crs(ds_time, tuple(var_map.values()))
        if grid_crs is None:
            log.warning(
                f"No CF grid mapping found for the Daymet grid; assuming '{x_name}'/'{y_name}' "
                f"are expressed in {WGS84_CRS}."
            )
    if grid_crs is None:
        grid_crs = pyproj.CRS.from_user_input(WGS84_CRS)

    # AOI bounds in the same coordinates as the axes that are being sliced.
    aoi_grid = aoi_gdf.to_crs(grid_crs)
    minx, miny, maxx, maxy = (float(v) for v in aoi_grid.total_bounds)

    spatial_subset = ds_time.sel({
        x_name: _axis_slice(ds_time[x_name], minx, maxx),
        y_name: _axis_slice(ds_time[y_name], miny, maxy),
    })

    if spatial_subset.time.size == 0:
        log.error("No temporal data remained after subsetting the Daymet cube. Please check the configured date range.")
        return
    if spatial_subset.sizes[x_name] == 0 or spatial_subset.sizes[y_name] == 0:
        log.error("No grid cells remained after subsetting the Daymet cube to the AOI bounding box.")
        return

    clip_geom = None
    if clip_polygon:
        # Geometry in grid coordinates, matching the CRS written onto the data below.
        clip_geom = [mapping(union_all(aoi_grid.geometry))]

    out_root = CLIMATE_OUT_DIR / "daily"
    out_root.mkdir(parents=True, exist_ok=True)

    start_year = pd.to_datetime(PRISM_DATE_RANGE[0]).year
    end_year = pd.to_datetime(PRISM_DATE_RANGE[1]).year

    total_saved = 0
    for output_var, dataset_var in var_map.items():
        if dataset_var not in spatial_subset.data_vars:
            log.warning(f"Daymet dataset does not include variable '{dataset_var}'. Skipping.")
            continue

        da = spatial_subset[dataset_var]
        if clip_geom is not None:
            # Tell rioxarray which axes and CRS the grid uses before clipping.
            da = da.rio.set_spatial_dims(x_dim=x_name, y_dim=y_name)
            da = da.rio.write_crs(grid_crs)
            da = da.rio.clip(clip_geom, grid_crs, drop=True)
        da = da.rename(output_var)

        var_dir = out_root / output_var
        var_dir.mkdir(parents=True, exist_ok=True)

        for year in range(start_year, end_year + 1):
            year_slice = slice(f"{year}-01-01", f"{year}-12-31")
            da_year = da.sel(time=year_slice)
            if da_year.sizes.get("time", 0) == 0:
                continue

            out_path = var_dir / f"daymet_{output_var}_daily_{year}.nc"
            if out_path.exists():
                log.info(f"Skipping existing Daymet file: {out_path.name}")
                continue

            # Atomic write: only a completely written file gets the final name.
            tmp_path = out_path.with_suffix(out_path.suffix + ".part")
            try:
                da_year.to_netcdf(tmp_path)
                tmp_path.replace(out_path)
            finally:
                # Remove a leftover partial file if writing failed; after a
                # successful rename there is nothing left to delete.
                tmp_path.unlink(missing_ok=True)

            write_provenance(
                artifact_path=out_path,
                source_info={
                    "stac_collection": DAYMET_COLLECTION,
                    "asset_href": asset_href,
                    "platform": "Microsoft Planetary Computer",
                },
                parameters={
                    "requested_variable": output_var,
                    "dataset_variable": dataset_var,
                    "time_range": [f"{year}-01-01", f"{year}-12-31"],
                    "spatial_subset": "polygon" if clip_polygon else "bbox",
                },
            )
            total_saved += 1

    log.info(f"✅ Daymet export complete. Saved {total_saved} new file(s) under {out_root}.")
    log.info("Daymet precipitation units are millimeters/day; temperatures are in degrees Celsius.")

# Run it
# Passes the WGS84 AOI and the configured variable list; with
# clip_polygon=False the AOI bounding box is kept rather than the exact polygon.
acquire_daymet_daily_via_zarr(master_aoi_wgs84, variables=PRISM_DAILY_VARS, clip_polygon=False)


In [None]:

# === 9. Acquire gNATSGO Tabular Soil Data ===

def acquire_gnatsgo_tabular_data(aoi_master: gpd.GeoDataFrame) -> None:
    """Fetch gNATSGO tabular soil data for the AOI via pygnatsgofetch.

    The expected result is ``<GNATSGO_JOB_NAME>_tabular_data.csv`` in
    SOILS_OUT_DIR. If that file already exists nothing is fetched. After a
    fetch, a non-empty file gets a provenance sidecar (including the CSV row
    count when it can be read); an empty or missing file is only logged.
    Errors raised while fetching are logged with a traceback, not propagated.

    Args:
        aoi_master: AOI handed to the fetcher as its ``shapefile`` argument.
    """
    log.info("--- Starting gNATSGO Tabular Soil Data Acquisition ---")

    csv_path = SOILS_OUT_DIR / f"{GNATSGO_JOB_NAME}_tabular_data.csv"
    if csv_path.exists():
        log.info(f"✅ Tabular gNATSGO data file '{csv_path.name}' already exists.")
        return

    log.info(f"Fetching tabular soil data for job '{GNATSGO_JOB_NAME}'. This may take several minutes...")

    try:
        fetcher = gnatsgo.Gnatsgo(SOILS_OUT_DIR)
        fetcher.get_data(shapefile=aoi_master, job_name=GNATSGO_JOB_NAME)

        if not csv_path.exists():
            log.warning("gNATSGO data fetch did not produce an output file.")
            return

        if not csv_path.stat().st_size > 0:
            log.info("gNATSGO query returned no data for the AOI; an empty marker file was created.")
            return

        # The row count is informational only: an unreadable CSV must not
        # prevent the provenance record from being written.
        try:
            n_records = int(pd.read_csv(csv_path).shape[0])
        except Exception:
            n_records = None

        write_provenance(
            artifact_path=csv_path,
            source_info={"database": "USDA-NRCS Soil Data Access", "tool": "pygnatsgofetch"},
            parameters={
                "job_name": GNATSGO_JOB_NAME,
                "record_count": n_records,
            },
        )
        log.info(f"✅ gNATSGO tabular data acquisition complete. See '{csv_path.name}'.")

    except Exception as e:
        log.error(f"An error occurred during gNATSGO processing: {e}", exc_info=True)

# --- Execute gNATSGO Workflow ---
# Unlike the DEM and Daymet steps, this call receives the AOI in its native CRS
# (`master_aodf`), not the WGS84 copy.
acquire_gnatsgo_tabular_data(master_aodf)
