In [4]:
# === 1. Configuration & Setup ===

# --- Core Libraries ---
import os
import re
import sys
import math
import time
import requests
import threading
import geopandas as gpd
import pandas as pd
import psutil
from pathlib import Path
from shapely.ops import unary_union
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.lines import Line2D


# --- Robust Path Resolution ---
def find_project_root(marker='README.md'):
    """Find the project root by searching upwards for a marker file."""
    path = Path.cwd().resolve()
    while path.parent != path:
        if (path / marker).exists():
            return path
        path = path.parent
    raise FileNotFoundError(f"Project root with marker '{marker}' not found.")

PROJECT_ROOT = find_project_root()
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROC_DIR = DATA_DIR / "processed"


# --- Input Paths (from Notebook 1.1) ---
STUDY_AREAS_GPKG = PROC_DIR / "study_areas.gpkg"
VALIDATION_AREA_GPKG = PROC_DIR / "validation_area.gpkg"
INTERNAL_HOLDOUT_GPKG = PROC_DIR / "internal_holdout.gpkg"


# --- Output Paths (for this Notebook) ---
DEM_OUT_DIR = RAW_DIR / "dem_30m"
CLIMATE_OUT_DIR = RAW_DIR / "climate_prism"
SOILS_OUT_DIR = RAW_DIR / "soils_gnatsgo"
DEM_OUT_DIR.mkdir(parents=True, exist_ok=True)
CLIMATE_OUT_DIR.mkdir(parents=True, exist_ok=True)
SOILS_OUT_DIR.mkdir(parents=True, exist_ok=True)


# --- System-Aware Parallelism (Dynamic Resource Allocation) ---
PHYSICAL_CORES = psutil.cpu_count(logical=False)
MAX_CONCURRENT_DOWNLOADS = max(1, PHYSICAL_CORES - 1)


# --- Data Source Configuration ---
# A. USGS 3DEP 1-Arcsecond (30m) DEMs
S3_BASE_URL = "https://prd-tnm.s3.amazonaws.com/StagedProducts/Elevation/1"
S3_URL_PATTERNS = [
    "TIFF/current/{key}/USGS_1_{key}.tif",
    "TIFF/{key}/USGS_1_{key}.tif",
    "GeoTIFF/{key}/USGS_1_{key}.tif",
    "TIFF/current/{key}/USGS_NED_1_{key}.tif",
    "TIFF/{key}/USGS_NED_1_{key}.tif",
]


# --- Global Session for HTTP Requests ---
DL_SESSION = requests.Session()
DL_SESSION.headers.update({
    "User-Agent": f"UGA-TDA-Research/1.0 (Contact: {os.getenv('USER', 'user')}@uga.edu; WEPP Analysis)",
})


# --- Print Setup Summary ---
print("--- Configuration Summary ---")
print(f"Project Root:       {PROJECT_ROOT}")
print(f"Output DEMs:        {DEM_OUT_DIR}")
print(f"Output Climate:     {CLIMATE_OUT_DIR}")
print(f"Output Soils:       {SOILS_OUT_DIR}")
print(f"Concurrent Workers: {MAX_CONCURRENT_DOWNLOADS} (Based on {PHYSICAL_CORES} physical cores)")
print("-" * 29)

--- Configuration Summary ---
Project Root:       /workspace
Output DEMs:        /workspace/data/raw/dem_30m
Output Climate:     /workspace/data/raw/climate_prism
Output Soils:       /workspace/data/raw/soils_gnatsgo
Concurrent Workers: 9 (Based on 10 physical cores)
-----------------------------


In [5]:
# === 2. Create Master Area of Interest (AOI) ===

print("Loading and merging all study area boundaries...")

# Load all three boundary files generated by notebook 1.1
gdf_study = gpd.read_file(STUDY_AREAS_GPKG, layer='study_areas')
gdf_val = gpd.read_file(VALIDATION_AREA_GPKG, layer='validation_area')
gdf_holdout = gpd.read_file(INTERNAL_HOLDOUT_GPKG, layer='internal_holdout')

# Concatenate all geometries into a single GeoDataFrame
gdf_all = pd.concat([gdf_study, gdf_val, gdf_holdout], ignore_index=True)
print(f"Loaded a total of {len(gdf_all)} polygons.")

# Dissolve into a single master geometry for consistent data coverage
master_aoi_geom = unary_union(gdf_all.geometry)
master_aoi_gdf = gpd.GeoDataFrame(geometry=[master_aoi_geom], crs=gdf_all.crs)

# Reproject to WGS84 (EPSG:4326) for querying and tile calculations
master_aoi_wgs84 = master_aoi_gdf.to_crs("EPSG:4326")

print("✅ Master AOI created and reprojected to EPSG:4326.")
display(master_aoi_wgs84)
master_aoi_wgs84.explore()

Loading and merging all study area boundaries...


ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/envs/app/share/proj failed


Loaded a total of 7 polygons.
✅ Master AOI created and reprojected to EPSG:4326.


Unnamed: 0,geometry
0,"MULTIPOLYGON (((-97.18027 26.07179, -97.2122 2..."


In [6]:
# === 3. Downloader Utility Functions ===

_KEY_RE = re.compile(r"(?i)(?:USGS_1_|USGS_NED_1_)?([ns]\d{2}[ew]\d{3})")

def _key_from_name(name: str):
    """Extracts a tile key (e.g., n35w084) from a filename."""
    m = _KEY_RE.search(name)
    return m.group(1).lower() if m else None

def disk_tile_keys(folder: Path):
    """Scans a directory for raster files and returns a set of their keys."""
    keys = set()
    for p in folder.glob("**/*.tif"):
        k = _key_from_name(p.name)
        if k: keys.add(k)
    return keys

def key_from_lon_lat(lon: float, lat: float) -> str:
    """Generates a tile key from a longitude/latitude coordinate."""
    hemi_ns = "n" if lat >= 0 else "s"
    hemi_ew = "e" if lon >= 0 else "w"
    return f"{hemi_ns}{abs(int(lat)):02d}{hemi_ew}{abs(int(lon)):03d}"

print("Utility functions defined.")

Utility functions defined.


In [7]:
# === 4. Identify Required DEM Tiles via Spatial Analysis ===

print("Performing spatial intersection to find required DEM tile keys...")
minx, miny, maxx, maxy = master_aoi_wgs84.total_bounds

# Generate a grid of all possible 1x1 degree tiles covering the AOI's bounds
lons = range(math.floor(minx), math.ceil(maxx))
lats = range(math.floor(miny), math.ceil(maxy))
tile_geoms = [box(lon, lat, lon + 1, lat + 1) for lon in lons for lat in lats]
tile_keys = [key_from_lon_lat(lon, lat) for lon in lons for lat in lats]
candidate_tiles_gdf = gpd.GeoDataFrame({"key": tile_keys}, geometry=tile_geoms, crs="EPSG:4326")

# Perform the spatial join to find only the intersecting tiles
intersecting_tiles = gpd.sjoin(candidate_tiles_gdf, master_aoi_wgs84, how="inner", predicate="intersects")
required_dem_keys = sorted(intersecting_tiles["key"].unique())

print(f"Identified {len(required_dem_keys)} required tiles (vs. {len(candidate_tiles_gdf)} in bounding box).")
print(f"Skipping {len(candidate_tiles_gdf) - len(required_dem_keys)} unnecessary downloads.")

Performing spatial intersection to find required DEM tile keys...


NameError: name 'box' is not defined

In [8]:
# === 5. DEM Pre-flight Check ===

print("Scanning local directory for existing DEM tiles...")
on_disk_dem_keys = disk_tile_keys(DEM_OUT_DIR)
missing_dem_keys = sorted(set(required_dem_keys) - on_disk_dem_keys)

print("\n--- DEM Coverage Summary ---")
print(f"   Tiles Required: {len(required_dem_keys)}")
print(f"    Tiles On Disk: {len(on_disk_dem_keys)}")
print(f"    Tiles Missing: {len(missing_dem_keys)}")

if not missing_dem_keys:
    print("\n✅ All required DEM tiles are already on disk. No download needed.")
else:
    print(f"\nSample of missing DEM keys: {missing_dem_keys[:10]}")

Scanning local directory for existing DEM tiles...


NameError: name 'required_dem_keys' is not defined

In [None]:
# === 6. S3 Probe & Parallel Download Functions ===

def _head_ok(url: str, timeout: int = 15):
    """Checks if a URL is valid and accessible using a HEAD request."""
    try:
        r = DL_SESSION.head(url, allow_redirects=True, timeout=timeout)
        # Fallback for servers that don't allow HEAD
        if r.status_code in (403, 405):
            r = DL_SESSION.get(url, headers={"Range": "bytes=0-0"}, stream=True, timeout=timeout)
        return r.status_code == 200
    except requests.RequestException:
        return False

def build_dem_download_jobs(keys_to_find: list) -> tuple:
    """Probes S3 for valid DEM URLs and returns a list of jobs."""
    jobs = []
    not_found_keys = []
    print(f"\nProbing S3 for download URLs for {len(keys_to_find)} missing DEM tiles...")

    for key in tqdm(keys_to_find, desc="S3 Probe", unit="key"):
        found_url = False
        for pattern in S3_URL_PATTERNS:
            url = f"{S3_BASE_URL}/{pattern.format(key=key)}"
            if _head_ok(url):
                filename = f"USGS_1_{key}.tif"
                jobs.append({"url": url, "out_path": DEM_OUT_DIR / filename, "key": key})
                found_url = True
                break
        if not found_url:
            not_found_keys.append(key)
    return jobs, not_found_keys

def _download_job(job: dict, stats: dict, retry_limit: int = 3, timeout_sec: int = 60):
    """Worker function to download a single file."""
    url, out_path = job["url"], job["out_path"]
    if out_path.exists():
        with stats["lock"]: stats["skipped"] += 1
        return f"SKIP (exists): {out_path.name}"

    for attempt in range(1, retry_limit + 1):
        try:
            with DL_SESSION.get(url, stream=True, timeout=timeout_sec) as r:
                r.raise_for_status()
                tmp_path = out_path.with_suffix(out_path.suffix + ".part")
                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1 << 20): # 1MB chunks
                        f.write(chunk)
                tmp_path.rename(out_path)
            return f"OK: {out_path.name}"
        except requests.RequestException as e:
            if attempt < retry_limit: time.sleep(2 * attempt)
            else: return f"FAIL: {out_path.name} ({e})"
    return f"FAIL (retries exhausted): {out_path.name}"

def execute_downloads(jobs: list, description: str):
    """Executes a list of download jobs in parallel with a progress bar."""
    if not jobs:
        print(f"No new files to download for {description}.")
        return

    stats = {"lock": threading.Lock(), "skipped": 0}
    print(f"⬇️  Starting parallel download for {len(jobs)} {description} files...")

    with ThreadPoolExecutor(max_workers=MAX_CONCURRENT_DOWNLOADS) as pool:
        future_to_job = {pool.submit(_download_job, job, stats): job for job in jobs}
        results = [
            future.result()
            for future in tqdm(as_completed(future_to_job), total=len(jobs), desc=description, unit="file")
        ]

    failures = [r for r in results if r.startswith("FAIL")]
    if failures:
        print(f"\n⚠️  Completed with {len(failures)} failures (see logs for details).")
    print(f"\n✅ Download process complete. Skipped {stats['skipped']} existing files.")

print("High-performance downloader functions defined.")

In [None]:
# === 7. Execute DEM Download Workflow ===

if 'ipykernel' in sys.modules and missing_dem_keys:
    # 1. Build the list of download jobs by probing S3
    dem_download_jobs, dem_unavailable_keys = build_dem_download_jobs(missing_dem_keys)

    if dem_download_jobs:
        # 2. Display a preview of the download plan
        df_plan = pd.DataFrame([{"id": j["key"], "url": j["url"]} for j in dem_download_jobs])
        print("\n--- DEM Download Plan ---")
        display(df_plan.head())

        # 3. Plot the missing tiles
        fig, ax = plt.subplots(figsize=(12, 8))
        master_aoi_wgs84.boundary.plot(ax=ax, color="k", linewidth=1.5, zorder=2)
        candidate_tiles_gdf.plot(ax=ax, facecolor="none", edgecolor='gray', linewidth=0.2, zorder=1)
        missing_geoms = candidate_tiles_gdf[candidate_tiles_gdf['key'].isin(missing_dem_keys)].geometry
        gpd.GeoDataFrame(geometry=missing_geoms).plot(ax=ax, facecolor="orange", edgecolor="darkred", alpha=0.6, zorder=3)
        ax.set_title(f"Coverage Map: {len(missing_dem_keys)} Missing DEM Tiles (Orange)")
        legend_elements = [Line2D([0],[0], color="k", lw=1.5, label="AOI Boundary"), Rectangle((0,0),1,1, facecolor="orange", alpha=0.6, label="Missing Tile")]
        ax.legend(handles=legend_elements, loc='best')
        plt.show()

        # 4. Execute the downloads
        execute_downloads(dem_download_jobs, "DEM Tiles")
    else:
        print("\nProbe complete. No downloadable files found for the missing DEM keys.")
else:
    print("\nSkipping DEM download execution (no missing keys or not in an interactive session).")

In [None]:
# === 8. DEM Post-flight Check ===

print("\nRunning post-flight verification for DEM data...")
on_disk_keys_after = disk_tile_keys(DEM_OUT_DIR)
missing_after = sorted(set(required_dem_keys) - on_disk_keys_after)

# --- Text Summary ---
print("---- DEM Coverage (Post-Download) ----")
print(f"  Tiles Required: {len(required_dem_keys)}")
print(f"   Tiles On Disk: {len(on_disk_keys_after)}")
print(f"   Still Missing: {len(missing_after)} (likely unavailable from source)")

if not missing_after:
    print("\n✅ All required DEM tiles are now on disk.")

# --- Map Visualization ---
fig, ax = plt.subplots(figsize=(12, 8))
master_aoi_wgs84.boundary.plot(ax=ax, color="k", linewidth=1.5, zorder=3)

# Plot acquired tiles in green
acquired_geoms = candidate_tiles_gdf[candidate_tiles_gdf['key'].isin(on_disk_keys_after)].geometry
if not acquired_geoms.empty:
    gpd.GeoDataFrame(geometry=acquired_geoms).plot(ax=ax, facecolor="green", edgecolor="white", alpha=0.7, zorder=2)

# Plot unavailable tiles in orange
unavailable_geoms = candidate_tiles_gdf[candidate_tiles_gdf['key'].isin(missing_after)].geometry
if not unavailable_geoms.empty:
    gpd.GeoDataFrame(geometry=unavailable_geoms).plot(ax=ax, facecolor="orange", edgecolor="darkred", alpha=0.6, zorder=1)

ax.set_title(f"Post-Download DEM Coverage: {len(on_disk_keys_after)} / {len(required_dem_keys)} Tiles Acquired", fontsize=16)
legend_elements = [
    Line2D([0],[0], color="k", lw=1.5, label="AOI Boundary"),
    Rectangle((0,0),1,1, facecolor="green", alpha=0.7, label=f"Acquired ({len(on_disk_keys_after)})"),
    Rectangle((0,0),1,1, facecolor="orange", alpha=0.6, label=f"Unavailable ({len(missing_after)})")
]
ax.legend(handles=legend_elements, loc='best')
plt.show()