# Get target toponyms

Retrieve and format a list of toponyms and unique hydrographic identifiers (IDs) associated with a set of input locations.
Possible inputs include: a table of coordinates, a vector dataset (points, lines or polygons), or specific basin IDs.
The toponyms and IDs are retrieved from the US National Hydrography Dataset Plus (NHDPlus), Global Administrative Areas (GADM) dataset, and HydroATLAS.
This script only defines functions, which are then used in create_location_filter_string.ipynb.

In [1]:
%run download_geographic_refs.ipynb
%run set_up.py 

import geopandas as gpd
import numpy as np
import pandas as pd #require pip install pyarrow
from shapely.geometry import Point
import re
import warnings

verbose=True #Whether to print detailed progress messages

In [7]:
# Local input/output paths used by the functions below.
# NOTE(review): `os`, `datdir`, `nhd_dir`, `hydroatlas_dir` and `gadm_dir` are not
# defined in this notebook; presumably they come from set_up.py -- confirm.
test_pts_path = os.path.join(datdir, 'test_gages', 'test_gages.shp') #Example shapefile to show how the code functions: retire

wbd_path = os.path.join(nhd_dir, 'WBD_National_GDB.gdb') #Local path to US Watershed Boundary Dataset
hu12_parquet = os.path.join(nhd_dir, 'wbd_hu12list.parquet') #Local path to output table of all HUC12

basinatlas_path = os.path.join(hydroatlas_dir,  'BasinATLAS_v10.gdb') #Local path to the BasinATLAS geodatabase
basinatlas11_parquet = os.path.join(hydroatlas_dir, 'basinatlas_lev11_idlist.parquet') #Local path to output table of all HydroBASINS level 11
#geoglows_vpu_path = os.path.join(geoglows_dir, 'vpu-boundaries.gpkg')
gadm_path = os.path.join(gadm_dir, 'gadm_410-levels.gpkg') #Local path to the GADM levels geopackage

In [8]:
def _expand_basin_idlist(in_id_list: list,
                         in_refids_parquet: str,
                         refids_col: str,
                         out_id_range: list):
    """
    Expand a list of basin IDs into the matching reference IDs and their prefixes at other levels.

    Reference IDs are read from a Parquet file and kept when their string form starts
    with the string form of any input ID. For every requested level, a column holding
    the first `level` characters of the reference ID is then added.

    Args:
        in_id_list (list): Non-empty list of input basin IDs (all strings or all integers).
                           The type and length of the FIRST element are taken as
                           representative of the whole list.
        in_refids_parquet (str): Path to the reference Parquet file.
        refids_col (str): Name of the column in the Parquet file containing the full basin IDs.
        out_id_range (iterable of int): Levels (prefix lengths) to extract, e.g. [6, 9, 12]
                                        or range(3, 12).

    Returns:
        pandas.DataFrame: Rows of the reference table matching the input IDs. For each level
                          in `out_id_range`, a column named `<root><level>` is added unless
                          it already exists, where `<root>` is `refids_col` with its digits
                          removed. Added columns are numeric when the input IDs are integers
                          and strings otherwise. `refids_col` is returned as numeric when it
                          held integers in the reference table.

    Raises:
        TypeError: If the input IDs or the reference IDs are neither strings nor integers.
        ValueError: If `in_id_list` is empty or None, if the reference table is empty,
                    if the input ID is longer than the reference ID, if `out_id_range`
                    contains a level longer than the reference ID, or if no reference ID
                    matches.
        Exception: Whatever `pandas.read_parquet` raises (e.g. for a missing file).
    """

    # Fail early with a clear message rather than an IndexError/TypeError on in_id_list[0].
    if in_id_list is None or len(in_id_list) == 0:
        raise ValueError("in_id_list must contain at least one basin ID")

    # The levels are iterated twice below (validation, then expansion);
    # materialize them so that one-shot iterables are also supported.
    levels = list(out_id_range)

    id_all_pd = pd.read_parquet(in_refids_parquet)
    if id_all_pd.empty:
        raise ValueError(f"Reference table {in_refids_parquet} is empty")

    # --- Determine input type and ID length (from the first element) ---
    first_in_id = in_id_list[0]
    if isinstance(first_in_id, str):
        in_is_int = False
    elif isinstance(first_in_id, (int, np.integer)) and not isinstance(first_in_id, bool):
        # numpy integers are accepted as well as Python ints
        in_is_int = True
    else:
        raise TypeError("in_id_list must be a list of strings or integers")
    in_id_len = len(str(first_in_id))

    # --- Determine the reference ID type and length (from the first row) ---
    first_ref_id = id_all_pd[refids_col].iloc[0]
    if isinstance(first_ref_id, str):
        ref_is_int = False
    elif isinstance(first_ref_id, (int, np.integer)) and not isinstance(first_ref_id, bool):
        ref_is_int = True
    else:
        raise TypeError(f"{refids_col} must contain strings or integers")
    refid_len = len(str(first_ref_id))

    if in_id_len > refid_len:
        raise ValueError(f"Input ID length ({in_id_len}) is greater than reference ID length ({refid_len})")

    # --- Input validation on out_id_range ---
    if any(level > refid_len for level in levels):
        raise ValueError(f"out_id_range values cannot exceed reference ID length ({refid_len})")

    # --- Filtering ---
    # Work on strings for consistent prefix matching; converted back at the end.
    id_all_pd[refids_col] = id_all_pd[refids_col].astype(str)
    prefixes = tuple(str(x) for x in in_id_list)
    id_pd = id_all_pd[id_all_pd[refids_col].str.startswith(prefixes)].copy()

    if id_pd.empty:
        raise ValueError("No matching IDs found. Check in_id_list and refids_col.")

    # --- Column expansion ---
    # Base name of the output columns, e.g. 'PFAF_ID11' -> 'PFAF_ID', 'huc12' -> 'huc'
    colroot = re.sub(r'[0-9]+', '', refids_col)
    for id_level in levels:
        col_name = f'{colroot}{id_level}'
        if col_name in id_pd.columns:
            continue  # e.g. the reference column itself
        level_ids = id_pd[refids_col].str[:id_level]
        if in_is_int:
            level_ids = pd.to_numeric(level_ids, errors='raise')
        # Plain column assignment replaces the column and therefore its dtype.
        # `.loc[:, col] = ...` writes into the existing column in place (pandas >= 2.0),
        # which would keep the string/object dtype.
        id_pd[col_name] = level_ids

    # Convert the reference column back to numeric if it held integers
    if ref_is_int:
        id_pd[refids_col] = pd.to_numeric(id_pd[refids_col], errors='raise')

    return id_pd

In [9]:
def _format_gdf_tojoin(in_xytab: str = None, 
                       lon_col: str = None, 
                       lat_col: str = None, 
                       in_crs: str = "EPSG:4326",
                       in_vector: str = None, 
                       hull: bool = True):
    """
    Creates a GeoDataFrame from either XY coordinates in a table or a vector file,
    optionally replacing its geometries by the convex hull of their union.

    Note: This function expects EITHER in_xytab (with lon/lat cols) OR in_vector 
          to be provided, not both. If both are provided, in_vector takes precedence
          and in_xytab is not read.

    Args:
        in_xytab (str, optional): Path to a delimited table containing point coordinates,
                                  read with pd.read_table (tab-delimited by default).
                                  Defaults to None.
        lon_col (str, optional): Name of the longitude (x) column in in_xytab. 
                                 Required if in_xytab is used. Defaults to None.
        lat_col (str, optional): Name of the latitude (y) column in in_xytab. 
                                 Required if in_xytab is used. Defaults to None.
        in_crs (str, optional): Coordinate reference system assigned to the points built
                                from in_xytab. Not used with in_vector. Defaults to "EPSG:4326".
        in_vector (str, optional): Path to a vector file readable by gpd.read_file.
                                   Defaults to None.
        hull (bool, optional): If True, computes the convex hull of the union of all geometries,
                               returning a GeoDataFrame with a single geometry. 
                               Defaults to True.

    Returns:
        geopandas.GeoDataFrame or None: 
            If hull=True, a single-row GeoDataFrame holding the convex hull, with the CRS of the input.
            If hull=False, the geometries (and attributes) read from the input source.
            None if neither in_xytab nor in_vector is provided.

    Raises:
        ValueError: If in_xytab is used without lon_col or lat_col.
        KeyError: If lon_col or lat_col are not found in the table read from in_xytab.
        Exception: Depending on pandas or geopandas file reading errors.
    """
    
    # Defined up front so that the function returns None when no input is provided.
    gdf_to_join = None

    if in_vector:
        # --- Branch 1: Read GeoDataFrame directly from a vector file (takes precedence) ---
        gdf_to_join = gpd.read_file(in_vector)
    elif in_xytab:
        # --- Branch 2: Create GeoDataFrame from an XY table ---
        if lon_col is None or lat_col is None:
            raise ValueError("lon_col and lat_col are required when in_xytab is provided")
        points_df = pd.read_table(in_xytab) 
        gdf_to_join = gpd.GeoDataFrame(
            points_df,
            geometry=[Point(xy) for xy 
                      in zip(points_df[lon_col], points_df[lat_col])],
            crs=in_crs
        )

    # --- Optional Step: Calculate Convex Hull ---
    if hull and gdf_to_join is not None:
        # Wrap the hull of the unioned geometries in a new single-row GeoDataFrame,
        # carrying over the CRS of the input geometries.
        gdf_to_join = gpd.GeoDataFrame(
            geometry=[gdf_to_join.geometry.union_all().convex_hull], 
            crs=gdf_to_join.crs
        )
        
    return gdf_to_join 

In [14]:
#Create a list of PFAF_ID for basins level 11
def create_basinatlas11_list(basinatlas_path: str, 
                             out_basinatlas11_parquet: str, 
                             verbose: bool = True):
    """
    Creates or loads a list of PFAF_ID values for BasinATLAS level 11 basins.

    This function acts as a cache. If the output Parquet file exists, it loads it.
    Otherwise, it reads the 'BasinATLAS_v10_lev11' layer from the input BasinATLAS file
    (attributes only), casts the PFAF_ID column to nullable integers, renames it
    to 'PFAF_ID11', saves it to Parquet, and then returns it.

    Args:
        basinatlas_path (str): Path to the input BasinATLAS dataset 
                              (e.g., GeoPackage or File Geodatabase).
        out_basinatlas11_parquet (str): Path where the extracted list of 
                                        level 11 PFAF IDs will be saved 
                                        or loaded from (as a Parquet file).
        verbose (bool, optional): If True, prints messages about generating 
                                  or loading the file. Defaults to True.

    Returns:
        pandas.DataFrame: When generated, a DataFrame with a single column named 'PFAF_ID11' 
                          holding the level 11 basin IDs as nullable integers (Int64).
                          When loaded, whatever the existing Parquet file contains.

    Raises:
        Exception: Exceptions from geopandas or pandas file IO, e.g. if basinatlas_path 
                   or the expected layer/column does not exist when generation is needed.
    """
    
    # --- File Loading Branch: the processed list already exists ---
    if os.path.exists(out_basinatlas11_parquet):
        if verbose:
             print(f"Found existing file. Loading BasinATLAS level 11 list from {out_basinatlas11_parquet}")
        return pd.read_parquet(out_basinatlas11_parquet)

    # --- File Generation Branch ---
    if verbose:
        print(f'Generating a list of PFAF ID level 11 and saving it to '
              f'{out_basinatlas11_parquet}')

    # The method chain is parenthesized: a backslash continuation cannot be
    # followed by a comment.
    basinatlas11_list = (
        gpd.read_file(
            filename=basinatlas_path,
            layer='BasinATLAS_v10_lev11',
            columns=['PFAF_ID'],
            # WARNING: Hardcoded row count. Only the first 1,031,785 rows are read,
            # which assumes the layer never holds more rows than that.
            rows=1031785,
            ignore_geometry=True
        )
        # Pandas nullable integer type, so that missing values do not force floats
        .astype(pd.Int64Dtype())
        # Flag the column as level 11
        .rename(columns={"PFAF_ID": "PFAF_ID11"})
    )

    basinatlas11_list.to_parquet(out_basinatlas11_parquet)

    return basinatlas11_list

In [None]:
def get_matching_hydrobasin(in_basinatlas_path: str,
                            in_xytab: str = None, lon_col: str = None, lat_col: str = None,
                            in_vector: str = None, 
                            in_id_list: list = None, 
                            in_refids_parquet: str = None,
                            hull: bool = True, 
                            sjoin_predicate: str = 'intersects'):
    """
    Finds corresponding HydroBASINS IDs (levels 3-11) based on input spatial data 
    (XY coordinates in a table, or a vector file) or a provided list of basin IDs.

    Workflow:
    1. If spatial data (in_xytab or in_vector) is provided:
       a. Formats the input into a GeoDataFrame (optionally using convex hull).
       b. Reads the 'BasinATLAS_v10_lev06' layer (level 6 basins).
       c. Performs a spatial join to find level 6 basins matching the input geometry.
       d. Extracts the list of matching level 6 PFAF IDs. This list overrides any `in_id_list` passed as input.
    2. If only `in_id_list` is provided (and no spatial data):
       a. Uses the provided `in_id_list` directly.
    3. Expands the determined list of basin IDs (from step 1d or 2a) to levels 3-11 
       using a reference Parquet file containing level 11 IDs.

    Args:
        in_basinatlas_path (str): Path to the BasinATLAS dataset containing the 
                                  'BasinATLAS_v10_lev06' layer.
        in_xytab (str, optional): Path to table with XY coordinates. Defaults to None.
        lon_col (str, optional): Longitude column name in in_xytab. Defaults to None.
        lat_col (str, optional): Latitude column name in in_xytab. Defaults to None.
        in_vector (str, optional): Path to vector file. Defaults to None.
        in_id_list (list, optional): A pre-defined list of basin IDs (typically level 6 PFAF_IDs 
                                     if bypassing spatial join). Defaults to None. 
                                     Note: This is ignored if in_xytab or in_vector is provided.
        in_refids_parquet (str, optional): Path to reference Parquet file with a 'PFAF_ID11' 
                                           column (required for expansion). Defaults to None.
        hull (bool, optional): If True and using spatial input, compute convex hull before 
                               spatial join. Defaults to True.
        sjoin_predicate (str, optional): Spatial predicate for the join ('intersects', 'within', etc.). 
                                         Defaults to 'intersects'.

    Returns:
        pandas.DataFrame: DataFrame with expanded basin IDs for levels 3 through 11, 
                          corresponding to the input features or ID list.

    Raises:
        ValueError: If there is no basin ID to expand (no input provided, empty `in_id_list`,
                    or no level 6 basin matching the input geometry).
        Depends on the underlying functions otherwise: FileNotFoundError, KeyError, ValueError, etc.
    """
    
    # --- Step 1: Determine initial basin IDs (Level 6) ---
    if in_xytab or in_vector:
        # Format the input spatial data into a GeoDataFrame using the helper function.
        # 'hull=True' (default) will create a convex hull polygon before joining.
        gdf_to_join = _format_gdf_tojoin(in_xytab=in_xytab, lon_col=lon_col, lat_col=lat_col, 
                                         in_vector=in_vector, 
                                         hull=hull)
         
        # Read the level 6 BasinATLAS layer, keeping only the ID column
        bas_lev6 = gpd.read_file(filename=in_basinatlas_path, 
                                 layer='BasinATLAS_v10_lev06',
                                 columns=['PFAF_ID']
                                ).rename(columns={"PFAF_ID": "PFAF_ID6"})
     
        # Spatial join, after reprojecting the input to the CRS of the basin layer.
        matched_bas = gpd.sjoin(gdf_to_join.to_crs(crs=bas_lev6.crs),
                                bas_lev6, 
                                how='left', 
                                predicate=sjoin_predicate)
                                
        # Unique, non-null level 6 PFAF IDs found by the spatial join.
        # The left join leaves NaN for input features without a match, which upcasts an
        # integer ID column to float; cast back so that IDs are not passed on as e.g. 123456.0.
        # IMPORTANT: This result overrides any `in_id_list` provided as an argument.
        in_id_list = (matched_bas['PFAF_ID6']
                      .dropna()
                      .astype('int64')
                      .unique()
                      .tolist())

    if in_id_list is None or len(in_id_list) == 0:
        raise ValueError("No basin IDs to expand: provide in_xytab, in_vector or a non-empty "
                         "in_id_list, and check that the input geometry matches a level 6 basin.")

    # --- Step 2: Expand the determined list of basin IDs to levels 3-11 ---
    # This step runs using either the `in_id_list` from the spatial join (if performed)
    # or the `in_id_list` provided directly as input (if no spatial data was given).
    pfaf_pd = _expand_basin_idlist(
        in_id_list=in_id_list,              
        in_refids_parquet=in_refids_parquet,
        refids_col='PFAF_ID11',           
        out_id_range=range(3, 12)  
    )

    return pfaf_pd

In [None]:
#TO BE COMPLETED
def get_hydroatlas_data():
    """Placeholder: only prints a status message for now and returns None."""
    status_msg = 'Getting HydroATLAS data'
    print(status_msg)

In [None]:
def create_huc12_list(wbd_path: str, 
                      out_hu12_parquet: str,
                      verbose: bool = True):
    """
    Return the table of HUC12 codes, building it from the WBD on first use.

    The Parquet file at `out_hu12_parquet` serves as a cache: when it exists it is
    simply read back. Otherwise the 'WBDHU12' layer of `wbd_path` is read
    (the 'huc12' column only, without geometry), written to `out_hu12_parquet`,
    and returned.

    Args:
        wbd_path (str): Path to the Watershed Boundary Dataset 
                       (e.g., GeoPackage or File Geodatabase).
        out_hu12_parquet (str): Path of the Parquet file the HUC12 table is 
                                loaded from, or saved to when it does not exist yet.
        verbose (bool, optional): If True, prints whether the table is being 
                                  generated or loaded. Defaults to True.

    Returns:
        pandas.DataFrame: When generated, the 'huc12' column read from the WBD layer.
                          When loaded, the content of the existing Parquet file.

    Raises:
        Exception: Exceptions from geopandas or pandas file IO, e.g. if wbd_path 
                   or the expected layer/column does not exist when generation is needed.
    """

    cache_found = os.path.exists(out_hu12_parquet)

    # --- File Loading Branch ---
    if cache_found:
        if verbose:
            print(f"Found existing file. Loading HUC12 list from {out_hu12_parquet}")
        return pd.read_parquet(out_hu12_parquet)

    # --- File Generation Branch ---
    if verbose:
        print(f'Generating a list of HUC 12 and saving it to '
              f'{out_hu12_parquet}')

    # Attributes only: geometry is not needed for the list of codes.
    huc12_table = gpd.read_file(filename=wbd_path,
                                layer='WBDHU12',
                                columns=['huc12'],
                                ignore_geometry=True)
    huc12_table.to_parquet(out_hu12_parquet)

    return huc12_table

In [None]:
def get_matching_NHD_HU(in_wbd_path: str,
                        in_xytab: str = None, lon_col: str = None, lat_col: str = None,
                        in_vector: str = None, 
                        in_id_list: list = None, 
                        in_refids_parquet: str = None,
                        hull: bool = True, 
                        sjoin_predicate: str = 'intersects'):
    """
    Finds corresponding NHD Hydrologic Unit Codes (HUC2 to HUC12) based on 
    input spatial data (XY table or vector file) or a provided list of HUC IDs.

    Workflow:
    1. If spatial data (in_xytab or in_vector) is provided:
       a. Formats the input into a GeoDataFrame (optionally using convex hull).
       b. Reads the 'WBDHU6' layer (HUC6 polygons) from the Watershed Boundary Dataset (WBD).
       c. Performs a spatial join to find HUC6 units matching the input geometry.
       d. Extracts the list of matching HUC6 codes. This list overrides any `in_id_list` passed as input.
    2. If only `in_id_list` is provided (and no spatial data):
       a. Uses the provided `in_id_list` directly (assumed to be HUC6 codes).
    3. Expands the determined list of HUC IDs (from step 1d or 2a) to levels 2, 4, 6, 8, 10, 12 
       using a reference Parquet file containing HUC12 codes.

    Args:
        in_wbd_path (str): Path to the NHD Watershed Boundary Dataset (WBD) 
                          (e.g., GeoPackage or File Geodatabase).
        in_xytab (str, optional): Path to table with XY coordinates. Defaults to None.
        lon_col (str, optional): Longitude column name in in_xytab. Defaults to None.
        lat_col (str, optional): Latitude column name in in_xytab. Defaults to None.
        in_vector (str, optional): Path to vector file. Defaults to None.
        in_id_list (list, optional): A pre-defined list of HUC IDs (typically HUC6 codes 
                                     if bypassing spatial join). Defaults to None. 
                                     Note: This is ignored if in_xytab or in_vector is provided.
        in_refids_parquet (str, optional): Path to reference Parquet file with a 'huc12' column 
                                           (required for expansion). Defaults to None.
        hull (bool, optional): If True and using spatial input, compute convex hull before 
                               spatial join. Defaults to True.
        sjoin_predicate (str, optional): Spatial predicate for the join ('intersects', 'within', etc.). 
                                         Defaults to 'intersects'.

    Returns:
        pandas.DataFrame: DataFrame with expanded HUC codes for levels 2, 4, 6, 8, 10, and 12, 
                          corresponding to the input features or ID list.

    Raises:
        ValueError: If there is no HUC ID to expand (no input provided, empty `in_id_list`,
                    or no HUC6 unit matching the input geometry).
        Depends on the underlying functions otherwise: FileNotFoundError, KeyError, ValueError, etc.
    """

    # --- Step 1: Determine initial HUC IDs (Level 6) ---
    if in_xytab or in_vector:
        # Format the input spatial data into a GeoDataFrame using the helper function.
        # 'hull=True' (default) will create a convex hull polygon before joining.
        gdf_to_join = _format_gdf_tojoin(in_xytab=in_xytab, lon_col=lon_col, lat_col=lat_col, 
                                         in_vector=in_vector, 
                                         hull=hull)
         
        wbdhu6 = gpd.read_file(filename=in_wbd_path, 
                               layer='WBDHU6',        
                               columns=['huc6']    
                              )
     
        # Spatial join, after reprojecting the input to the CRS of the HUC6 layer.
        matched_nhd = gpd.sjoin(gdf_to_join.to_crs(crs=wbdhu6.crs), 
                                wbdhu6, 
                                how='left', 
                                predicate=sjoin_predicate)
                                
        # Unique, non-null HUC6 codes found by the spatial join. The left join leaves
        # NaN for input features without a match, and repeats a code for every input
        # feature falling in the same unit; neither should be passed on as an ID.
        # IMPORTANT: This result overrides any `in_id_list` provided as an argument.
        in_id_list = matched_nhd['huc6'].dropna().unique().tolist()

    if in_id_list is None or len(in_id_list) == 0:
        raise ValueError("No HUC IDs to expand: provide in_xytab, in_vector or a non-empty "
                         "in_id_list, and check that the input geometry matches a HUC6 unit.")
         
    # --- Step 2: Expand the determined list of HUC IDs to levels 2-12 ---
    # This step runs using either the `in_id_list` from the spatial join (if performed)
    # or the `in_id_list` provided directly as input (if no spatial data was given).
    huc_pd = _expand_basin_idlist(
        in_id_list=in_id_list,                
        in_refids_parquet=in_refids_parquet, 
        refids_col='huc12',                  
        out_id_range=range(2, 14, 2)          
    )

    return huc_pd

In [None]:
def get_nhd_hydronyms(in_hucs: pd.DataFrame,
                      in_wbd_path: str,  
                      out_dir: str,
                      huc_range: range = range(2, 14, 2),
                      flatten: bool = True,
                      verbose: bool = True
                     ):
    """
    Retrieves hydronyms (basin names and river names) associated with input HUC units.

    Steps:
    1. Reads basin names for the HUC levels in `huc_range` from the Watershed Boundary Dataset (WBD)
       and merges them into `in_hucs` as '<column>_name' columns.
    2. Calls `download_nhdplus_hr_hu4` for every HUC4 in `in_hucs`
       (presumably a no-op when the data is already present locally -- TODO confirm).
    3. Extracts river names from the NHDPlus HR flowlines: selected FCodes, 
       StreamOrde >= 6, non-null GNIS_Name, within the HUC8s of `in_hucs`.
    4. Returns either a flattened set of all unique hydronyms or a dictionary containing detailed results.

    Args:
        in_hucs (pd.DataFrame): DataFrame containing HUC columns (e.g., 'huc2', 'huc4', ..., 'huc12').
                                The HUC level is taken from the digits left after removing 
                                letters from the column name (e.g., 'huc6' -> 6). 
                                'huc4' and 'huc8' columns are required.
        in_wbd_path (str): Path to the Watershed Boundary Dataset (WBD) 
                          (e.g., GeoPackage or File Geodatabase).
        out_dir (str): Directory where NHDPlus HR data will be downloaded to/read from.
        huc_range (range, optional): Specifies which HUC levels to retrieve basin names for. 
                                     Defaults to range(2, 14, 2) (HUCs 2, 4, 6, 8, 10, 12).
        flatten (bool, optional): If True, returns a single set containing all unique basin and river names. 
                                  If False, returns a dictionary with the basin DataFrame and river names dict.
                                  Defaults to True.
        verbose (bool, optional): If True, prints status messages. Warnings about layers 
                                  that could not be read are always printed. Defaults to True.

    Returns:
        set | dict: If flatten=True, a set of unique hydronym strings.
                    If flatten=False, a dictionary:
                        {'basins_all_pd': DataFrame with added basin name columns, 
                         'rivers_huc4_dict': dict mapping HUC4 codes to arrays of unique river names}.

    Raises:
        AttributeError/KeyError: If the 'huc4' or 'huc8' columns are missing in `in_hucs`.
        Exception: Depending on the download function. Errors while reading WBD or NHD 
                   layers are reported as printed warnings and the layer is skipped.
    """
    if verbose:
        print('Getting NHD basin names')

    # --- Stage 1: Get Basin Names from WBD ---
    # Name columns actually added, so that the flattened output does not depend on
    # the input columns being literally named 'huc<level>'.
    basin_name_cols = []
    for coln in in_hucs.columns:
        # HUC level = what is left of the column name once letters are removed.
        huc_len_str = re.sub(r'[a-zA-Z]+', '', str(coln))
        try:
            huc_len = int(huc_len_str)
        except ValueError:
            # No usable level in the column name (e.g. no digits): skip this column.
            if verbose:
                print(f"Skipping column '{coln}': Could not extract HUC level.")
            continue

        if huc_len not in huc_range:
            continue

        layer_name = f'WBDHU{huc_len}'  
        huc_col_in_wbd = f'huc{huc_len}'
        name_col = f"{coln}_name"
        
        if verbose:
             print(f"Reading {layer_name} to get names for {coln}")
             
        # Best effort: a layer that cannot be read or merged is reported and skipped.
        try:
            wbd = gpd.read_file(filename=in_wbd_path, 
                                layer=layer_name,
                                columns=[huc_col_in_wbd, 'name'], 
                                ignore_geometry=True
                               )
            # One name per code, so that the left merge cannot multiply input rows;
            # rename before merging so that an existing 'name' column in in_hucs
            # does not trigger merge suffixes.
            wbd = wbd.drop_duplicates(subset=huc_col_in_wbd).\
                      rename(columns={"name": name_col})
            
            merged = in_hucs.merge(wbd, 
                                   left_on=coln,         
                                   right_on=huc_col_in_wbd,
                                   how='left')
            if coln != huc_col_in_wbd:
                merged = merged.drop(columns=[huc_col_in_wbd])
            in_hucs = merged
            if name_col in in_hucs.columns:
                basin_name_cols.append(name_col)

        except Exception as e:
            print(f"Warning: Could not read or merge names for {layer_name}. Error: {e}")

    # --- Stage 2: Ensure NHDPlus HR Data is Available ---
    huc4_list = in_hucs.huc4.unique()
    # Paths to the NHDPlus HR GeoDatabases, keyed by HUC4.
    nhd_huc4_pathdict = {}
    for huc in huc4_list:
        download_nhdplus_hr_hu4(
            hu4=huc,         
            out_dir=out_dir,   
            verbose=False 
        )
        nhd_huc4_pathdict[huc] = os.path.join(
            out_dir,
            f'NHDPLUS_H_{huc}_HU4_GDB.gdb') # Standard NHDPlus HR GDB naming convention.
     
    # --- Stage 3: Get River Names from NHDPlus HR ---
    #NHD FCode values representing streams/rivers and artificial paths to consider.
    # 46000: Stream/River
    # 46003: Stream/River: Intermittent
    # 46006: Stream/River: Perennial
    # 46007: Stream/River: Ephemeral
    # 55800: Artificial path
    fcode_sel_list = [46000, 46006, 46003, 46007, 55800]
    # River names found, keyed by HUC4.
    huc4_rivnames_dict = {}

    for huc4, gdb_path in nhd_huc4_pathdict.items():
        if verbose:
            print(f"Processing rivers for HUC4: {huc4}")
            
        # Suppress UserWarnings raised from pyogrio.raw while reading;
        # geometry is ignored anyway.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning, module="pyogrio.raw")
            
            try:
                # Flowline attributes
                flowlines_gpd = gpd.read_file(
                    filename=gdb_path, 
                    layer='NHDFlowline',
                    columns=['NHDPlusID', 'ReachCode', 'GNIS_Name', 'FCode'],
                    ignore_geometry=True 
                )
                
                # Value-added attributes (stream order)
                vaa_pd = gpd.read_file(
                    filename=gdb_path, 
                    layer='NHDPlusFlowlineVAA',
                    columns=['NHDPlusID', 'StreamOrde'],
                    ignore_geometry=True
                )
            except Exception as e:
                 print(f"Warning: Could not read NHD layers for HUC4 {huc4}. Skipping. Error: {e}")
                 continue

        # The first 8 characters of the reach code are used as the HUC8 of each flowline.
        flowlines_gpd['huc8'] = flowlines_gpd['ReachCode'].str[:8] 
        
        # HUC8s of the input DataFrame that fall within the current HUC4.
        huc8_sel = in_hucs[in_hucs['huc4']==huc4]['huc8'].unique()
        
        # Flowlines within those HUC8s, with their stream order attached.
        flowlines_sub = flowlines_gpd[flowlines_gpd['huc8'].isin(huc8_sel)].\
                        merge(vaa_pd, how='inner', on='NHDPlusID')
     
        # Significant named rivers:
        # 1. FCode in the predefined list of stream/river types.
        # 2. Stream order of 6 or greater.
        # 3. GNIS_Name not null.
        rivnames = flowlines_sub[
            (flowlines_sub['FCode'].isin(fcode_sel_list)) 
            & (flowlines_sub['StreamOrde'] >= 6) 
            & (flowlines_sub['GNIS_Name'].notna())
        ]['GNIS_Name'].unique()

        huc4_rivnames_dict[huc4] = rivnames

    # --- Stage 4: Format and Return Output ---
    if not flatten:
        return {'basins_all_pd': in_hucs,
                'rivers_huc4_dict': huc4_rivnames_dict}

    # Name columns added above, plus any 'huc<level>_name' column that was
    # already present in the input for a requested level.
    for lev in huc_range:
        existing_col = f'huc{lev}_name'
        if existing_col in in_hucs.columns and existing_col not in basin_name_cols:
            basin_name_cols.append(existing_col)

    all_hydronyms_set = set()
    for name_col in basin_name_cols:
        all_hydronyms_set.update(in_hucs[name_col].dropna().unique())
    for name_list in huc4_rivnames_dict.values():
        all_hydronyms_set.update(name_list)
    return all_hydronyms_set

In [None]:
def get_nhd_data(in_hucs_pd: pd.DataFrame,
                 out_dir: str, 
                 verbose: bool = True):
    """
    Read NHDPlus HR flowlines and their Value Added Attributes (VAA) for a set
    of HUC4 units and return one merged GeoDataFrame per HUC4.

    Steps:
    1. Identify the unique HUC4 codes in the 'huc4' column of `in_hucs_pd`.
    2. Call `download_nhdplus_hr_hu4` for each HUC4 (presumably a no-op when the
       data is already present in `out_dir` -- confirm against that helper).
    3. For each HUC4, read the 'NHDFlowline' layer (with geometry) and the
       'NHDPlusFlowlineVAA' layer (attributes only) from the expected GDB path.
    4. Inner-join the two on 'NHDPlusID'; overlapping VAA column names get the
       '_vaa' suffix.

    Args:
        in_hucs_pd (pd.DataFrame): DataFrame containing a 'huc4' column indicating which 
                                   HUC4 units' data is needed.
        out_dir (str): Directory where NHDPlus HR data will be downloaded to/read from.
        verbose (bool, optional): If True, prints which HUC4 is being processed. 
                                  Defaults to True.

    Returns:
        dict: Keys are the HUC4 codes found in `in_hucs_pd`; values are the merged
              flowline + VAA GeoDataFrames. A HUC4 whose layers could not be
              read is reported with a printed warning and is absent from the
              dictionary.

    Raises:
        KeyError: If the 'huc4' column is missing in `in_hucs_pd`.
        Exception: Whatever `download_nhdplus_hr_hu4` or the merge raise. Errors
                   raised while *reading* the layers are caught (see Returns).
    """
    # Initial status message.
    print('Getting NHD data for all HU4')

    # --- Stage 1: Ensure NHDPlus HR Data is Available ---
    # Bracket indexing (rather than attribute access) so that a missing column
    # raises the documented KeyError instead of an AttributeError.
    huc4_list = in_hucs_pd['huc4'].unique()
    nhd_huc4_pathdict = {}
    for huc in huc4_list:
        download_nhdplus_hr_hu4(
            hu4=huc,
            out_dir=out_dir,
            verbose=False
        )
        # Expected path of the GeoDatabase for this HUC4 (standard NHDPlus HR naming).
        nhd_huc4_pathdict[huc] = os.path.join(
            out_dir,
            f'NHDPLUS_H_{huc}_HU4_GDB.gdb')

    # --- Stage 2: Read and Merge Data for Each HUC4 ---
    nhdplus_huc4_dict = {}
    for huc4, gdb_path in nhd_huc4_pathdict.items():
        if verbose:
            print(f"Processing NHD data for HUC4: {huc4}")

        with warnings.catch_warnings():
            # Silence UserWarnings emitted by pyogrio while reading the layers.
            warnings.filterwarnings("ignore", category=UserWarning, module="pyogrio.raw")

            try:
                # Flowlines: keep the spatial geometry.
                flowlines_gpd = gpd.read_file(
                    filename=gdb_path,
                    layer='NHDFlowline',
                    ignore_geometry=False
                )

                # VAA: attribute table only, read as a plain DataFrame
                # (same setting as the other VAA reader in this file).
                vaa_pd = gpd.read_file(
                    filename=gdb_path,
                    layer='NHDPlusFlowlineVAA',
                    ignore_geometry=True
                )
            except Exception as e:
                # Deliberate best-effort: report and move on to the next HUC4.
                print(f"Warning: Could not read NHD layers for HUC4 {huc4}. Skipping. Error: {e}")
                continue

        # Attach the VAA attributes to the flowline geometries.
        nhdplus_huc4_dict[huc4] = flowlines_gpd.merge(
            vaa_pd,
            how='inner',           # Keep only matching NHDPlusIDs.
            on='NHDPlusID',        # Join key.
            suffixes=('', '_vaa')  # Suffix for overlapping column names from VAA.
        )

    return nhdplus_huc4_dict

In [None]:
def get_geoglows_vpu(in_geoglows_vpu_path: str,
                     in_xytab: str = None, lon_col: str = None, lat_col: str = None,
                     in_vector: str = None, 
                     in_id_list: list = None, # NOTE: This argument is accepted but not used.
                     hull: bool = True, sjoin_predicate: str = 'intersects'):
    """
    Determines the GEOGloWS VPU (Vector Processing Unit) identifiers corresponding 
    to input spatial data (points table or vector file).

    The input is formatted with `_format_gdf_tojoin`, reprojected to the CRS of the
    VPU layer, and spatially joined (left join) to the 'vpu-boundaries' layer.
    The `in_id_list` argument is accepted but not used YET.

    Args:
        in_geoglows_vpu_path (str): Path to the GEOGloWS VPU boundaries dataset; it must
                                   contain a 'vpu-boundaries' layer with a 'VPU' column.
        in_xytab (str, optional): Path to table with XY coordinates. Defaults to None.
        lon_col (str, optional): Longitude column name in in_xytab. Defaults to None.
        lat_col (str, optional): Latitude column name in in_xytab. Defaults to None.
        in_vector (str, optional): Path to vector file. Defaults to None.
        in_id_list (list, optional): Accepted argument but currently unused by this function. 
                                     Defaults to None.
        hull (bool, optional): Forwarded to `_format_gdf_tojoin`; intended to make the join 
                               use the convex hull of the input. Defaults to True.
        sjoin_predicate (str, optional): Spatial predicate for the join ('intersects', 'within', etc.). 
                                         Defaults to 'intersects'.

    Returns:
        list | None: Unique VPU identifiers (values of the 'VPU' column) matched by the
                     input geometry, in order of first appearance. Input features without
                     any matching VPU contribute nothing, so the list may be empty.
                     Returns None if no spatial input (in_xytab or in_vector) is provided.

    Raises:
        Depends on the underlying functions: FileNotFoundError, KeyError, ValueError, etc.
    """
    # Without spatial input there is nothing to join.
    if not (in_xytab or in_vector):
        return None

    # --- Step 1: Format Input Geometry ---
    gdf_to_join = _format_gdf_tojoin(in_xytab=in_xytab, lon_col=lon_col, lat_col=lat_col,
                                     in_vector=in_vector,
                                     hull=hull)

    # --- Step 2: Read VPU Boundaries (only the identifier column is needed) ---
    vpus = gpd.read_file(filename=in_geoglows_vpu_path,
                         layer='vpu-boundaries',
                         columns=['VPU'])

    # --- Step 3: Perform Spatial Join ---
    matched_vpus = gpd.sjoin(gdf_to_join.to_crs(crs=vpus.crs),  # Ensure CRS match
                             vpus,
                             how='left',
                             predicate=sjoin_predicate)

    # --- Step 4: Extract Matched VPU IDs ---
    # The left join leaves NaN where an input feature matched no VPU, and repeats
    # an ID when several features fall in the same VPU: drop both.
    # `unique()` is a Series method, so it must be applied before `tolist()`.
    return matched_vpus['VPU'].dropna().unique().tolist()

In [None]:
def get_gadm_lev1_dict(in_gadm_path: str,
                       in_xytab: str = None, lon_col: str = None, lat_col: str = None,
                       in_vector: str = None, 
                       in_id_list: list = None, # NOTE: This argument is accepted but not used.
                       hull: bool = True, sjoin_predicate: str = 'intersects'):
    """
    Select the GADM level 1 administrative units that spatially match the input.

    The input (points table or vector file) is formatted with `_format_gdf_tojoin`,
    reprojected to the CRS of the GADM layer, and inner-joined to the 'ADM_1'
    layer with the GADM layer on the left-hand side of the join.
    `in_id_list` is accepted but not used (YET).

    Args:
        in_gadm_path (str): Path to the GADM dataset; it must contain an 'ADM_1' layer.
        in_xytab (str, optional): Path to table with XY coordinates. Defaults to None.
        lon_col (str, optional): Longitude column name in in_xytab. Defaults to None.
        lat_col (str, optional): Latitude column name in in_xytab. Defaults to None.
        in_vector (str, optional): Path to vector file. Defaults to None.
        in_id_list (list, optional): Currently unused. Defaults to None.
        hull (bool, optional): Forwarded to `_format_gdf_tojoin`; intended to make the 
                               join use the convex hull of the input. Defaults to True.
        sjoin_predicate (str, optional): Spatial predicate for the join 
                                         ('intersects', 'within', etc.). 
                                         Defaults to 'intersects'.

    Returns:
        geopandas.GeoDataFrame | None: 
            Result of the inner spatial join: the matching 'ADM_1' rows together with
            the attributes of the input features they matched.
            None when neither in_xytab nor in_vector is provided.
            Note: despite the "_dict" suffix in its name, the function does not return 
            a dictionary.

    Raises:
        Depends on the underlying functions: FileNotFoundError, KeyError, ValueError, etc.
    """
    # Guard clause: no spatial input means no join to perform.
    if not (in_xytab or in_vector):
        return None

    # Input locations as a GeoDataFrame.
    locations = _format_gdf_tojoin(
        in_xytab=in_xytab,
        lon_col=lon_col,
        lat_col=lat_col,
        in_vector=in_vector,
        hull=hull,
    )

    # GADM level 1 polygons.
    adm1_units = gpd.read_file(filename=in_gadm_path, layer='ADM_1')

    # Bring the input into the GADM CRS before joining.
    locations_in_gadm_crs = locations.to_crs(crs=adm1_units.crs)

    return gpd.sjoin(
        adm1_units,
        locations_in_gadm_crs,
        how='inner',
        predicate=sjoin_predicate,
    )

In [None]:
# Run the list-creation helpers on the dataset paths configured at the top of this file.
# NOTE(review): each call receives a source geodatabase path and a parquet path;
# presumably the parquet is the cached output table -- confirm in the helper definitions.
hu12_list = create_huc12_list(
    wbd_path,
    hu12_parquet,
)

basinatlas11_list = create_basinatlas11_list(basinatlas_path, basinatlas11_parquet)

# test_huc_pd = get_matching_NHD_HU(
#     in_wbd_path=wbd_path,
#     in_vector=test_pts_path,
#     in_refids_parquet=hu12_parquet,
#     hull=True,
#     sjoin_predicate='intersects'
# )
# #print(test_huc_pd)
# #in_id_list = in_umrb_huc4s = [f'07{str(i).zfill(2)}' for i in range(2,15)]

# test_nhd_hydronyms = get_nhd_hydronyms(
#     in_hucs=test_huc_pd,
#     in_wbd_path=wbd_path,
#     out_dir = os.path.join(nhd_dir, 'nhdplus_hr'),
#     huc_range=[2, 4, 6],
#     verbose=False
# )
# len(test_nhd_hydronyms)

# test_pfaf_pd = get_matching_hydrobasin(
#     in_basinatlas_path=basinatlas_path,
#     in_vector=test_pts_path,
#     #in_id_list=None, 
#     in_refids_parquet=basinatlas11_parquet,
#     hull=True,
#     sjoin_predicate='intersects'
# )
# #print(test_pfaf_pd)

# test_pfaf_pd_idlist = get_matching_hydrobasin(
#     in_basinatlas_path=basinatlas_path,
#     in_id_list=[742873, 742875, 742876], 
#     in_refids_parquet=basinatlas11_parquet
# )
# #print(test_pfaf_pd_idlist)


# test_vpu_list = get_geoglows_vpu(
#     in_geoglows_vpu_path=geoglows_vpu_path,
#     in_vector=test_pts_path,
#     hull=True,
#     sjoin_predicate='intersects'
# )
# print(test_vpu_list)

# test_gadm_lev1 = get_gadm_lev1_dict(
#     in_gadm_path=gadm_path,
#     in_vector=test_pts_path,
#     hull=True,
#     sjoin_predicate='intersects'
# )

In [None]:
################IN DEVELOPMENT ###############################

In [None]:
def get_geoglows_hydronyms(in_geoglows_path, verbose=True):
    """
    IN DEVELOPMENT: load the GEOGloWS metadata tables and start looping over VPUs.

    Currently the function reads three parquet tables from
    `<in_geoglows_path>/tables`, prints a status message, and then runs a loop
    that depends on names not defined inside this function (see the
    NOTE(review) comments below). It has no return statement.

    Args:
        in_geoglows_path: Directory containing a 'tables' sub-directory with the
            parquet files read below.
        verbose: Not used by the current body.

    Returns:
        None.
    """

    # Countries table (loaded but not used further in the current body).
    country_tab_path = os.path.join(in_geoglows_path, 
                                    'tables', 'v2-countries-table.parquet')
    country_pd = pd.read_parquet(country_tab_path)

    # Package metadata table (loaded but not used further in the current body).
    meta_tab_path = os.path.join(in_geoglows_path, 
                                    'tables', 'package-metadata-table.parquet')
    meta_pd = pd.read_parquet(meta_tab_path)

    # Model table (loaded but not used further in the current body).
    model_tab_path = os.path.join(in_geoglows_path, 
                                    'tables', 'v2-model-table.parquet')
    model_pd = pd.read_parquet(model_tab_path)

    print('Getting geoglows river names')
    # NOTE(review): `test_vpu_list` is not a parameter or local; the only assignment
    # visible in this part of the file is commented out, so this line likely raises
    # NameError unless the variable is defined elsewhere. Indexing with [0] also
    # iterates over the first element only -- confirm that is intended.
    for vpu in test_vpu_list[0]:
        # NOTE(review): `nhd_huc4_pathdict` and `huc4` are not defined in this function,
        # and the loop variable `vpu` is unused. This call looks copied from the NHD
        # reader (it requests the 'NHDFlowline' layer) -- TODO replace with the
        # GEOGloWS streams source for `vpu`.
        streams_gpd= gpd.read_file(
                filename=nhd_huc4_pathdict[huc4], 
                layer='NHDFlowline',
                ignore_geometry=False
        )
          
# geoglows_path = os.path.join(datdir, 'geoglows')
# get_geoglows_hydronyms(in_geoglows_path=geoglows_path, verbose=True)


