In [3]:
%run download_geographic_refs.ipynb
%run set_up.py 

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Point
import nltk
from nltk.util import ngrams
import re

verbose=True

In [4]:
# Locations of the source datasets and of the parquet caches holding their ID lists.
# NOTE(review): `os`, `nhd_dir`, `hydroatlas_dir` and `geoglows_dir` are not defined in
# this notebook; presumably they come from the `%run set_up.py` cell above — confirm.
wbd_path = os.path.join(nhd_dir, 'WBD_National_GDB.gdb')
hu12_parquet = os.path.join(nhd_dir, 'wbd_hu12list.parquet')
basinatlas_path = os.path.join(hydroatlas_dir,  'BasinATLAS_v10.gdb')
basinatlas11_parquet = os.path.join(hydroatlas_dir, 'basinatlas_lev11_idlist.parquet')
geoglows_vpu_path = os.path.join(geoglows_dir, 'vpu-boundaries.gpkg')

In [5]:
#Create a list of NHD HUC12
def create_huc12_list(wbd_path, 
                      out_hu12_parquet,
                      verbose=True):
    """
    Return the table of HUC12 codes, building the parquet cache on first use.

    Args:
        wbd_path (str): Path to the WBD geodatabase containing the 'WBDHU12' layer.
        out_hu12_parquet (str): Path of the parquet cache. If it exists it is read
            and returned as is; otherwise it is created from `wbd_path`.
        verbose (bool): Print a message when the cache has to be generated.

    Returns:
        pandas.DataFrame: Table with a single 'huc12' column.
    """
    # Cached result available: no need to open the geodatabase.
    if os.path.exists(out_hu12_parquet):
        return pd.read_parquet(out_hu12_parquet)

    if verbose:
        # Single-line f-string: a backslash continuation inside the literal would
        # embed the next line's indentation in the printed message.
        print(f'Generating a list of HUC 12 and saving it to {out_hu12_parquet}')

    # Attribute-only read of the huc12 column.
    # NOTE(review): `rows=105000` caps the number of features read; confirm the
    # layer holds fewer features than that, otherwise the list is truncated.
    wbdhu12_list = gpd.read_file(filename=wbd_path,
                                 layer='WBDHU12',
                                 rows=105000,
                                 columns=['huc12'],
                                 ignore_geometry=True)
    wbdhu12_list.to_parquet(out_hu12_parquet)
    return wbdhu12_list

# Build the HUC12 ID table, or load it from the parquet cache if it already exists.
hu12_list = create_huc12_list(wbd_path, hu12_parquet)

In [6]:
#Create a list of PFAF_ID for basins level 11
def create_basinatlas11_list(basinatlas_path, 
                             out_basinatlas11_parquet, 
                             verbose=True):
    """
    Return the table of level-11 Pfafstetter IDs, building the parquet cache on first use.

    Args:
        basinatlas_path (str): Path to the BasinATLAS geodatabase containing the
            'BasinATLAS_v10_lev11' layer.
        out_basinatlas11_parquet (str): Path of the parquet cache. If it exists it is
            read and returned as is; otherwise it is created from `basinatlas_path`.
        verbose (bool): Print a message when the cache has to be generated.

    Returns:
        pandas.DataFrame: Table with a single 'PFAF_ID11' column (nullable Int64
        when freshly generated).
    """
    # Cached result available: no need to open the geodatabase.
    if os.path.exists(out_basinatlas11_parquet):
        return pd.read_parquet(out_basinatlas11_parquet)

    if verbose:
        # Single-line f-string: a backslash continuation inside the literal would
        # embed the next line's indentation in the printed message.
        print(f'Generating a list of PFAF ID level 11 and saving it to {out_basinatlas11_parquet}')

    # Attribute-only read of PFAF_ID, cast to nullable integers and renamed so the
    # column carries its level (11) like the columns derived from it later.
    # NOTE(review): `rows=1031785` caps the number of features read; confirm it
    # is at least the feature count of the layer.
    basinatlas11_list = (
        gpd.read_file(filename=basinatlas_path,
                      layer='BasinATLAS_v10_lev11',
                      columns=['PFAF_ID'],
                      rows=1031785,
                      ignore_geometry=True)
        .astype(pd.Int64Dtype())
        .rename(columns={"PFAF_ID": "PFAF_ID11"})
    )
    basinatlas11_list.to_parquet(out_basinatlas11_parquet)
    return basinatlas11_list

# Build the level-11 PFAF ID table, or load it from the parquet cache if it already exists.
basinatlas11_list = create_basinatlas11_list(
    basinatlas_path, 
    basinatlas11_parquet)

In [7]:
def _expand_basin_idlist(in_id_list,
                         in_refids_parquet,
                         refids_col,
                         out_id_range):
    """
    Expands a list of basin IDs into every finest-level ID they contain, plus their parent IDs.

    Reference IDs are selected by prefix: a reference ID is kept when its string
    form starts with the string form of any ID in `in_id_list`. For each level in
    `out_id_range`, a column holding the first `level` characters of the reference
    ID is added.

    Args:
        in_id_list (list): Input basin IDs, all strings or all integers (Python or
            numpy integers). The type of the first element decides the output type.
        in_refids_parquet (str): Path to the reference Parquet file.
        refids_col (str): Name of the column in the Parquet file containing the full basin IDs.
        out_id_range (iterable): Integer levels (ID lengths) to extract, e.g. [6, 9, 12].

    Returns:
        pandas.DataFrame: Matching rows of the reference table (original index kept)
            with one extra column per level, named after `refids_col` with its digits
            replaced by the level (e.g. 'huc12' -> 'huc8'). Level columns are numeric
            when the input IDs are integers and strings otherwise. `refids_col` keeps
            the kind (string or integer) it has in the reference table.

    Raises:
        TypeError: If the input IDs or the reference IDs are neither strings nor integers.
        FileNotFoundError: If the input Parquet file does not exist (raised by pandas).
        ValueError: If `in_id_list` is None or empty, if the reference table is empty,
            if the input IDs are longer than the reference IDs, if `out_id_range`
            contains values greater than the reference ID length, or if no
            reference ID matches.
    """
    # --- Input validation (done before any file is read) ---
    if in_id_list is None:
        raise ValueError("in_id_list is None; provide at least one basin ID.")
    in_ids = list(in_id_list)
    if not in_ids:
        raise ValueError("in_id_list is empty; provide at least one basin ID.")
    levels = list(out_id_range)

    def _is_int(value):
        # bool is a subclass of int but is never a valid basin ID
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    # Only the first element is inspected; the list is assumed homogeneous.
    first_in_id = in_ids[0]
    if isinstance(first_in_id, str):
        in_id_is_int = False
        in_id_len = len(first_in_id)
    elif _is_int(first_in_id):
        in_id_is_int = True
        in_id_len = len(str(first_in_id))
    else:
        raise TypeError("in_id_list must be a list of strings or integers")

    id_all_pd = pd.read_parquet(in_refids_parquet)
    if id_all_pd.empty:
        raise ValueError(f"Reference table {in_refids_parquet} is empty.")

    # Determine the reference ID type and length from the first reference ID
    first_ref_id = id_all_pd[refids_col].iloc[0]
    if isinstance(first_ref_id, str):
        ref_is_int = False
        refid_len = len(first_ref_id)
    elif _is_int(first_ref_id):
        ref_is_int = True
        refid_len = len(str(first_ref_id))
    else:
        raise TypeError(f"{refids_col} must contain strings or integers")

    if in_id_len > refid_len:
        raise ValueError(f"Input ID length ({in_id_len}) is greater than reference ID length ({refid_len})")

    if any(level > refid_len for level in levels):
        raise ValueError(f"out_id_range values cannot exceed reference ID length ({refid_len})")

    # --- Filtering ---
    # Compare as strings so that prefix matching works for integer IDs as well.
    ref_ids_str = id_all_pd[refids_col].astype(str)
    prefixes = tuple(str(x) for x in in_ids)
    is_match = ref_ids_str.str.startswith(prefixes)

    id_pd = id_all_pd.loc[is_match].copy()
    if id_pd.empty:
        raise ValueError("No matching IDs found. Check in_id_list and refids_col.")
    matched_ids_str = ref_ids_str.loc[is_match]

    # --- Column Expansion ---
    colroot = re.sub(r'[0-9]+', '', refids_col)
    for id_level in levels:
        col_name = f'{colroot}{id_level}'
        if col_name in id_pd.columns:
            continue  # e.g. the reference column itself
        level_ids = matched_ids_str.str[:id_level]
        # Assign the whole column: writing through `.loc[:, col]` sets values in
        # place and can leave the numeric IDs in an object-dtype column.
        if in_id_is_int:
            id_pd[col_name] = pd.to_numeric(level_ids, errors='raise')
        else:
            id_pd[col_name] = level_ids

    # Reference column: strings stay strings, integers are returned as numbers
    if ref_is_int:
        id_pd[refids_col] = pd.to_numeric(matched_ids_str, errors='raise')
    else:
        id_pd[refids_col] = matched_ids_str

    return id_pd

In [8]:
def _format_gdf_tojoin(in_xytab=None, lon_col=None, lat_col=None, 
                       in_vector=None, hull=True):
    """
    Build the GeoDataFrame that is spatially joined to basin layers.

    Args:
        in_xytab (str, optional): Path to a delimited text table of point
            coordinates, read with `pandas.read_table`.
        lon_col (str, optional): Longitude column of `in_xytab`.
        lat_col (str, optional): Latitude column of `in_xytab`.
        in_vector (str, optional): Path to a vector layer readable by
            `geopandas.read_file`. Takes precedence over `in_xytab` when both
            are given.
        hull (bool): If True, replace the features by a single feature holding the
            convex hull of their union.

    Returns:
        geopandas.GeoDataFrame: The input features, or their convex hull.

    Raises:
        ValueError: If neither `in_xytab` nor `in_vector` is given, or if
            `in_xytab` is given without `lon_col` and `lat_col`.
    """
    if not in_xytab and not in_vector:
        raise ValueError(
            "Provide either in_xytab (with lon_col and lat_col) or in_vector.")

    if in_vector:
        #Read vector layer
        gdf_to_join = gpd.read_file(in_vector)
    else:
        #Read xy table
        if lon_col is None or lat_col is None:
            raise ValueError(
                "lon_col and lat_col are required when in_xytab is provided.")
        points_df = pd.read_table(in_xytab)
        # NOTE(review): coordinates are assumed to be WGS84 longitude/latitude
        # (EPSG:4326) — confirm. A CRS must be set because callers reproject this
        # layer with to_crs(), which fails on geometries without a CRS.
        gdf_to_join = gpd.GeoDataFrame(
            points_df,
            geometry=gpd.points_from_xy(points_df[lon_col], points_df[lat_col]),
            crs='EPSG:4326'
        )

    if hull:
        # Create convex hull using union_all() (current recommended method)
        gdf_to_join = gpd.GeoDataFrame(
            geometry=[gdf_to_join.geometry.union_all().convex_hull], 
            crs=gdf_to_join.crs
        )
    return gdf_to_join

In [9]:
def get_matching_NHD_HU(in_wbd_path,
                        in_xytab=None, lon_col=None, lat_col=None,
                        in_vector=None, 
                        in_id_list=None, 
                        in_refids_parquet=None,
                        hull=True, sjoin_predicate='intersects'):
    """
    List all HUC12 units (and their HUC2-HUC10 parents) matching locations or HUC IDs.

    The units are selected either spatially (points table or vector layer joined
    to the WBD HU6 layer) or directly from `in_id_list`. When a spatial input is
    given it takes precedence and `in_id_list` is ignored.

    Args:
        in_wbd_path (str): Path to the WBD geodatabase containing the 'WBDHU6' layer.
        in_xytab, lon_col, lat_col, in_vector, hull: Spatial input, forwarded to
            `_format_gdf_tojoin`.
        in_id_list (list, optional): HUC IDs used as prefixes when no spatial
            input is given.
        in_refids_parquet (str): Parquet file with a 'huc12' column listing all HUC12.
        sjoin_predicate (str): Predicate passed to `geopandas.sjoin`.

    Returns:
        pandas.DataFrame: Output of `_expand_basin_idlist` for levels 2, 4, ..., 12.

    Raises:
        ValueError: If no spatial input and no IDs are given, if
            `in_refids_parquet` is missing, or if the spatial input matches no HU6.
    """
    has_spatial_input = bool(in_xytab or in_vector)
    if not has_spatial_input and (in_id_list is None or len(in_id_list) == 0):
        raise ValueError(
            "Provide in_xytab, in_vector or a non-empty in_id_list.")
    if in_refids_parquet is None:
        raise ValueError("in_refids_parquet is required.")

    #If points or polygons are provided ----------------------------------------
    if has_spatial_input:
        gdf_to_join = _format_gdf_tojoin(in_xytab, lon_col, lat_col, 
                                         in_vector, 
                                         hull)

        #Get NHD WBD
        wbdhu6 = gpd.read_file(filename=in_wbd_path, 
                               layer='WBDHU6',
                               columns=['huc6']
                              )

        #Spatially join to hydrologic units
        matched_nhd = gpd.sjoin(gdf_to_join.to_crs(crs=wbdhu6.crs), 
                                wbdhu6, 
                                how='left', 
                                predicate=sjoin_predicate)
        # The left join keeps unmatched geometries with a missing huc6; drop them
        # (and duplicates) so that only valid prefixes are expanded.
        in_id_list = matched_nhd['huc6'].dropna().unique().tolist()
        if not in_id_list:
            raise ValueError(
                "The input geometries do not match any WBD HU6 unit.")

    huc_pd = _expand_basin_idlist(
        in_id_list, 
        in_refids_parquet, 
        refids_col='huc12', 
        out_id_range=range(2, 14, 2))

    return huc_pd

# Smoke test: match the convex hull of the test gages to WBD HU6 units and expand
# them to HUC12 through the cached HUC12 list.
# NOTE(review): `datdir` is not defined in this notebook; presumably it comes from set_up.py.
test_pts_path = os.path.join(datdir, 'test_gages', 'test_gages.shp')
test_huc_pd = get_matching_NHD_HU(
    in_wbd_path=wbd_path,
    in_vector=test_pts_path,
    in_refids_parquet=hu12_parquet,
    hull=True,
    sjoin_predicate='intersects'
)
print(test_huc_pd)

# Alternative, currently disabled: select by an explicit list of HUC4 codes 0702-0714.
#in_id_list = in_umrb_huc4s = [f'07{str(i).zfill(2)}' for i in range(2,15)]

               huc12 huc2  huc4    huc6      huc8       huc10
0       070200090402   07  0702  070200  07020009  0702000904
1       070200030503   07  0702  070200  07020003  0702000305
2       070200030602   07  0702  070200  07020003  0702000306
6       070200030701   07  0702  070200  07020003  0702000307
7       070300040804   07  0703  070300  07030004  0703000408
...              ...  ...   ...     ...       ...         ...
101242  071200010601   07  0712  071200  07120001  0712000106
101243  071200011301   07  0712  071200  07120001  0712000113
101244  071200010501   07  0712  071200  07120001  0712000105
101246  071200011002   07  0712  071200  07120001  0712000110
101248  071200030306   07  0712  071200  07120003  0712000303

[11145 rows x 6 columns]


In [10]:
def get_matching_hydrobasin(in_basinatlas_path,
                            in_xytab=None, lon_col=None, lat_col=None,
                            in_vector=None, 
                            in_id_list=None, 
                            in_refids_parquet=None,
                            hull=True, sjoin_predicate='intersects'):
    """
    List all level-11 BasinATLAS basins (and their level 3-10 parents) matching locations or IDs.

    The basins are selected either spatially (points table or vector layer joined
    to the BasinATLAS level-6 layer) or directly from `in_id_list`. When a spatial
    input is given it takes precedence and `in_id_list` is ignored.

    Args:
        in_basinatlas_path (str): Path to the BasinATLAS geodatabase containing the
            'BasinATLAS_v10_lev06' layer.
        in_xytab, lon_col, lat_col, in_vector, hull: Spatial input, forwarded to
            `_format_gdf_tojoin`.
        in_id_list (list, optional): PFAF IDs used as prefixes when no spatial
            input is given.
        in_refids_parquet (str): Parquet file with a 'PFAF_ID11' column listing all
            level-11 basins.
        sjoin_predicate (str): Predicate passed to `geopandas.sjoin`.

    Returns:
        pandas.DataFrame: Output of `_expand_basin_idlist` for levels 3 to 11.

    Raises:
        ValueError: If no spatial input and no IDs are given, if
            `in_refids_parquet` is missing, or if the spatial input matches no basin.
    """
    has_spatial_input = bool(in_xytab or in_vector)
    if not has_spatial_input and (in_id_list is None or len(in_id_list) == 0):
        raise ValueError(
            "Provide in_xytab, in_vector or a non-empty in_id_list.")
    if in_refids_parquet is None:
        raise ValueError("in_refids_parquet is required.")

    #If points or polygons are provided ----------------------------------------
    if has_spatial_input:
        gdf_to_join = _format_gdf_tojoin(in_xytab, lon_col, lat_col, 
                                         in_vector, 
                                         hull)

        #Read BasinATLAS level 6 basins
        bas_lev6 = gpd.read_file(filename=in_basinatlas_path, 
                                 layer='BasinATLAS_v10_lev06',
                                 columns=['PFAF_ID']
                                ).rename(columns={"PFAF_ID": "PFAF_ID6"})

        #Spatially join to basins
        matched_bas = gpd.sjoin(gdf_to_join.to_crs(crs=bas_lev6.crs), 
                                bas_lev6, 
                                how='left', 
                                predicate=sjoin_predicate)
        # The left join keeps unmatched geometries with a missing PFAF_ID6, which
        # also upcasts the column to float; drop them and restore integer IDs.
        in_id_list = (matched_bas['PFAF_ID6']
                      .dropna()
                      .astype('int64')
                      .unique()
                      .tolist())
        if not in_id_list:
            raise ValueError(
                "The input geometries do not match any BasinATLAS level-6 basin.")

    pfaf_pd = _expand_basin_idlist(
        in_id_list, 
        in_refids_parquet, 
        refids_col='PFAF_ID11', 
        out_id_range=range(3, 12))

    return pfaf_pd

# Smoke test 1: select basins spatially from the convex hull of the test gages.
test_pfaf_pd = get_matching_hydrobasin(
    in_basinatlas_path=basinatlas_path,
    in_vector=test_pts_path,
    #in_id_list=None, 
    in_refids_parquet=basinatlas11_parquet,
    hull=True,
    sjoin_predicate='intersects'
)
print(test_pfaf_pd)

# Smoke test 2: select basins from an explicit list of level-6 PFAF IDs (no spatial join).
test_pfaf_pd_idlist = get_matching_hydrobasin(
    in_basinatlas_path=basinatlas_path,
    in_id_list=[742873, 742875, 742876], 
    in_refids_parquet=basinatlas11_parquet
)
print(test_pfaf_pd_idlist)


  return ogr_read(


          PFAF_ID11 PFAF_ID3 PFAF_ID4 PFAF_ID5 PFAF_ID6 PFAF_ID7  PFAF_ID8  \
904887  72582801010      725     7258    72582   725828  7258280  72582801   
904957  72582801031      725     7258    72582   725828  7258280  72582801   
904959  72582801020      725     7258    72582   725828  7258280  72582801   
905050  72582801032      725     7258    72582   725828  7258280  72582801   
905108  72582803010      725     7258    72582   725828  7258280  72582803   
...             ...      ...      ...      ...      ...      ...       ...   
933377  74289780900      742     7428    74289   742897  7428978  74289780   
933386  74288609310      742     7428    74288   742886  7428860  74288609   
933387  74288609200      742     7428    74288   742886  7428860  74288609   
933532  74288609320      742     7428    74288   742886  7428860  74288609   
933590  74288609330      742     7428    74288   742886  7428860  74288609   

         PFAF_ID9   PFAF_ID10  
904887  725828010  7258280101  

In [11]:
def get_geoglows_vpu(in_geoglows_vpu_path,
                     in_xytab=None, lon_col=None, lat_col=None,
                     in_vector=None, 
                     in_id_list=None, 
                     in_refids_parquet=None,
                     hull=True, sjoin_predicate='intersects'):
    """
    List the GEOGloWS VPU codes matching locations.

    Args:
        in_geoglows_vpu_path (str): Path to the layer file containing the
            'vpu-boundaries' layer with a 'VPU' attribute.
        in_xytab, lon_col, lat_col, in_vector, hull: Spatial input, forwarded to
            `_format_gdf_tojoin`.
        in_id_list (list, optional): VPU codes returned unchanged (as a list) when
            no spatial input is given. Ignored when a spatial input is given.
        in_refids_parquet: Unused; kept so the signature parallels the other
            matching functions.
        sjoin_predicate (str): Predicate passed to `geopandas.sjoin`.

    Returns:
        list: Unique VPU codes, in order of first appearance in the join result.

    Raises:
        ValueError: If neither a spatial input nor `in_id_list` is given.
    """
    if not (in_xytab or in_vector):
        # No spatial join possible: fall back on explicitly listed VPU codes.
        if in_id_list is None:
            raise ValueError("Provide in_xytab, in_vector or in_id_list.")
        return list(in_id_list)

    gdf_to_join = _format_gdf_tojoin(in_xytab, lon_col, lat_col, 
                                     in_vector, 
                                     hull)

    #Read VPU boundaries
    vpus = gpd.read_file(filename=in_geoglows_vpu_path, 
                         layer='vpu-boundaries',
                         columns=['VPU']
                        )

    #Spatially join to VPUs
    matched_vpus = gpd.sjoin(gdf_to_join.to_crs(crs=vpus.crs), 
                             vpus, 
                             how='left', 
                             predicate=sjoin_predicate)
    # The left join keeps unmatched geometries with a missing VPU, and several
    # features can fall in the same VPU: report each matched VPU once.
    vpu_list = matched_vpus['VPU'].dropna().unique().tolist()

    return vpu_list

# Smoke test: VPUs intersecting the convex hull of the test gages.
test_vpu_list = get_geoglows_vpu(
    in_geoglows_vpu_path=geoglows_vpu_path,
    in_vector=test_pts_path,
    hull=True,
    sjoin_predicate='intersects'
)
print(test_vpu_list)

['714', '709']


In [21]:
def get_nhd_hydronyms(in_hucs, in_wbd_path,  out_dir,
                      huc_range=range(2, 14, 2)
                    ):
    """
    Collect basin names and large-river names for a table of NHD hydrologic units.

    Args:
        in_hucs (pandas.DataFrame): Table of HUC codes whose column names are
            letters followed by the HUC level (e.g. 'huc4', 'huc8'). Must contain
            'huc4' and 'huc8' columns.
        in_wbd_path (str): Path to the WBD geodatabase; names are read from its
            'WBDHU<level>' layers.
        out_dir (str): Directory in which the NHDPlus HR HU4 geodatabases
            ('NHDPLUS_H_<huc4>_HU4_GDB.gdb') are looked up after
            `download_nhdplus_hr_hu4` has been called for each HU4.
        huc_range (iterable of int): HUC levels for which basin names are joined.

    Returns:
        dict: 'basins_all_pd' holds `in_hucs` with one '<column>_name' column per
            level in `huc_range`; 'rivers_huc4_dict' maps each HU4 code to the
            unique GNIS names of its selected flowlines.

    Raises:
        ValueError: If a column name of `in_hucs` does not reduce to a HUC level
            once its letters are removed.
    """
    print('Getting NHD basin names')
    #If panda dataframe
    #Get basin names------------------------------------------------------------
    for coln in in_hucs.columns:
        huc_len = re.sub(r'[a-zA-Z]+', '', coln)
        # `int()` never returns None, so validate the digits explicitly to raise
        # the intended message instead of int()'s "invalid literal" error.
        if re.fullmatch(r'[0-9]+', huc_len) is None:
            raise ValueError(f"HUC level cannot be extracted from {coln}")
        if int(huc_len) in huc_range:
            wbd = gpd.read_file(filename=in_wbd_path, 
                                layer=f'WBDHU{huc_len}',
                                columns=[coln, 'name'],
                                ignore_geometry=True
                               )
            in_hucs = in_hucs.merge(wbd, on=coln, how='left').\
            rename(columns={"name": f"{coln}_name"})

    #Download data by HU4 if needed
    huc4_list = in_hucs.huc4.unique()
    nhd_huc4_pathdict = {}
    for huc in huc4_list:
        download_nhdplus_hr_hu4(
            hu4=huc, 
            out_dir=out_dir, 
            verbose=False
        )
        nhd_huc4_pathdict[huc] = os.path.join(
            out_dir,
            f'NHDPLUS_H_{huc}_HU4_GDB.gdb')

    #Get river names------------------------------------------------------------
    #NHD flow line types: FCode attribute to subset
    # 46000: Stream/River
    # 46003: Stream/River: Hydrographic Category = Intermittent
    # 46006: Stream/River: Hydrographic Category = Perennial
    # 46007: Stream/River: Hydrographic Category = Ephemeral
    # 55800: Artificial path
    fcode_sel_list = [46000, 46006, 46003, 46007, 55800]
    huc4_rivnames_dict = {}
    for huc4 in nhd_huc4_pathdict:
        print(huc4)
        #Read flowlines (attributes only)
        flowlines_gpd = gpd.read_file(
            filename=nhd_huc4_pathdict[huc4], 
            layer='NHDFlowline',
            columns=['NHDPlusID', 'ReachCode', 'GNIS_Name', 'FCode'],
            ignore_geometry=True
        )
        #Read value-added attributes, which hold the stream order
        vaa_pd = gpd.read_file(
            filename=nhd_huc4_pathdict[huc4], 
            layer='NHDPlusFlowlineVAA',
            columns=['NHDPlusID', 'StreamOrde'],
            ignore_geometry=True
        )

        #reachcode: The first eight digits are the WBD_HUC8.
        #The next six digits are randomly assigned, 
        #sequential numbers that are unique within a HUC8.
        flowlines_gpd['huc8'] = flowlines_gpd['ReachCode'].str[:8] 
        #Keep only flowlines in the HUC8s requested for this HU4
        huc8_sel = in_hucs[in_hucs['huc4']==huc4]['huc8'].unique()
        flowlines_sub = flowlines_gpd[flowlines_gpd['huc8'].isin(huc8_sel)].\
        merge(vaa_pd, how='inner', on='NHDPlusID')

        #Named streams/rivers and artificial paths of stream order 6 or more
        rivnames = flowlines_sub[(
            (flowlines_sub['FCode'].isin(fcode_sel_list)) 
            & (flowlines_sub['StreamOrde'] >=6)
            & (flowlines_sub['GNIS_Name'].notna())
        )].GNIS_Name.unique()

        huc4_rivnames_dict[huc4] = rivnames

    #Return dictionary with basin names and river names
    out_dict = {}
    out_dict['basins_all_pd'] = in_hucs
    out_dict['rivers_huc4_dict'] = huc4_rivnames_dict

    return out_dict

# Smoke test: basin names are joined for HUC levels 2-8 only (huc_range below).
test_nhd_hydronyms = get_nhd_hydronyms(
    in_hucs=test_huc_pd,
    in_wbd_path=wbd_path,
    out_dir = os.path.join(nhd_dir, 'nhdplus_hr'),
    huc_range=[2, 4, 6, 8]
)

#Create a set with all unique hydronyms from basins and rivers from NHD
# Only the HUC2, HUC4 and HUC6 basin names are used here, together with the river
# names of every HU4.
# NOTE(review): basin names come from a left merge, so the melted values may
# include NaN — confirm whether it should be dropped.
test_nhd_hydronyms_set = set([
    *pd.melt(test_nhd_hydronyms['basins_all_pd'], 
             value_vars=[f'huc{lev}_name' for lev in [2, 4, 6]]
            ).value.unique(),
    *set({x for v in test_nhd_hydronyms['rivers_huc4_dict'].values() for x in v})
])
len(test_nhd_hydronyms_set)

Getting NHD basin names
0702


  return ogr_read(


0703


  return ogr_read(


0709


  return ogr_read(


0707


  return ogr_read(


0704


  return ogr_read(


0701


  return ogr_read(


0512


  return ogr_read(


0713


  return ogr_read(


0802


  return ogr_read(


0705


  return ogr_read(


0514


  return ogr_read(


0711


  return ogr_read(


1023


  return ogr_read(


0712


  return ogr_read(


0708


  return ogr_read(


0714


  return ogr_read(


0706


  return ogr_read(


0710


  return ogr_read(


0801


  return ogr_read(


1028


  return ogr_read(


1030


  return ogr_read(


1024


  return ogr_read(


1101


  return ogr_read(


0511


  return ogr_read(


0604


  return ogr_read(


1029


  return ogr_read(


0513


  return ogr_read(


0401


  return ogr_read(


0403


  return ogr_read(


In [56]:
def get_nhd_data(in_hucs_pd, out_dir):
    """
    Load NHDPlus HR flowlines joined to their value-added attributes, per HU4.

    Args:
        in_hucs_pd (pandas.DataFrame): Table with a 'huc4' column; one
            geodatabase is processed per unique value.
        out_dir (str): Directory holding the 'NHDPLUS_H_<huc4>_HU4_GDB.gdb'
            geodatabases; also passed to `download_nhdplus_hr_hu4`.

    Returns:
        dict: Maps each HU4 code to the 'NHDFlowline' layer inner-joined on
            'NHDPlusID' to the 'NHDPlusFlowlineVAA' layer. Columns present in
            both layers keep their name for the flowlines and get a '_vaa'
            suffix for the attributes.
    """
    print('Getting NHD data for all HU4')

    # First pass: call the downloader for every HU4 and record where its
    # geodatabase is expected.
    gdb_by_hu4 = {}
    for hu4_code in in_hucs_pd.huc4.unique():
        download_nhdplus_hr_hu4(hu4=hu4_code, out_dir=out_dir, verbose=False)
        gdb_name = f'NHDPLUS_H_{hu4_code}_HU4_GDB.gdb'
        gdb_by_hu4[hu4_code] = os.path.join(out_dir, gdb_name)

    # Second pass: read both layers of each geodatabase and join them.
    merged_by_hu4 = {}
    for hu4_code, gdb_path in gdb_by_hu4.items():
        print(hu4_code)
        flowlines = gpd.read_file(filename=gdb_path,
                                  layer='NHDFlowline',
                                  ignore_geometry=False)
        attributes = gpd.read_file(filename=gdb_path,
                                   layer='NHDPlusFlowlineVAA',
                                   ignore_geometry=False)
        merged_by_hu4[hu4_code] = flowlines.merge(attributes,
                                                  how='inner',
                                                  on='NHDPlusID',
                                                  suffixes=('', '_vaa'))

    return merged_by_hu4

# Smoke test restricted to two HU4 units (0511 and 1030).
test_nhdplus_gpd_dict = get_nhd_data(
    in_hucs_pd=test_huc_pd[test_huc_pd['huc4'].isin(['0511', '1030'])], 
    out_dir=os.path.join(nhd_dir, 'nhdplus_hr')
)

Getting NHD data for all HU4
1030


  return ogr_read(


0511


  return ogr_read(


Unnamed: 0,Permanent_Identifier,FDate,Resolution,GNIS_ID,GNIS_Name,LengthKM,ReachCode,FlowDir,WBArea_Permanent_Identifier,FType,...,MinElevRaw,MaxElevSmo,MinElevSmo,Slope,SlopeLenKm,ElevFixed,HWType,HWNodeSqKm,StatusFlag,VPUID_vaa
0,31954731,2016-05-25 00:00:00+00:00,2,,,0.042435,05110003000792,1,120007605,558,...,11818.0,11818.0,11818.0,0.000010,0.042435,1,,,A,0511
1,31954765,2012-03-28 06:48:00+00:00,2,,,0.157000,05110003002341,1,31960569,558,...,11995.0,11995.0,11995.0,0.000010,0.057000,0,0.0,0.2514,A,0511
2,31956873,2016-05-25 00:00:00+00:00,2,,,0.121340,05110003008664,1,31962626,558,...,13695.0,13695.0,13695.0,0.000010,0.121340,1,,,A,0511
3,31956956,2016-05-25 00:00:00+00:00,2,00507017,Wolf Lick Creek,0.503370,05110003000051,1,120007605,558,...,12124.0,12170.0,12170.0,0.000010,0.503370,0,,,A,0511
4,31957304,2012-09-06 00:00:00+00:00,2,00499011,Mud River,1.806182,05110003001458,1,120007605,558,...,12580.0,12731.0,12731.0,0.000010,1.806182,0,,,A,0511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49469,{8E13A622-40D2-4CCD-841D-59196904AFA0},2012-08-29 00:00:00+00:00,2,,,1.004036,05110002018710,1,,420,...,20093.0,20968.0,20968.0,0.000010,0.904036,1,0.0,0.0032,A,0511
49470,{44144425-099D-49E6-B062-08DF6384DF97},2012-08-29 00:00:00+00:00,2,,,2.009940,05110002018711,1,,420,...,20270.0,21882.0,20968.0,0.004785,1.909940,0,0.0,0.2841,A,0511
49471,{F56B549C-35ED-46B9-9E7E-620B6DEF2DD0},2012-08-29 00:00:00+00:00,2,,,0.246632,05110002018980,1,,420,...,22770.0,23035.0,22770.0,0.018072,0.146632,0,0.0,0.0135,A,0511
49472,{DA116B40-7208-45CA-9ECC-099F379A4F73},2012-08-29 00:00:00+00:00,2,,,0.453731,05110002018993,1,,420,...,16365.0,16367.0,16367.0,0.000010,0.453731,0,,,A,0511


In [None]:
def get_geoglows_hydronyms():
    """Placeholder: only prints a status message for now and returns None."""
    status_msg = 'Getting geoglows river names'
    print(status_msg)



In [None]:

        
def get_hydroatlas_data():
    """Placeholder: only prints a status message for now and returns None."""
    status_msg = 'Getting HydroATLAS data'
    print(status_msg)

In [None]:
def load_spatial_data(point_locations_path, nhd_path, hydrobasins_path, admin_units_path,
                      lon_col='longitude', lat_col='latitude', crs='EPSG:4326'):
    """
    Load the point locations and the three boundary layers.

    Args:
        point_locations_path (str): CSV table of point coordinates.
        nhd_path (str): Vector layer of NHD units.
        hydrobasins_path (str): Vector layer of HydroBASINS units.
        admin_units_path (str): Vector layer of administrative units.
        lon_col (str): Longitude column of the points table.
        lat_col (str): Latitude column of the points table.
        crs: Coordinate reference system of the point coordinates.

    Returns:
        tuple: (points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf) GeoDataFrames.

    Raises:
        ValueError: If `lon_col` or `lat_col` is missing from the points table.
    """
    # Convert your points to GeoDataFrame
    # NOTE(review): the default column names and CRS are assumptions (the points
    # table schema is not visible here) — adjust them to the actual file.
    points_df = pd.read_csv(point_locations_path)
    missing_cols = [col for col in (lon_col, lat_col) if col not in points_df.columns]
    if missing_cols:
        raise ValueError(
            f"Columns {missing_cols} not found in {point_locations_path}")
    points_gdf = gpd.GeoDataFrame(
        points_df,
        geometry=gpd.points_from_xy(points_df[lon_col], points_df[lat_col]),
        crs=crs
    )

    # Load boundary files
    nhd_gdf = gpd.read_file(nhd_path)
    hydrobasins_gdf = gpd.read_file(hydrobasins_path)
    admin_gdf = gpd.read_file(admin_units_path)

    return points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf

In [None]:
# 2. Perform spatial joins to get catchment information
def get_catchment_info(point_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf):
    """
    Collect the names of the units that contain the points.

    Each boundary layer is joined to the points (left join, 'within' predicate)
    and the non-missing values of its name columns are gathered.

    Args:
        point_gdf (geopandas.GeoDataFrame): Point locations.
        nhd_gdf (geopandas.GeoDataFrame): NHD units; names are read from
            'BASIN_NAME' and 'RIVER_NAME' when present.
        hydrobasins_gdf (geopandas.GeoDataFrame): HydroBASINS units; names are
            read from 'HYBAS_NAME' when present.
        admin_gdf (geopandas.GeoDataFrame): Administrative units; names are read
            from 'COUNTY_NAME', 'STATE_NAME' and 'COUNTRY_NAME' when present.

    Returns:
        set: Unique names found across the three layers.
    """
    # Boundary layer paired with its candidate name columns
    # (adjust column names as needed)
    name_sources = (
        (nhd_gdf, ['BASIN_NAME', 'RIVER_NAME']),
        (hydrobasins_gdf, ['HYBAS_NAME']),
        (admin_gdf, ['COUNTY_NAME', 'STATE_NAME', 'COUNTRY_NAME']),
    )

    location_names = set()
    for boundaries_gdf, name_cols in name_sources:
        # `predicate` replaces the former `op` keyword of sjoin, which recent
        # geopandas releases no longer accept.
        joined = gpd.sjoin(point_gdf, boundaries_gdf,
                           how='left', predicate='within')
        for col in name_cols:
            if col in joined.columns:
                location_names.update(joined[col].dropna().unique())

    return location_names

In [None]:
# 3. Process OpenAlex records
def process_openalex_records(openalex_df, location_names):
    """
    Keep the records whose title, abstract or keywords mention a location name.

    Matching is case-insensitive. A record is kept when a location name occurs
    as a substring of its combined text, or equals one of the 1- to 3-word
    n-grams of its tokenized text.

    Args:
        openalex_df (pandas.DataFrame): Records; the 'title', 'abstract' and
            'keywords' columns are used when present.
        location_names (iterable): Location names. Entries that are not strings
            or are blank are ignored; surrounding whitespace is removed.

    Returns:
        pandas.DataFrame: Copy of the matching rows of `openalex_df`, with its
            columns, index labels and dtypes (empty when nothing matches).
    """
    text_columns = ('title', 'abstract', 'keywords')

    def field_text(value):
        # Missing scalars would otherwise be rendered as 'None'/'nan' and could
        # match a place name; treat them as empty text.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value)

    # Function to generate n-grams
    def get_ngrams(text, n_range=(1, 3)):
        tokens = nltk.word_tokenize(text)
        all_ngrams = set()
        for n in range(n_range[0], n_range[1] + 1):
            all_ngrams.update(' '.join(gram) for gram in ngrams(tokens, n))
        return all_ngrams

    # Convert location names to lowercase for matching. Blank names are dropped:
    # an empty string is a substring of every text and would match all records.
    wanted_names = {
        name.strip().lower()
        for name in location_names
        if isinstance(name, str) and name.strip()
    }

    # Process each record
    is_matched = []
    for _, record in openalex_df.iterrows():
        # Combine all text fields
        combined_text = ' '.join(
            field_text(record.get(col, '')) for col in text_columns
        ).lower()

        # Check for matches; tokenize only when the substring test fails
        matched = any(name in combined_text for name in wanted_names)
        if not matched and wanted_names:
            matched = not wanted_names.isdisjoint(get_ngrams(combined_text))
        is_matched.append(matched)

    # Positional boolean mask, so that duplicated index labels are handled too
    return openalex_df.loc[np.array(is_matched, dtype=bool)].copy()

In [None]:
# Main execution
def main():
    """
    Run the whole workflow: load layers, collect place names, filter OpenAlex records.

    Reads the placeholder input files named below (replace with actual file
    paths) and writes the matching records to 'matched_records.csv'.
    """
    # Input layers, in the order expected by load_spatial_data
    spatial_inputs = (
        'points.csv',
        'nhd.shp',
        'hydrobasins.shp',
        'admin_units.shp',
    )
    pts, nhd_units, basins, admin_units = load_spatial_data(*spatial_inputs)

    # Names of the units containing the points
    place_names = get_catchment_info(pts, nhd_units, basins, admin_units)

    # Candidate records (replace with actual file path)
    works = pd.read_csv('openalex_records.csv')

    # Keep the records mentioning one of the names and save them
    hits = process_openalex_records(works, place_names)
    hits.to_csv('matched_records.csv', index=False)
    