In [24]:
%run download_geographic_refs.ipynb
%run set_up.py 

import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import nltk
from nltk.util import ngrams
import re

# Notebook-wide switch for progress messages
verbose = True

In [55]:
#Create a list of NHD HUC12
def create_huc12_list(wbd_path, 
                      out_hu12_parquet,
                      verbose=True):
    """Return the table of HUC12 codes, building a Parquet cache on first use.

    If ``out_hu12_parquet`` does not exist yet, the ``huc12`` attribute of the
    ``WBDHU12`` layer is read (without geometry) from ``wbd_path`` and written
    to ``out_hu12_parquet``. Otherwise the cached Parquet file is read back.

    Parameters
    ----------
    wbd_path : str
        Path of the data source holding the ``WBDHU12`` layer.
    out_hu12_parquet : str
        Path of the Parquet cache to create or reuse.
    verbose : bool, default True
        Print a message when the cache is being generated.

    Returns
    -------
    pandas.DataFrame
        Table with a ``huc12`` column.
    """
    if os.path.exists(out_hu12_parquet):
        return pd.read_parquet(out_hu12_parquet)

    if verbose:
        # Single-line f-string: a backslash continuation inside the literal
        # would leak the source indentation into the printed message.
        print(f'Generating a list of HUC 12 and saving it to {out_hu12_parquet}')

    wbdhu12_list = gpd.read_file(
        filename=wbd_path,
        layer='WBDHU12',
        # NOTE(review): hard cap on the number of rows read; a layer with more
        # than 105000 features would be truncated silently -- confirm.
        rows=105000,
        columns=['huc12'],
        ignore_geometry=True)
    wbdhu12_list.to_parquet(out_hu12_parquet)
    return wbdhu12_list

# Paths of the WBD geodatabase and of the cached HUC12 table, then load the table
wbd_path = os.path.join(nhd_dir, 'WBD_National_GDB.gdb')
hu12_parquet = os.path.join(nhd_dir, 'wbd_hu12list.parquet')
hu12_list = create_huc12_list(wbd_path=wbd_path,
                              out_hu12_parquet=hu12_parquet)

In [67]:
#Create a list of PFAF_ID for basins level 11
def create_basinatlas11_list(basinatlas_path, 
                             out_basinatlas11_parquet, 
                             verbose=True):
    """Return the table of level-11 PFAF IDs, building a Parquet cache on first use.

    If ``out_basinatlas11_parquet`` does not exist yet, the ``PFAF_ID``
    attribute of the ``BasinATLAS_v10_lev11`` layer is read (without geometry)
    from ``basinatlas_path``, renamed to ``PFAF_ID11`` and written to the cache.
    Otherwise the cache is read back. The rename is applied on both paths so a
    cache written before the column was renamed yields the same schema.

    Parameters
    ----------
    basinatlas_path : str
        Path of the data source holding the ``BasinATLAS_v10_lev11`` layer.
    out_basinatlas11_parquet : str
        Path of the Parquet cache to create or reuse.
    verbose : bool, default True
        Print a message when the cache is being generated.

    Returns
    -------
    pandas.DataFrame
        Table with a ``PFAF_ID11`` column.
    """
    # The source attribute is 'PFAF_ID' (the original rename key 'PFAD_ID' was
    # a typo, which made the rename a silent no-op).
    id_renaming = {'PFAF_ID': 'PFAF_ID11'}

    if os.path.exists(out_basinatlas11_parquet):
        return pd.read_parquet(out_basinatlas11_parquet).rename(columns=id_renaming)

    if verbose:
        # Single-line f-string: a backslash continuation inside the literal
        # would leak the source indentation into the printed message.
        print('Generating a list of PFAF ID level 11 and saving it to '
              f'{out_basinatlas11_parquet}')

    basinatlas11_list = gpd.read_file(
        filename=basinatlas_path,
        layer='BasinATLAS_v10_lev11',
        columns=['PFAF_ID'],
        # NOTE(review): hard cap on the number of rows read -- confirm it
        # covers the whole layer.
        rows=1031785,
        ignore_geometry=True
    ).rename(columns=id_renaming)

    basinatlas11_list.to_parquet(out_basinatlas11_parquet)
    return basinatlas11_list

# Paths of the BasinATLAS geodatabase and of the cached level-11 ID table,
# then load the table
basinatlas_path = os.path.join(hydroatlas_dir, 'BasinATLAS_v10.gdb')
basinatlas11_parquet = os.path.join(hydroatlas_dir,
                                    'basinatlas_lev11_idlist.parquet')
basinatlas11_list = create_basinatlas11_list(
    basinatlas_path=basinatlas_path,
    out_basinatlas11_parquet=basinatlas11_parquet)

Generating a list of PFAF ID level 11 and saving it to             D:\WWF_SBTN\BTT_analysis\data\hydroatlas\basinatlas_lev11_idlist.parquet


               huc12 huc2  huc4    huc6      huc8       huc10
0       070200090402   07  0702  070200  07020009  0702000904
1       070200030503   07  0702  070200  07020003  0702000305
2       070200030602   07  0702  070200  07020003  0702000306
6       070200030701   07  0702  070200  07020003  0702000307
7       070300040804   07  0703  070300  07030004  0703000408
...              ...  ...   ...     ...       ...         ...
101242  071200010601   07  0712  071200  07120001  0712000106
101243  071200011301   07  0712  071200  07120001  0712000113
101244  071200010501   07  0712  071200  07120001  0712000105
101246  071200011002   07  0712  071200  07120001  0712000110
101248  071200030306   07  0712  071200  07120003  0712000303

[5152 rows x 6 columns]


In [165]:
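# 1. Load your spatial data
#
# The area of interest can be given as one of:
#   - point locations (vector file)
#   - a table of geographic coordinates (CSV with longitude/latitude columns)
#   - a list of HUC codes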


def _subset_expand_huclist(in_id_list, in_refids_parquet):
    in_huc_len = len(in_id_list[0])
    huc_all_pd = pd.read_parquet(in_refids_parquet)
    
    huc_pd = huc_all_pd[
    huc_all_pd['huc12'].str[:(in_huc_len)].isin(in_id_list)
    ].copy()
    
    for huc_level in huc_range:
        if f'huc{huc_level}' not in huc_pd:
            huc_pd.loc[:, f'huc{huc_level}'] = huc_pd['huc12'].str[:(huc_level)]
    return(huc_pd)

def _normalize_huc_ids(in_id_list):
    """Return ``in_id_list`` as a list of validated HUC code strings."""
    valid_lengths = range(2, 14, 2)
    huc_ids = [in_id_list] if isinstance(in_id_list, str) else list(in_id_list)
    for huc_id in huc_ids:
        if not isinstance(huc_id, str):
            raise TypeError(
                "in_id_list must contain strings (HUC codes keep their "
                f"leading zeros), got {type(huc_id).__name__}")
        if not (huc_id.isascii() and huc_id.isdigit()
                and len(huc_id) in valid_lengths):
            raise ValueError(
                f"Invalid HUC code {huc_id!r}: expected an even number of "
                "digits between 2 and 12")
    return huc_ids


def _read_points(in_points, lon_col, lat_col, points_crs):
    """Read point locations from a CSV table or a vector file."""
    points_ext = os.path.splitext(in_points)[1].lower()
    if points_ext == '.csv':
        if lon_col is None or lat_col is None:
            raise ValueError(
                "lon_col and lat_col are required when in_points is a .csv")
        points_df = pd.read_csv(in_points)
        # A CRS is mandatory here: the points are reprojected before the join
        return gpd.GeoDataFrame(
            points_df,
            geometry=gpd.points_from_xy(points_df[lon_col], points_df[lat_col]),
            crs=points_crs)
    if points_ext in ('.gpkg', '.shp', '.gdb'):
        return gpd.read_file(in_points)
    raise TypeError(
        "in_points type not recognized: can be .csv, .gpkg, .shp, or .gdb")


def get_matching_NHD_HU(in_wbd_path,
                        in_points=None, lon_col=None, lat_col=None,
                        in_polygons=None, 
                        in_id_list=None, 
                        in_refids_parquet=None,
                        hull=True, sjoin_predicate='intersects',
                        points_crs='EPSG:4326'):
    """Return the HUC12 units matching a list of HUC codes or a spatial input.

    Exactly one kind of input is used, in this order of precedence:
    ``in_id_list``, then ``in_polygons``, then ``in_points``. Spatial inputs
    are joined to the ``WBDHU6`` layer of ``in_wbd_path``; the HUC12 units
    nested in the matched HUC6 units are then taken from ``in_refids_parquet``.

    Parameters
    ----------
    in_wbd_path : str
        Path of the data source holding the ``WBDHU6`` layer.
    in_points : str, optional
        Path of a .csv, .gpkg, .shp or .gdb file of point locations.
    lon_col, lat_col : str, optional
        Coordinate columns; required when ``in_points`` is a .csv.
    in_polygons : str, optional
        Path of a vector file of polygons.
    in_id_list : str or list of str, optional
        HUC code(s) with an even number of digits between 2 and 12.
    in_refids_parquet : str
        Path of the Parquet table of HUC12 codes. Required.
    hull : bool, default True
        Replace the input geometries by the convex hull of their union
        before the spatial join.
    sjoin_predicate : str, default 'intersects'
        Predicate passed to ``geopandas.sjoin``.
    points_crs : default 'EPSG:4326'
        CRS assigned to points read from a .csv.
        NOTE(review): assumes lon/lat columns are WGS84 degrees -- confirm.

    Returns
    -------
    pandas.DataFrame
        Result of ``_subset_expand_huclist`` for the matched codes.

    Raises
    ------
    TypeError
        If a HUC code is not a string or the point file type is unsupported.
    ValueError
        If a HUC code is malformed, ``in_refids_parquet`` is missing, no input
        is given, or coordinate columns are missing for a .csv.
    """
    if not in_refids_parquet:
        raise ValueError(
            "in_refids_parquet is required to list the matching HUC12 units")

    # If a list of HUCs is provided ---------------------------------------------
    if in_id_list:
        return _subset_expand_huclist(_normalize_huc_ids(in_id_list),
                                      in_refids_parquet)

    # If points or polygons are provided ----------------------------------------
    if in_polygons:
        gdf_to_join = gpd.read_file(in_polygons)
    elif in_points:
        gdf_to_join = _read_points(in_points, lon_col, lat_col, points_crs)
    else:
        raise ValueError(
            "Provide one of in_id_list, in_points or in_polygons")

    if hull:
        # Convex hull of all input geometries merged together
        gdf_to_join = gpd.GeoDataFrame(
            geometry=[gdf_to_join.geometry.union_all().convex_hull],
            crs=gdf_to_join.crs
        )

    # Read the HUC6 layer of the WBD
    wbdhu6 = gpd.read_file(filename=in_wbd_path,
                           layer='WBDHU6',
                           columns=['huc6'])

    # Spatially join to hydrologic units
    joined = gpd.sjoin(gdf_to_join.to_crs(crs=wbdhu6.crs),
                       wbdhu6,
                       how='left',
                       predicate=sjoin_predicate)

    # The left join leaves NaN for geometries outside every HUC6: drop them
    matched_huc6 = joined['huc6'].dropna().unique().tolist()
    return _subset_expand_huclist(matched_huc6, in_refids_parquet)

# Try the lookup on the test gages: HUC12 units under the convex hull of the points
test_pts_path = os.path.join(datdir, 'test_gages', 'test_gages.shp')
test_huc_pd = get_matching_NHD_HU(in_wbd_path=wbd_path,
                                  in_points=test_pts_path,
                                  in_refids_parquet=hu12_parquet,
                                  hull=True,
                                  sjoin_predicate='intersects')
print(test_huc_pd)

#in_id_list = in_umrb_huc4s = [f'07{str(i).zfill(2)}' for i in range(2,15)]

               huc12 huc2  huc4    huc6      huc8       huc10
0       070200090402   07  0702  070200  07020009  0702000904
1       070200030503   07  0702  070200  07020003  0702000305
2       070200030602   07  0702  070200  07020003  0702000306
6       070200030701   07  0702  070200  07020003  0702000307
7       070300040804   07  0703  070300  07030004  0703000408
...              ...  ...   ...     ...       ...         ...
101242  071200010601   07  0712  071200  07120001  0712000106
101243  071200011301   07  0712  071200  07120001  0712000113
101244  071200010501   07  0712  071200  07120001  0712000105
101246  071200011002   07  0712  071200  07120001  0712000110
101248  071200030306   07  0712  071200  07120003  0712000303

[8704 rows x 6 columns]


In [None]:
def get_matching_hydrobasin(in_points=None, in_polygon=None, 
                            in_table=None, lon_col=None, lat_col=None,
                            in_id=None, hull=True):
    """Placeholder: matching of inputs to HydroBASINS units is not implemented.

    NOTE(review): presumably meant to mirror ``get_matching_NHD_HU`` for
    HydroBASINS -- confirm the intended behavior before implementing.

    Raises
    ------
    NotImplementedError
        Always. A ``def`` without a body is a SyntaxError that prevents the
        whole cell from running, hence this explicit stub.
    """
    raise NotImplementedError("get_matching_hydrobasin is not implemented yet")


def get_matching_geoglows_vpu(in_points=None, in_polygon=None, 
                              in_table=None, lon_col=None, lat_col=None,
                              in_id=None, hull=True):
    """Placeholder: matching of inputs to GEOGLOWS VPUs is not implemented.

    NOTE(review): presumably meant to mirror ``get_matching_NHD_HU`` for
    GEOGLOWS VPUs -- confirm the intended behavior before implementing.

    Raises
    ------
    NotImplementedError
        Always. A ``def`` without a body is a SyntaxError that prevents the
        whole cell from running, hence this explicit stub.
    """
    raise NotImplementedError("get_matching_geoglows_vpu is not implemented yet")
    

def load_spatial_data(point_locations_path, nhd_path, hydrobasins_path, admin_units_path,
                      lon_col='longitude', lat_col='latitude', points_crs='EPSG:4326'):
    """Load the point locations and the three boundary layers.

    Parameters
    ----------
    point_locations_path : str
        Path of the point locations. A ``.csv`` is read as a table and turned
        into points from ``lon_col``/``lat_col``; anything else is read with
        ``geopandas.read_file``.
    nhd_path, hydrobasins_path, admin_units_path : str
        Paths of the boundary layers, each read with ``geopandas.read_file``.
    lon_col, lat_col : str, default 'longitude' / 'latitude'
        Coordinate columns of a ``.csv`` point table.
        NOTE(review): default column names are an assumption -- confirm
        against the actual point table.
    points_crs : default 'EPSG:4326'
        CRS assigned to points built from a ``.csv``.
        NOTE(review): assumes coordinates are WGS84 degrees -- confirm.

    Returns
    -------
    tuple
        ``(points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf)``.

    Raises
    ------
    ValueError
        If a ``.csv`` point table lacks ``lon_col`` or ``lat_col``.
    """
    # Convert the points to a GeoDataFrame (this step was missing, so the
    # function failed with a NameError on ``points_gdf`` at the return)
    if str(point_locations_path).lower().endswith('.csv'):
        points_df = pd.read_csv(point_locations_path)
        missing_cols = [col for col in (lon_col, lat_col)
                        if col not in points_df.columns]
        if missing_cols:
            raise ValueError(
                f"Coordinate column(s) {missing_cols} not found in "
                f"{point_locations_path}")
        points_gdf = gpd.GeoDataFrame(
            points_df,
            geometry=gpd.points_from_xy(points_df[lon_col], points_df[lat_col]),
            crs=points_crs)
    else:
        points_gdf = gpd.read_file(point_locations_path)

    # Load boundary files
    nhd_gdf = gpd.read_file(nhd_path)
    hydrobasins_gdf = gpd.read_file(hydrobasins_path)
    admin_gdf = gpd.read_file(admin_units_path)

    return points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf

In [None]:
# 2. Perform spatial joins to get catchment information
def get_catchment_info(point_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf):
    """Collect the place names of the units that contain the points.

    The points are joined (left join, ``within`` predicate) to each of the
    three layers; the distinct non-null values of the known name columns
    present in each join result are gathered into one set.

    Parameters
    ----------
    point_gdf : geopandas.GeoDataFrame
        Point locations.
    nhd_gdf, hydrobasins_gdf, admin_gdf : geopandas.GeoDataFrame
        Polygon layers to look the points up in.

    Returns
    -------
    set
        Distinct names found across the three layers.
    """
    # Each layer is paired explicitly with its candidate name columns
    # (adjust column names as needed)
    layers_and_name_cols = [
        (nhd_gdf, ['BASIN_NAME', 'RIVER_NAME']),
        (hydrobasins_gdf, ['HYBAS_NAME']),
        (admin_gdf, ['COUNTY_NAME', 'STATE_NAME', 'COUNTRY_NAME']),
    ]

    location_names = set()
    for layer_gdf, name_cols in layers_and_name_cols:
        # Align the CRS first: a join across different CRSs gives wrong matches
        points_crs = getattr(point_gdf, 'crs', None)
        layer_crs = getattr(layer_gdf, 'crs', None)
        points_for_join = point_gdf
        if points_crs is not None and layer_crs is not None and points_crs != layer_crs:
            points_for_join = point_gdf.to_crs(layer_crs)

        # ``predicate`` replaces the former ``op`` keyword of sjoin
        joined = gpd.sjoin(points_for_join, layer_gdf,
                           how='left', predicate='within')

        for col in name_cols:
            if col in joined.columns:
                location_names.update(joined[col].dropna().unique())

    return location_names

In [None]:
# 3. Process OpenAlex records
def process_openalex_records(openalex_df, location_names):
    """Keep the records whose text mentions at least one location name.

    The ``title``, ``abstract`` and ``keywords`` fields of each record are
    joined into one lower-cased text. A record is kept when a location name
    occurs in that text as a substring, or equals one of its word 1- to
    3-grams.

    Parameters
    ----------
    openalex_df : pandas.DataFrame
        Records to filter; missing text columns are treated as empty.
    location_names : iterable
        Names to look for. Matching is case-insensitive; entries that are not
        strings or are blank are ignored.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``openalex_df`` (same columns, original index
        labels); empty with the same columns when nothing matches.
    """
    text_columns = ('title', 'abstract', 'keywords')

    def as_text(value):
        # Missing values must become '' and not the literal 'nan', which
        # would match any location name containing "nan"
        if value is None:
            return ''
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def get_ngrams(text, n_range=(1, 3)):
        # Word n-grams of ``text`` for every n in the inclusive range
        tokens = nltk.word_tokenize(text)
        return {
            ' '.join(gram)
            for n in range(n_range[0], n_range[1] + 1)
            for gram in ngrams(tokens, n)
        }

    # Lower-case for matching; a blank name would match every record
    wanted_names = {
        name.strip().lower()
        for name in location_names
        if isinstance(name, str) and name.strip()
    }

    matched_positions = []
    for position, (_, record) in enumerate(openalex_df.iterrows()):
        combined_text = ' '.join(
            as_text(record.get(col)) for col in text_columns
        ).lower()
        if not combined_text.strip():
            continue

        if any(name in combined_text for name in wanted_names):
            matched_positions.append(position)
            continue

        # Tokenize only when the cheaper substring test found nothing
        if not wanted_names.isdisjoint(get_ngrams(combined_text)):
            matched_positions.append(position)

    # Positional selection keeps the columns even when nothing matched
    return openalex_df.iloc[matched_positions].copy()

In [None]:
# Main execution
def main():
    """Run the pipeline: load layers, collect names, filter records, save them.

    Reads the spatial inputs and ``openalex_records.csv`` from the working
    directory and writes the matching records to ``matched_records.csv``.
    The file names below are placeholders to replace with actual paths.
    """
    spatial_inputs = (
        'points.csv',
        'nhd.shp',
        'hydrobasins.shp',
        'admin_units.shp',
    )
    points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf = load_spatial_data(*spatial_inputs)

    # Names of the catchments and administrative units containing the points
    location_names = get_catchment_info(
        points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf)

    # Keep only the OpenAlex records that mention one of those names
    openalex_df = pd.read_csv('openalex_records.csv')
    matched_records = process_openalex_records(openalex_df, location_names)

    matched_records.to_csv('matched_records.csv', index=False)
    