In [8]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import nltk
from nltk.util import ngrams
import re

In [None]:
# 1. Load your spatial data
def load_spatial_data(point_locations_path, nhd_path, hydrobasins_path, admin_units_path,
                      points_crs="EPSG:4326"):
    """Load the point locations and the three boundary layers.

    Parameters
    ----------
    point_locations_path : path-like
        CSV file read with ``pd.read_csv``; it must provide ``longitude`` and
        ``latitude`` columns.
    nhd_path, hydrobasins_path, admin_units_path : path-like
        Vector files read with ``gpd.read_file``.
    points_crs : str or None, default "EPSG:4326"
        CRS assigned to the point geometries. The default assumes the
        ``longitude``/``latitude`` columns are WGS84 decimal degrees -- confirm
        for your data. Pass ``None`` to leave the points without a CRS.

    Returns
    -------
    tuple
        ``(points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf)``.
    """
    # Convert your points to GeoDataFrame
    points_df = pd.read_csv(point_locations_path)
    points_gdf = gpd.GeoDataFrame(
        points_df,
        geometry=[Point(xy) for xy in zip(points_df.longitude, points_df.latitude)],
        # Without a CRS the points cannot be reprojected, and spatial joins
        # against layers that do carry a CRS run with an unknown reference.
        crs=points_crs,
    )

    # Load boundary files
    nhd_gdf = gpd.read_file(nhd_path)
    hydrobasins_gdf = gpd.read_file(hydrobasins_path)
    admin_gdf = gpd.read_file(admin_units_path)

    return points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf

In [None]:
# 2. Perform spatial joins to get catchment information
def get_catchment_info(point_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf):
    """Collect the names of every boundary feature that contains a point.

    Each boundary layer is spatially joined to the points (left join,
    ``within`` predicate) and the non-null values of that layer's name
    columns are gathered into one set.

    Parameters
    ----------
    point_gdf : GeoDataFrame
        Point geometries to look up.
    nhd_gdf, hydrobasins_gdf, admin_gdf : GeoDataFrame
        Boundary layers. Name columns that a layer does not have are skipped.

    Returns
    -------
    set
        Unique non-null values found in the name columns of the joined rows.
    """
    # Pair each layer explicitly with its name columns (adjust column names
    # as needed) rather than relying on two separate sequences staying aligned.
    layers = [
        (nhd_gdf, ['BASIN_NAME', 'RIVER_NAME']),
        (hydrobasins_gdf, ['HYBAS_NAME']),
        (admin_gdf, ['COUNTY_NAME', 'STATE_NAME', 'COUNTRY_NAME']),
    ]

    location_names = set()

    for boundary_gdf, name_cols in layers:
        points = point_gdf
        # A spatial join only makes sense when both sides share a CRS.
        # Reproject the (cheap) points when both CRSs are known and differ.
        if (point_gdf.crs is not None
                and boundary_gdf.crs is not None
                and point_gdf.crs != boundary_gdf.crs):
            points = point_gdf.to_crs(boundary_gdf.crs)

        # `predicate` replaces the `op` keyword, which was deprecated and
        # later removed from geopandas.sjoin.
        joined = gpd.sjoin(points, boundary_gdf, how='left', predicate='within')

        for col in name_cols:
            if col in joined.columns:
                location_names.update(joined[col].dropna().unique())

    return location_names

In [None]:
# 3. Process OpenAlex records
def process_openalex_records(openalex_df, location_names, whole_word=True):
    """Return the records whose text mentions at least one location name.

    The ``title``, ``abstract`` and ``keywords`` columns (those that exist)
    are joined per record, lower-cased, and searched for any of the
    lower-cased location names.

    Parameters
    ----------
    openalex_df : pandas.DataFrame
        Records to filter.
    location_names : iterable
        Candidate names. Missing values and blank names are ignored;
        non-string values are converted with ``str``.
    whole_word : bool, default True
        When True a name only matches as a whole word or phrase, so "lee"
        does not match inside "sleep". When False a name matches anywhere
        in the text (plain substring search).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``openalex_df`` (original index, columns and
        dtypes). Empty, with the same columns, when nothing matches.
    """
    text_columns = ('title', 'abstract', 'keywords')

    def _is_missing(value):
        # pd.isna returns an array for list-like values; treat those as present.
        try:
            return bool(pd.isna(value))
        except (TypeError, ValueError):
            return False

    # Lower-case the names and collapse whitespace. Blank names are dropped:
    # an empty string is contained in every text and would match all records.
    normalised_names = set()
    for name in location_names:
        if _is_missing(name):
            continue
        words = str(name).lower().split()
        if words:
            normalised_names.add(' '.join(words))

    present_columns = [col for col in text_columns if col in openalex_df.columns]
    if not normalised_names or not present_columns:
        return openalex_df.iloc[0:0].copy()

    # One alternation over all names; words of a multi-word name may be
    # separated by any run of whitespace (e.g. a line break in an abstract).
    alternatives = '|'.join(
        r'\s+'.join(re.escape(word) for word in name.split(' '))
        for name in sorted(normalised_names)
    )
    if whole_word:
        # Lookarounds instead of \b so names that start or end with
        # punctuation (e.g. "st.") are still anchored correctly.
        pattern = re.compile(r'(?<!\w)(?:' + alternatives + r')(?!\w)')
    else:
        pattern = re.compile(alternatives)

    is_match = []
    for values in zip(*(openalex_df[col] for col in present_columns)):
        # Skip missing fields rather than stringifying them to "nan",
        # which would itself be searched as text.
        text = ' '.join(str(v) for v in values if not _is_missing(v)).lower()
        is_match.append(pattern.search(text) is not None)

    # Positional boolean mask, so duplicate index labels are handled too.
    mask = pd.Series(is_match, dtype=bool).to_numpy()
    return openalex_df.loc[mask].copy()

In [None]:
# Main execution
def main(points_path='points.csv',
         nhd_path='nhd.shp',
         hydrobasins_path='hydrobasins.shp',
         admin_units_path='admin_units.shp',
         openalex_path='openalex_records.csv',
         output_path='matched_records.csv'):
    """Run the full pipeline: load layers, collect names, filter records, save.

    Parameters
    ----------
    points_path : path-like
        CSV of point locations passed to ``load_spatial_data``.
    nhd_path, hydrobasins_path, admin_units_path : path-like
        Boundary files passed to ``load_spatial_data``.
    openalex_path : path-like
        CSV of OpenAlex records to filter.
    output_path : path-like
        Destination CSV for the matched records (written without the index).

    The defaults are placeholder file names; replace them with actual paths.
    """
    # Load the spatial layers
    points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf = load_spatial_data(
        points_path,
        nhd_path,
        hydrobasins_path,
        admin_units_path,
    )

    # Get catchment information
    location_names = get_catchment_info(points_gdf, nhd_gdf, hydrobasins_gdf, admin_gdf)

    # Load and process OpenAlex records
    openalex_df = pd.read_csv(openalex_path)

    # Filter records based on location names
    matched_records = process_openalex_records(openalex_df, location_names)

    # Save results
    matched_records.to_csv(output_path, index=False)
    