In [None]:
import geopandas as gpd
import pandas as pd
from collections import defaultdict, deque

# === Step 1: Load full HydroBASINS and lake-linked catchments
# hydro keeps the polygon geometry plus the two ID columns needed to build the flow graph.
hydro = gpd.read_file("Datasets/na/hybas_lake_na_lev08_v1c.shp")[['HYBAS_ID', 'NEXT_DOWN', 'geometry']]
# Only the attribute table of the lake/catchment link file is used below, so skip
# decoding its geometries; the result is the same plain two-column DataFrame.
lake_catchments = gpd.read_file(
    "Datasets/final/lake_catchments_na_cleaned.shp",
    ignore_geometry=True,
)[['Lake_ID', 'HYBAS_ID']]

# === Step 2: Invert the drainage network ===
# flow_dict:     HYBAS_ID -> NEXT_DOWN (the catchment it drains into)
# reverse_graph: HYBAS_ID -> set of catchments that drain directly into it
next_down_by_id = pd.Series(hydro['NEXT_DOWN'].values, index=hydro['HYBAS_ID'])
flow_dict = next_down_by_id.to_dict()

reverse_graph = defaultdict(set)
for upstream_id, downstream_id in flow_dict.items():
    if not downstream_id > 0:
        # Non-positive NEXT_DOWN values add no edge (presumably the
        # "no downstream catchment" marker -- confirm against the dataset docs).
        continue
    reverse_graph[downstream_id].add(upstream_id)

# === Step 3: Collect every catchment upstream of each lake ===
# A catchment reachable from several lakes appears once per lake (duplicates across
# lakes are intentional); within one lake each catchment is collected only once.
upstream_rows = []

for lake_id, lake_rows in lake_catchments.groupby('Lake_ID'):
    # Breadth-first walk against the flow direction, seeded with the lake's own catchments.
    frontier = deque(lake_rows['HYBAS_ID'])
    reached = set()

    while frontier:
        basin_id = frontier.popleft()
        if basin_id not in reached:
            reached.add(basin_id)
            # .get() avoids creating empty entries in the defaultdict for headwater catchments.
            frontier.extend(reverse_graph.get(basin_id, ()))

    # Pull the polygons of all reached catchments and tag them with this lake.
    in_watershed = hydro['HYBAS_ID'].isin(reached)
    lake_basins = hydro[in_watershed].assign(Lake_ID=lake_id)
    upstream_rows.append(lake_basins[['Lake_ID', 'geometry']])

# === Step 4: Stack the per-lake frames (catchments shared by several lakes stay duplicated)
stacked_rows = pd.concat(upstream_rows, ignore_index=True)
merged_gdf = gpd.GeoDataFrame(stacked_rows, crs=hydro.crs)  # carry over the source CRS

total_rows = len(merged_gdf)
lakes_covered = merged_gdf['Lake_ID'].nunique()
print(f"✅ Total rows (Lake_ID, HYBAS_ID pairs): {total_rows}")
print(f"📌 Unique lakes covered: {lakes_covered}")


# Optional: persist the undissolved per-lake catchments as well
#merged_gdf.to_file("Datasets/final/upstream_catchments_by_lake.gpkg", driver="GPKG")

# === Step 5: Merge each lake's upstream catchments into a single geometry ===
# as_index=False keeps Lake_ID as a regular column in the result.
output_path = "Datasets/final/upstream_catchments_dissolved.gpkg"
dissolved = merged_gdf.dissolve(by='Lake_ID', as_index=False)

lake_count = len(dissolved)
print(f"✅ Created dissolved GeoDataFrame with {lake_count} Lake_IDs.")

# === Step 6: Write the dissolved polygons out as a GeoPackage ===
dissolved.to_file(output_path, driver="GPKG")

print(f"✅ Saved dissolved upstream catchments to: {output_path}")

✅ Total rows (Lake_ID, HYBAS_ID pairs): 65705
📌 Unique lakes covered: 466


In [5]:
from collections import defaultdict, deque
import geopandas as gpd
import pandas as pd

# Load HydroBASINS and lake-linked catchments
# hydro keeps the polygon geometry plus the two ID columns needed to build the flow graph.
hydro = gpd.read_file("Datasets/na/hybas_lake_na_lev08_v1c.shp")[['HYBAS_ID', 'NEXT_DOWN', 'geometry']]
# Only the attribute table of the lake/catchment link file is used below, so skip
# decoding its geometries; the result is the same plain two-column DataFrame.
lake_catchments = gpd.read_file(
    "Datasets/final/lake_catchments_na_cleaned.shp",
    ignore_geometry=True,
)[['Lake_ID', 'HYBAS_ID']]

# Build the inverted drainage graph.
# flow_dict:     HYBAS_ID -> NEXT_DOWN (the catchment it drains into)
# reverse_graph: HYBAS_ID -> set of catchments that drain directly into it
next_down_by_id = pd.Series(hydro['NEXT_DOWN'].values, index=hydro['HYBAS_ID'])
flow_dict = next_down_by_id.to_dict()

reverse_graph = defaultdict(set)
for upstream_id, downstream_id in flow_dict.items():
    if not downstream_id > 0:
        # Non-positive NEXT_DOWN values add no edge (presumably the
        # "no downstream catchment" marker -- confirm against the dataset docs).
        continue
    reverse_graph[downstream_id].add(upstream_id)

# Lookup: HYBAS_ID -> set of every Lake_ID linked to that catchment
# (a catchment can be linked to more than one lake).
lakes_per_catchment = lake_catchments.groupby("HYBAS_ID")["Lake_ID"].apply(set)
hybas_to_lakes = lakes_per_catchment.to_dict()

# Trace upstream per lake, but do not walk past catchments linked to a different lake.
# The boundary catchment itself is still recorded for this lake; only its upstream
# neighbours are left out.
# NOTE(review): the stop rule also applies to a lake's own seed catchment when that
# catchment is shared with another lake -- confirm this is the intended behaviour.
lake_to_upstream = defaultdict(set)

for lake_id, lake_rows in lake_catchments.groupby("Lake_ID"):
    frontier = deque(lake_rows['HYBAS_ID'])
    reached = set()

    while frontier:
        basin_id = frontier.popleft()
        if basin_id in reached:
            continue
        reached.add(basin_id)

        # Lakes other than the current one that are linked to this catchment.
        other_lakes = hybas_to_lakes.get(basin_id, set()) - {lake_id}
        if not other_lakes:
            # Catchment is exclusive to this lake (or unlinked): keep walking upstream.
            frontier.extend(reverse_graph.get(basin_id, ()))

    lake_to_upstream[lake_id] = reached

# Build one (Lake_ID, geometry) frame per lake from its traced catchment IDs.
records = [
    hydro[hydro['HYBAS_ID'].isin(hybas_ids)].assign(Lake_ID=lake_id)[['Lake_ID', 'geometry']]
    for lake_id, hybas_ids in lake_to_upstream.items()
]

# Stack the per-lake frames and carry over the source CRS.
stacked_rows = pd.concat(records, ignore_index=True)
merged_gdf = gpd.GeoDataFrame(stacked_rows, crs=hydro.crs)

# Collapse to a single geometry per lake (Lake_ID stays a regular column) and save.
dissolved = merged_gdf.dissolve(by='Lake_ID', as_index=False)
dissolved.to_file("Datasets/final/upstream_stop_at_other_lakes.gpkg", driver="GPKG")
print("✅ Saved alternative dissolved catchments (stopping at other lakes).")


✅ Saved alternative dissolved catchments (stopping at other lakes).
