In [3]:
import os
import re
import geopandas as gpd

# Paths -- both resolved to absolute form from the current working directory
project_root = os.getcwd()
lake_file = os.path.abspath(os.path.join(project_root, "Datasets/lakes/CCILakesV202.shp"))
csv_folder = os.path.abspath(os.path.join(project_root, "Datasets/CNR/CHLA/"))

# Step 1: Read the lake shapefile and preview its non-null Lake_ID values
lake_gdf = gpd.read_file(lake_file)
lake_ids_raw = lake_gdf['Lake_ID'].dropna()

print("Sample shapefile LAKE_IDs:")
print(lake_ids_raw.head(10))

# Cast to int first, then to str, so every ID becomes a plain integer string
# (the recorded output shows the column as float64, e.g. 10.0 -> '10').
lake_id_strings = lake_ids_raw.astype(int).astype(str)
lake_ids = set(lake_id_strings)
print(f"\nNormalized LAKE_IDs (shapefile): {sorted(lake_ids)[:10]}")

# Step 2: Collect the digit run from every filename that starts with "ID<digits>_"
filename_id_re = re.compile(r"ID(\d+)_")
csv_files = os.listdir(csv_folder)
lake_ids_in_folder = set()

for filename in csv_files:
    match = filename_id_re.match(filename)
    if not match:
        # Not a per-lake file name; ignore it.
        continue
    lake_ids_in_folder.add(match.group(1))

print(f"\nSample lake IDs from filenames: {sorted(lake_ids_in_folder)[:10]}")

# Step 3: Bring the filename IDs to the same canonical form as the shapefile IDs
# (int round-trip drops any leading zeros).
lake_ids_folder_normalized = {str(int(raw_id)) for raw_id in lake_ids_in_folder}

print(f"\nNormalized LAKE_IDs (from folder): {sorted(lake_ids_folder_normalized)[:10]}")

# Step 4: IDs present in the shapefile that have no matching file in the folder
missing_lakes = lake_ids.difference(lake_ids_folder_normalized)

print(f"\nTotal in shapefile: {len(lake_ids)}")
print(f"Total in folder: {len(lake_ids_folder_normalized)}")
print(f"Missing lake IDs: {sorted(missing_lakes)}")


Sample shapefile LAKE_IDs:
0           10.0
1          100.0
2    100000001.0
3    100000002.0
4    100000003.0
5    100000004.0
6    100000011.0
7    100000012.0
8    100000013.0
9    100000014.0
Name: Lake_ID, dtype: float64

Normalized LAKE_IDs (shapefile): ['10', '100', '100000001', '100000002', '100000003', '100000004', '100000011', '100000012', '100000013', '100000014']

Sample lake IDs from filenames: ['10', '100', '100000001', '100000002', '100000003', '100000004', '100000011', '100000012', '100000013', '100000014']

Normalized LAKE_IDs (from folder): ['10', '100', '100000001', '100000002', '100000003', '100000004', '100000011', '100000012', '100000013', '100000014']

Total in shapefile: 2024
Total in folder: 2012
Missing lake IDs: ['1054', '112', '168', '2', '228', '300009430', '337', '390', '443', '498', '53', '723']


In [8]:
import os
import re
# Set folder paths (relative to the notebook's working directory)
chla_folder = "Datasets/CNR/CHLA/"
turb_folder = "Datasets/CNR/turbidity/"

# Regex pattern to extract lake ID: the filename must start with "ID<digits>_"
pattern = re.compile(r"ID(\d+)_")


# Helper to extract IDs from filenames in a folder
def extract_ids(folder_path):
    """Return the set of lake IDs encoded in the filenames of ``folder_path``.

    Only directory entries whose name starts with ``ID<digits>_`` contribute;
    every other entry is ignored.

    Each ID is normalised with ``str(int(...))`` so that differently padded
    spellings of the same lake (``ID007_...`` vs ``ID7_...``) collapse to one
    value. This matches the normalisation the shapefile-vs-folder check in
    this notebook applies before comparing ID sets.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        set[str]: Canonical integer strings, e.g. ``{"10", "100000001"}``.
        Empty if no entry matches.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
    """
    return {
        str(int(found.group(1)))
        for found in map(pattern.match, os.listdir(folder_path))
        if found  # pattern.match returns None for non-matching names
    }

# Extract IDs
chla_ids = extract_ids(chla_folder)
turb_ids = extract_ids(turb_folder)

# Find mismatches: IDs that appear in exactly one of the two folders
only_in_chla = chla_ids.difference(turb_ids)
only_in_turb = turb_ids.difference(chla_ids)
# The two one-sided differences are disjoint, so their union equals the
# symmetric difference of the two ID sets.
not_in_both = chla_ids.symmetric_difference(turb_ids)

# Output -- one argument per line; the last one carries its own leading blank line
print(
    f"IDs only in CHLA: {sorted(only_in_chla)}",
    f"IDs only in TURB: {sorted(only_in_turb)}",
    f"IDs NOT in both: {sorted(not_in_both)}",
    f"\nNumber of lakes with both: {len(chla_ids & turb_ids)}",
    sep="\n",
)


IDs only in CHLA: ['1054', '1062', '157', '188', '198', '210', '2132', '2517', '27', '28', '300009430', '300013431', '323', '3476', '351', '3566', '382', '383', '483', '64', '71', '723', '833']
IDs only in TURB: []
IDs NOT in both: ['1054', '1062', '157', '188', '198', '210', '2132', '2517', '27', '28', '300009430', '300013431', '323', '3476', '351', '3566', '382', '383', '483', '64', '71', '723', '833']

Number of lakes with both: 1992


In [9]:
import os
import re

# Set folder paths (relative to the notebook's working directory)
chla_folder = "Datasets/CNR/CHLA/"
turb_folder = "Datasets/CNR/turbidity/"

# Regex pattern to extract lake ID: the filename must start with "ID<digits>_"
pattern = re.compile(r"ID(\d+)_")


# Helper to extract IDs from filenames in a folder
def extract_ids(folder_path):
    """Return the set of lake IDs encoded in the filenames of ``folder_path``.

    Only directory entries whose name starts with ``ID<digits>_`` contribute;
    every other entry is ignored.

    Each ID is normalised with ``str(int(...))`` so that differently padded
    spellings of the same lake (``ID007_...`` vs ``ID7_...``) collapse to one
    value. This matches the normalisation the shapefile-vs-folder check in
    this notebook applies before comparing ID sets.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        set[str]: Canonical integer strings, e.g. ``{"10", "100000001"}``.
        Empty if no entry matches.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, it does not exist).
    """
    return {
        str(int(found.group(1)))
        for found in map(pattern.match, os.listdir(folder_path))
        if found  # pattern.match returns None for non-matching names
    }

# Extract IDs
chla_ids = extract_ids(chla_folder)
turb_ids = extract_ids(turb_folder)

# Find mismatches: IDs that appear in exactly one of the two folders
only_in_chla = chla_ids.difference(turb_ids)
only_in_turb = turb_ids.difference(chla_ids)
# The two one-sided differences are disjoint, so their union equals the
# symmetric difference of the two ID sets.
not_in_both = chla_ids.symmetric_difference(turb_ids)

# Output -- one argument per line; the last one carries its own leading blank line
print(
    f"IDs only in CHLA: {sorted(only_in_chla)}",
    f"IDs only in TURB: {sorted(only_in_turb)}",
    f"IDs NOT in both: {sorted(not_in_both)}",
    f"\nNumber of lakes with both: {len(chla_ids & turb_ids)}",
    sep="\n",
)


IDs only in CHLA: []
IDs only in TURB: ['1054', '300009430', '723']
IDs NOT in both: ['1054', '300009430', '723']

Number of lakes with both: 2012
