In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import time
from pathlib import Path

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import ee
import geemap

In [None]:
# Authenticate with Google Earth Engine before the client is initialized.
# NOTE(review): called without arguments; whether this really forces a new
# login when cached credentials exist depends on the installed `ee` version — confirm.
ee.Authenticate()

In [None]:
# Cloud project identifier handed to Earth Engine at initialization.
PROJECT_ID = "miningcandb"  
ee.Initialize(project=PROJECT_ID) 
# Confirmation message so the reader sees which project the session is bound to.
print("✅ Earth Engine is now initialized with project:", PROJECT_ID)

# Import cleaned datasets

In [None]:
# Canada SHP files.
# canada_boundary is used further down to clip global polygon datasets;
# canada_provinces is passed to assign_id for every table.
canada_boundary = gpd.read_file("data/Sources/Canada_boundary/Canada_boundary.shp")
canada_provinces = gpd.read_file("data/Sources/Canada_provinces_boundary/lpr_000b21a_e.shp")

In [None]:
# MinCan dataset (cleaned CSV export)
min_can_df = pd.read_csv("data/cleaned_data/min_can_cleaned_df.csv")

In [None]:
# NRCan datasets: the mining table is read from the cleaned CSV export, while the
# manufacturing and critical-minerals tables are read from Excel workbooks under data/NRCan.
nrcan_mining_path = "data/cleaned_data/nrcan_mining_df.csv"
nrcan_manufacturing_path = "data/NRCan/Principal Mineral Areas, Producing Mines, and Oil and Gas Fields (900A)/metal_work.xlsx"
nrcan_critical_path = "data/NRCan/Critical minerals advanced projects, mines and processing facilities in Canada/Critical minerals advanced projects, mines and processing facilities.xlsx"

nrcan_mining_df = pd.read_csv(nrcan_mining_path)
nrcan_manufacturing_df = pd.read_excel(nrcan_manufacturing_path)
nrcan_critical_df = pd.read_excel(nrcan_critical_path)

In [None]:
# GHG from large facilities: one cleaned CSV per sector (mining, manufacturing)
ghg_facility_metal_mining_df = pd.read_csv("data/cleaned_data/ghg_facility_metal_mining_df.csv")
ghg_facility_metal_manufacturing_df = pd.read_csv("data/cleaned_data/ghg_facility_metal_manufacturing_df.csv")

In [None]:
# NPRI datasets: one cleaned CSV per sector (mining, manufacturing)
npri_metal_mining_df = pd.read_csv("data/cleaned_data/npri_metal_mining_df.csv")
npri_metal_manufacturing_df = pd.read_csv("data/cleaned_data/npri_metal_manufacturing_df.csv")

In [None]:
# Tailings dataset (cleaned CSV export)
tailings_df = pd.read_csv("data/cleaned_data/tailings_df.csv")

In [None]:
# Climate TRACE dataset (annual records, cleaned CSV export)
ctrace_annual = pd.read_csv("data/cleaned_data/ctrace_annual.csv")

# 1.a) Create parents table

In [None]:
from db_creation_function import populate_table_df, assign_id

In [None]:
# Target schema of the parent table: column name -> intended data type label.
# Only the keys are used here, to create an empty main_df with these columns;
# the type labels are not applied to the frame in this cell.
main_columns = {
    #"facility_id": "text",
    "name": "text",
    "reported_company": "text",
    "longitude": "float",
    "latitude": "float",
    "city": "text", # not specified in the project df 
    "province": "text",
    "status": "text", # active or projects 
    "activity_status": "text", # only for projects
    "development_stage": "text", # only for projects
    "facility_type": "text", # mining, manufacturing or projects
    "mining_processing_type": "text",
    "commodity_group": "text",
    "primary_commodity": "text",
    "commodities": "text",
    "source": "text"  
}

# Empty frame that populate_table_df fills from the source tables below.
main_df = pd.DataFrame(columns=main_columns.keys())
main_df

In [None]:
# Per-source mapping of original column headers -> main_df column names.
# The outer keys must match the keys of the source_dfs dict passed to populate_table_df.
column_mapping_main = {
    
      "nrcan_mining_df": {
        "OPERATIO_E": "name",
        "OWNER_E": "reported_company",
        "LONGITUDE": "longitude",
        "LATITUDE": "latitude",
        "CITY_E": "city",
        "PROVINCE_E": "province",
        "FAC_DESC_E": "mining_processing_type",
        "COM_GROU_E": "commodity_group", 
        "COM_DESC_E": "commodities"
        
    },
    # Same headers as the mining table, except that no city column is mapped and the
    # processing type is taken from FAC_TYPE_E instead of FAC_DESC_E.
    "nrcan_manufacturing_df": {
        "OPERATIO_E": "name",
        "OWNER_E": "reported_company",
        "LONGITUDE": "longitude",
        "LATITUDE": "latitude",
        "PROVINCE_E": "province",
        "FAC_TYPE_E": "mining_processing_type",
        "COM_GROU_E": "commodity_group",
        "COM_DESC_E": "commodities"
    }, 
    
    
    # Projects table: the only source that fills development_stage and activity_status;
    # it maps no city, processing type or commodity group.
    "nrcan_critical_df": {
        "PropertyNa": "name",
        "OperatorOw": "reported_company",
        "Longitude": "longitude",
        "Latitude": "latitude",
        "ProvincesE": "province",
        "Developmen": "development_stage",
        "ActivitySt": "activity_status",
        "Commoditie": "commodities",
    }
}

In [None]:
# Source frames, keyed by the same names used in column_mapping_main.
main_sources = {
    "nrcan_mining_df": nrcan_mining_df,
    "nrcan_manufacturing_df": nrcan_manufacturing_df,
    "nrcan_critical_df": nrcan_critical_df,
}

# Constant values set per source for columns that no source header provides.
main_constant_columns = {
    "facility_type": {
        "nrcan_mining_df": "mining",
        "nrcan_manufacturing_df": "manufacturing",
        "nrcan_critical_df": "project",
    },
    "status": {
        "nrcan_mining_df": "active_facility",
        "nrcan_manufacturing_df": "active_facility",
        "nrcan_critical_df": "project_facility",
    },
}

main_df = populate_table_df(
    column_mapping_main,
    main_df,
    dynamic_columns=main_constant_columns,
    source_dfs=main_sources,
)

In [None]:
def _first_commodity(value):
    """Return the first comma-separated entry of `value`, stripped; None for empty or non-string input."""
    if not isinstance(value, str) or not value:
        return None
    return value.split(",")[0].strip()


# Derive primary_commodity from the comma-separated commodities column.
if "commodities" in main_df.columns:
    main_df["primary_commodity"] = main_df["commodities"].apply(_first_commodity)

In [None]:
# Build the parent table with its identifier column "main_id" (prefix "MAIN").
main_gdf = assign_id(main_df, canada_provinces, id_column="main_id", prefix="MAIN")
main_gdf

In [None]:
# Write the parent table to disk in both formats (GeoJSON and CSV).
main_gdf.to_file(r'data/Tables/GeoJSON/main_gdf.geojson', driver="GeoJSON")
main_gdf.to_csv(r'data/Tables/CSV/main_gdf.csv', index=False)

# 1.b) Create children tables

In [None]:
from db_creation_function import add_year

## Children tables with coordinates

### Create tailings table

In [None]:
# Tailings table with its identifier column "tailing_id" (prefix "TAIL").
tailings_gdf = assign_id(
    tailings_df,
    canada_provinces,
    id_column="tailing_id",
    prefix="TAIL",
)
tailings_gdf

In [None]:
# add_year is a project helper; presumably it tags the table with reference year 2019 — confirm.
tailings_gdf = add_year(tailings_gdf, 2019)

In [None]:
tailings_gdf

In [None]:
# Write the tailings table to disk in both formats (GeoJSON and CSV).
tailings_gdf.to_file(r'data/Tables/GeoJSON/tailings_gdf.geojson', driver="GeoJSON")
tailings_gdf.to_csv(r'data/Tables/CSV/tailings_gdf.csv', index=False)

### Create ClimateTRACE table

In [None]:
# Keep only the 2023 records, then drop the now-constant year column.
is_2023 = ctrace_annual["year"] == 2023
ctrace_df = (
    ctrace_annual.loc[is_2023]
    .drop(columns="year")
    .reset_index(drop=True)
)

In [None]:
# Rename the coordinate columns to the names used by the other tables
# before the ID assignment step.
coordinate_renames = {"lat": "latitude", "lon": "longitude"}
ctrace_df = ctrace_df.rename(columns=coordinate_renames)

In [None]:
# Climate TRACE table with its identifier column "climate_trace_id" (prefix "CTRACE").
ctrace_gdf = assign_id(ctrace_df, canada_provinces, id_column="climate_trace_id", prefix="CTRACE")

In [None]:
# The year column was dropped when filtering to 2023; add_year is a project helper
# that presumably records that reference year on the table again — confirm.
ctrace_gdf = add_year(ctrace_gdf, 2023)

In [None]:
ctrace_gdf

In [None]:
# Write the Climate TRACE table to disk in both formats (GeoJSON and CSV).
ctrace_gdf.to_file(r'data/Tables/GeoJSON/ctrace_gdf.geojson', driver="GeoJSON")
ctrace_gdf.to_csv(r'data/Tables/CSV/ctrace_gdf.csv', index=False)

### Create MinCan table 

In [None]:
# MinCan table with its identifier column "mincan_id" (prefix "MINCAN").
min_can_gdf = assign_id(
    min_can_df,
    canada_provinces=canada_provinces,
    id_column="mincan_id",
    prefix="MINCAN",
)
min_can_gdf

In [None]:
# Write the MinCan table to disk in both formats (GeoJSON and CSV).
min_can_gdf.to_file(r'data/Tables/GeoJSON/min_can_gdf.geojson', driver="GeoJSON")
min_can_gdf.to_csv(r'data/Tables/CSV/min_can_gdf.csv', index=False)

### Create GHG table

In [None]:
# Target schema of the GHG table: column name -> intended data type label.
# Only the keys are used here, to create an empty ghg_df with these columns.
ghg_columns = {
    "facility_name_ghg": "text",
    "longitude": "float",   
    "latitude": "float",
    "city": "text",
    "province": "text",
    "year": "integer",
    "sector": "text",
    "ghge_ktCO2eq": "float", # filled from source column "Total emissions"
    "facility_url" : "url" # filled from source column "Facility information"
}

# Empty frame that populate_table_df fills from the GHG source tables.
ghg_df = pd.DataFrame(columns=ghg_columns.keys())
ghg_df

In [None]:
# Based on concordances tables.
# Both GHG extracts use the same source headers, so a single template is
# copied once per source (each source gets its own independent dict).
_ghg_source_to_target = {
    "Facility name": "facility_name_ghg",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "City": "city",
    "Province": "province",
    "Report year": "year",
    "Industry classification": "sector",
    "Total emissions": "ghge_ktCO2eq",
    "Facility information": "facility_url",
}

column_mapping_ghg = {
    source_name: dict(_ghg_source_to_target)
    for source_name in (
        "ghg_facility_metal_mining_df",
        "ghg_facility_metal_manufacturing_df",
    )
}

In [None]:
# Source frames, keyed by the same names used in column_mapping_ghg.
ghg_sources = {
    "ghg_facility_metal_mining_df": ghg_facility_metal_mining_df,
    "ghg_facility_metal_manufacturing_df": ghg_facility_metal_manufacturing_df,
}
ghg_df = populate_table_df(column_mapping_ghg, ghg_df, source_dfs=ghg_sources)

In [None]:
# GHG table with its identifier column "ghg_id".
# NOTE(review): the prefix is lowercase here, whereas the earlier tables in this
# notebook use uppercase prefixes ('MAIN', 'TAIL', 'CTRACE', 'MINCAN') — confirm
# whether this is intentional before changing it, since existing IDs may depend on it.
ghg_gdf = assign_id(
    ghg_df,
    canada_provinces,
    id_column = "ghg_id",
    prefix='ghg'
)


In [None]:
# NOTE(review): ghg_df already has a "year" column mapped from the source header
# "Report year"; confirm that add_year(..., 2022) does not overwrite those per-record years.
ghg_gdf = add_year(ghg_gdf, 2022)
ghg_gdf

In [None]:
# Write the GHG table to disk in both formats (GeoJSON and CSV).
ghg_gdf.to_file(r'data/Tables/GeoJSON/ghg_gdf.geojson', driver="GeoJSON")
ghg_gdf.to_csv(r'data/Tables/CSV/ghg_gdf.csv', index=False)

### Create pollutant table based on NPRI dataset

In [None]:
# Target schema of the pollutant table: column name -> intended data type label.
# Only the keys are used in this cell, to create an empty pollutant_df.
pollutant_columns = {
    #"id": "text",
    "facility_name_npri": "text",
    "company_name_npri": "text",
    "facility_type": "text",
    "longitude": "float",
    "latitude": "float",
    "year": "integer",
    "terrestrial_ecozone": "text",
    "watershed": "text",
    "substance_name_npri": "text",
    "substance_name_ecoinvent": "text",
    "substance_unit": "text",
    "air_emissions_stack_emissions": "float",
    "air_emissions_storage_/_handling": "float",
    "air_emissions_fugitive_emissions": "float",
    "air_emissions_spills": "float",
    "air_emissions_other": "float",
    "air_emissions_total": "float",
    "water_releases_direct_discharge": "float",
    "water_releases_spills": "float",
    "water_releases_leaks": "float",
    "water_releases_total": "float",
    # The receiving-waterbody columns hold names (English / French), not quantities.
    "water_releases_receiving_waterbody_(english)": "text",
    "water_releases_receiving_waterbody_(french)": "text",
    "land_releases_spills": "float",
    "land_releases_leaks": "float",
    "land_releases_other": "float",
    "land_releases_total": "float",
    "total_releases_total_releases_(excluding_road_dust)": "float",
    "total_releases_road_dust_emissions": "float",
    "total_releases_total_releases_including_road_dust": "float",
    "off-site_disposal_land_fill": "float",
    "off-site_disposal_land_treatment": "float",
    "off-site_disposal_underground_injection": "float",
    "off-site_disposal_tailings": "float",
    "off-site_disposal_waste_rock": "float",
    "on-site_disposal_total_on-site": "float",
    "on-site_disposal_land_fill": "float",
    "on-site_disposal_land_treatment": "float",
    "on-site_disposal_underground_injection": "float",
    "off-site_disposal_storage": "float",
    "on-site_disposal_tailings": "float",
    "on-site_disposal_waste_rock": "float",
    "off-site_disposal_total_off-site": "float",
    "transfers_for_treatment_physical_treatment": "float",
    "transfers_for_treatment_chemical_treatment": "float",
    "transfers_for_treatment_biological_treatment": "float",
    "transfers_for_treatment_incineration": "float",
    "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant": "float",
    "transfers_for_treatment_total": "float",
    "total_on/off_treatment_total": "float",
    "transfers_for_recycling_energy_recovery": "float",
    "transfers_for_recycling_solvent_recovery": "float",
    "transfers_for_recycling_organic_compound_recovery": "float",
    "transfers_for_recycling_metal_recovery": "float",
    "transfers_for_recycling_inorganic_compound_recovery": "float",
    "transfers_for_recycling_acid_or_base_recovery": "float",
    "transfers_for_recycling_catalyst_recovery": "float",
    "transfers_for_recycling_pollution_abatement_residue_recovery": "float",
    "transfers_for_recycling_used_oil_recovery": "float",
    "transfers_for_recycling_other": "float",
    "transfers_for_recycling_total": "float",
    "grand_total_total_releases,_disposals_and_transfers_for_recycling": "float",
}

# Create an empty DataFrame with the specified columns
pollutant_df = pd.DataFrame(columns=pollutant_columns.keys())

In [None]:
pollutant_df

In [None]:
# Descriptive NPRI headers -> pollutant_df column names (mining extract).
_npri_mining_header = {
    "Facility Name": "facility_name_npri",
    "Company Name": "company_name_npri",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Year": "year",
    "Terrestrial ecozone (English) ": "terrestrial_ecozone",  # the trailing space is part of the source header
    "Watershed (English)": "watershed",
    "Substance Name (English)": "substance_name_npri",
    "EI": "substance_name_ecoinvent",
    "Units": "substance_unit",
}

# The manufacturing extract maps the same headers except "Company Name".
# NOTE(review): confirm whether leaving out the company name for manufacturing is intentional.
_npri_manufacturing_header = {
    source: target
    for source, target in _npri_mining_header.items()
    if source != "Company Name"
}

# Release / disposal / transfer columns already carry their target names in
# both cleaned extracts, so each one maps onto itself.
_npri_release_columns = [
    "air_emissions_stack_emissions",
    "air_emissions_storage_/_handling",
    "air_emissions_fugitive_emissions",
    "air_emissions_spills",
    "air_emissions_other",
    "air_emissions_total",
    "water_releases_direct_discharge",
    "water_releases_spills",
    "water_releases_leaks",
    "water_releases_total",
    "water_releases_receiving_waterbody_(english)",
    "water_releases_receiving_waterbody_(french)",
    "land_releases_spills",
    "land_releases_leaks",
    "land_releases_other",
    "land_releases_total",
    "total_releases_total_releases_(excluding_road_dust)",
    "total_releases_road_dust_emissions",
    "total_releases_total_releases_including_road_dust",
    "off-site_disposal_land_fill",
    "off-site_disposal_land_treatment",
    "off-site_disposal_underground_injection",
    "off-site_disposal_tailings",
    "off-site_disposal_waste_rock",
    "on-site_disposal_total_on-site",
    "on-site_disposal_land_fill",
    "on-site_disposal_land_treatment",
    "on-site_disposal_underground_injection",
    "off-site_disposal_storage",
    "on-site_disposal_tailings",
    "on-site_disposal_waste_rock",
    "off-site_disposal_total_off-site",
    "transfers_for_treatment_physical_treatment",
    "transfers_for_treatment_chemical_treatment",
    "transfers_for_treatment_biological_treatment",
    "transfers_for_treatment_incineration",
    "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant",
    "transfers_for_treatment_total",
    "total_on/off_treatment_total",
    "transfers_for_recycling_energy_recovery",
    "transfers_for_recycling_solvent_recovery",
    "transfers_for_recycling_organic_compound_recovery",
    "transfers_for_recycling_metal_recovery",
    "transfers_for_recycling_inorganic_compound_recovery",
    "transfers_for_recycling_acid_or_base_recovery",
    "transfers_for_recycling_catalyst_recovery",
    "transfers_for_recycling_pollution_abatement_residue_recovery",
    "transfers_for_recycling_used_oil_recovery",
    "transfers_for_recycling_other",
    "transfers_for_recycling_total",
    "grand_total_total_releases,_disposals_and_transfers_for_recycling",
]
_npri_release_mapping = {column: column for column in _npri_release_columns}

# Final mapping: descriptive headers first, then the identity-mapped release columns.
column_mapping_pollutant = {
    "npri_metal_mining_df": {**_npri_mining_header, **_npri_release_mapping},
    "npri_metal_manufacturing_df": {**_npri_manufacturing_header, **_npri_release_mapping},
}


In [None]:
# Source frames, keyed by the same names used in column_mapping_pollutant.
pollutant_sources = {
    "npri_metal_mining_df": npri_metal_mining_df,
    "npri_metal_manufacturing_df": npri_metal_manufacturing_df,
}

# Constant facility_type value set per source.
pollutant_constant_columns = {
    "facility_type": {
        "npri_metal_mining_df": "mining",
        "npri_metal_manufacturing_df": "manufacturing",
    },
}

pollutant_df = populate_table_df(
    column_mapping_pollutant,
    pollutant_df,
    dynamic_columns=pollutant_constant_columns,
    source_dfs=pollutant_sources,
)

In [None]:
# Pollutant table with its identifier column "pollutant_id".
# NOTE(review): the prefix is lowercase here, whereas the earlier tables in this
# notebook use uppercase prefixes ('MAIN', 'TAIL', 'CTRACE', 'MINCAN') — confirm
# whether this is intentional before changing it, since existing IDs may depend on it.
pollutant_gdf = assign_id(
    pollutant_df,
    canada_provinces,
    id_column = "pollutant_id",
    prefix='pollutant'
)

In [None]:
# NOTE(review): pollutant_df already has a "year" column mapped from the source header
# "Year"; confirm that add_year(..., 2023) does not overwrite those per-record years.
pollutant_gdf = add_year(pollutant_gdf, 2023)
pollutant_gdf

In [None]:
# Write the pollutant table to disk in both formats (GeoJSON and CSV).
pollutant_gdf.to_file(r'data/Tables/GeoJSON/pollutant_gdf.geojson', driver="GeoJSON")
pollutant_gdf.to_csv(r'data/Tables/CSV/pollutant_gdf.csv', index=False)

## Children tables with raster data

In [None]:
import rasterio
from rasterstats import zonal_stats
from rasterio import mask

### Create climate category table

In [None]:
# Start the climate table from an independent copy of the parent table,
# reduced to its identifying and location columns.
col_to_keep = ['main_id', 'name', 'longitude', 'latitude', 'geometry']
climate_category_gdf = main_gdf.copy()[col_to_keep]

In [None]:
def load_raster(period, scenario, resolution):
    """Open the Köppen-Geiger GeoTIFF for a period, scenario and resolution.

    Parameters:
    - period: name of the period sub-folder (e.g. "1991_2020").
    - scenario: name of the scenario sub-folder; a falsy value selects the
      layout without a scenario level (historical data).
    - resolution: resolution suffix used in the file name.

    Returns:
    - The dataset returned by rasterio.open; the caller is responsible for closing it.

    Raises:
    - FileNotFoundError: if the expected file does not exist.
    """
    root = "data/Sources/Climate_category/koppen_geiger_tif"
    # Future rasters sit one level deeper, below the scenario folder.
    folder = f"{period}/{scenario}" if scenario else f"{period}"
    path = Path(f"{root}/{folder}/koppen_geiger_{resolution}.tif")

    if path.exists():
        return rasterio.open(path)
    raise FileNotFoundError(f"Raster not found: {path}")


def extract_climate_for_facilities(facilities_gdf, periods, scenarios, resolution):
    """
    Extract Köppen-Geiger climate classes for point facilities across periods and scenarios.

    Parameters:
    - facilities_gdf: GeoDataFrame with "main_id", "name" and "geometry" columns; the
      active geometry must consist of points. Coordinates are sampled as-is, so they
      are assumed to be in the rasters' coordinate reference system — TODO confirm.
    - periods: iterable of period labels such as "1991_2020". A label containing "_"
      whose first part is a year after 2020 is treated as a future period.
    - scenarios: scenario names applied to future periods only; other periods are
      read once, without a scenario.
    - resolution: resolution suffix forwarded to load_raster.

    Returns:
    - GeoDataFrame (same CRS as facilities_gdf) with one row per facility, period and
      scenario, and the columns: main_id, name, geometry, year (the period label),
      scenario ("historical" when no scenario applies) and category (the raw raster
      value, or None when the sampled value is 0 or nothing was sampled).

    Raises:
    - FileNotFoundError: propagated from load_raster when a raster file is missing.
    """
    # Facility attributes do not depend on the raster: extract them once.
    coords = [(geom.x, geom.y) for geom in facilities_gdf.geometry]
    main_ids = list(facilities_gdf["main_id"])
    names = list(facilities_gdf["name"])
    geometries = list(facilities_gdf["geometry"])

    results = []

    for period in periods:
        is_future = "_" in period and int(period.split("_")[0]) > 2020
        applicable_scenarios = scenarios if is_future else [None]

        for scenario in applicable_scenarios:
            # Close every raster as soon as it has been sampled instead of
            # leaving one open file handle per period/scenario.
            with load_raster(period, scenario, resolution) as raster:
                samples = list(raster.sample(coords))

            scenario_label = scenario if scenario else "historical"

            for main_id, name, geometry, sample in zip(main_ids, names, geometries, samples):
                # A sampled value of 0 is treated as missing (presumably the raster's
                # no-data code — confirm); an empty sample is treated the same way.
                has_value = len(sample) > 0 and sample[0] != 0
                results.append({
                    "main_id": main_id,
                    "name": name,
                    "geometry": geometry,
                    "year": period,
                    "scenario": scenario_label,
                    "category": sample[0] if has_value else None
                })

    return gpd.GeoDataFrame(results, geometry="geometry", crs=facilities_gdf.crs)

In [None]:
# Period folders to sample; the first one starts before 2021 and is therefore read
# without a scenario, the other two are read once per scenario.
periods = ["1991_2020", "2041_2070", "2071_2099"]
scenarios = ["ssp126", "ssp245", "ssp370", "ssp585"]
resolution = "0p00833333" #0.00833333°, ~1 km

# Extract climate categories
# NOTE(review): the result is bound to the same name as the input, so re-running this
# cell without re-running the previous one feeds the already expanded table back in.
climate_category_gdf = extract_climate_for_facilities(climate_category_gdf, periods, scenarios, resolution)

In [None]:
climate_category_gdf

In [None]:
from constants import koppen_dict

In [None]:
# Translate the numeric raster category into its descriptive class name,
# then drop the numeric column.
category_names = climate_category_gdf["category"].map(koppen_dict)
climate_category_gdf = (
    climate_category_gdf
    .assign(category_name=category_names)
    .drop(columns=["category"])
)

In [None]:
climate_category_gdf

In [None]:
# Write the climate category table to disk in both formats (GeoJSON and CSV).
climate_category_gdf.to_file(r'data/Tables/GeoJSON/climate_category_gdf.geojson', driver="GeoJSON")
climate_category_gdf.to_csv(r'data/Tables/CSV/climate_category_gdf.csv', index=False)

### Create peatland table

In [None]:
# Forward slashes keep the path valid on every operating system and avoid
# backslash sequences (such as "\S" or "\G") being parsed as string escapes.
peatland_tiff = "data/Sources/Global_Peatland_DB_2022/GLOpeat_GPA22WGS_2cl_1x1km/peatGPA22WGS_2cl.tif"
# The dataset stays open because the extraction cell below reads from it.
raster_peatland = rasterio.open(peatland_tiff)

In [None]:
# Start the peatland table from an independent copy of the parent table,
# reduced to its identifying and location columns.
col_to_keep = ['main_id', 'name', 'longitude', 'latitude', 'geometry']
peatland_gdf = main_gdf.copy()[col_to_keep]

In [None]:
def extract_peatland_presence(geometry, raster):
    """
    Read the value of the first raster band at the location of a point.

    Parameters:
    - geometry: point-like object exposing `x` (longitude) and `y` (latitude).
    - raster: opened rasterio dataset (peatland raster).

    Returns:
    - The band-1 value of the pixel containing the point, or None when the point
      lies outside the raster extent or the pixel cannot be read.
    """
    lon, lat = geometry.x, geometry.y  # Get coordinates from geometry

    try:
        # Get row and column index of the pixel
        row, col = raster.index(lon, lat)

        # Points outside the extent produce negative or too-large indices; a negative
        # index would otherwise wrap around silently and return an unrelated pixel.
        if not (0 <= row < raster.height and 0 <= col < raster.width):
            return None

        # Read only the single pixel needed instead of loading the whole band per point.
        pixel = raster.read(1, window=((row, row + 1), (col, col + 1)))
        peatland_value = pixel[0, 0]
    except (IndexError, ValueError):
        peatland_value = None  # Point is outside raster bounds or error reading value

    return peatland_value


In [None]:
# Extract peatland values for each point
peatland_gdf["peatland_presence"] = peatland_gdf["geometry"].apply(
    lambda geom: extract_peatland_presence(geom, raster_peatland)
)

In [None]:
from constants import peatland_dict

In [None]:
peatland_gdf["peatland_presence"] = peatland_gdf["peatland_presence"].map(peatland_dict)

In [None]:
peatland_gdf['peatland_presence'].value_counts()

In [None]:
# Write the peatland table to disk in both formats (CSV and GeoJSON).
peatland_gdf.to_csv(r'data/Tables/CSV/peatland_gdf.csv', index=False)
peatland_gdf.to_file(r'data/Tables/GeoJSON/peatland_gdf.geojson', driver="GeoJSON")

## Children tables with polygons

### Create land use table based on Tang & Werner (2023)

#### Import datasets

In [None]:
# Mining polygon datasets: Tang & Werner (shapefile) and Maus (GeoPackage),
# plus the validation points distributed with the Maus polygons.
tang_gdf = gpd.read_file("data/Sources/Tang_Werner_2023/7894216/74548_projected polygons.shp")
maus_polygons_gdf = gpd.read_file("data/Sources/Maus_2022/global_mining_polygons_v2.gpkg")
maus_validation_points_gdf = gpd.read_file("data/Sources/Maus_2022/validation_points_v2.gpkg")

#### Filter dataset for Canada

Maus et al. directly provide an ISO3 country code, so filtering their polygons for Canada is straightforward.
Tang & Werner do not provide one, so we clip their polygons with the Canada boundary shapefile instead.

In [None]:
# Keep only the polygons whose ISO3 country code is Canada and renumber the rows.
is_canada = maus_polygons_gdf["ISO3_CODE"] == "CAN"
maus_canada_gdf = maus_polygons_gdf.loc[is_canada].reset_index(drop=True)

In [None]:
# Detect invalid geometries in the Tang & Werner dataset and, if any exist,
# rebuild every geometry with a zero-width buffer.
invalid_geoms = tang_gdf.loc[~tang_gdf.is_valid]
if len(invalid_geoms) > 0:
    print(f"⚠️ Warning: {len(invalid_geoms)} invalid geometries found. Fixing them...")
    tang_gdf["geometry"] = tang_gdf.buffer(0)
else:
    print("✅ All geometries are valid.")

In [None]:
# Bring both layers to EPSG:4326 (only reprojecting when needed) so that the
# clip below operates in a single CRS.
wgs84 = "EPSG:4326"
tang_gdf = tang_gdf if tang_gdf.crs == wgs84 else tang_gdf.to_crs(epsg=4326)
canada_boundary = canada_boundary if canada_boundary.crs == wgs84 else canada_boundary.to_crs(epsg=4326)

# Clip the global dataset to Canada and renumber the rows.
tang_canada_gdf = gpd.clip(tang_gdf, canada_boundary).reset_index(drop=True)

In [None]:
# Tang & Werner polygons with their identifier column "tang_id" (prefix "TANG").
tang_canada_gdf = assign_id(tang_canada_gdf, canada_provinces, id_column="tang_id", prefix="TANG")

In [None]:
# Maus polygons with their identifier column "maus_id" (prefix "MAUS").
maus_canada_gdf = assign_id(maus_canada_gdf, canada_provinces, id_column="maus_id", prefix="MAUS")

In [None]:
# Remove attribute columns that are not kept in the final polygon tables.
tang_unused_columns = ['OBJECTID', 'Name', 'Shape_Le_1', 'Shape_Area', 'longitude', 'latitude']
maus_unused_columns = ['ISO3_CODE', 'COUNTRY_NAME', 'AREA', 'longitude', 'latitude']

tang_canada_gdf = tang_canada_gdf.drop(columns=tang_unused_columns)
maus_canada_gdf = maus_canada_gdf.drop(columns=maus_unused_columns)

In [None]:
from db_creation_function import add_surface_area_polygons

In [None]:
tang_canada_gdf = add_surface_area_polygons(tang_canada_gdf)
maus_canada_gdf = add_surface_area_polygons(maus_canada_gdf)

In [None]:
# Write both polygon tables to disk as GeoJSON (no CSV copy is written for them).
tang_canada_gdf.to_file("data/Tables/GeoJSON/tang_canada_gdf.geojson", driver="GeoJSON")
maus_canada_gdf.to_file("data/Tables/GeoJSON/maus_canada_gdf.geojson", driver="GeoJSON")

#### A bit of inspection of the polygons

In [None]:
from db_creation_function import analyze_and_compare_polygon_areas

In [None]:
# Run the project comparison helper on the two polygon layers, labelled by source
polygons_areas = analyze_and_compare_polygon_areas(
    maus_canada_gdf,
    tang_canada_gdf,
    'Maus et al (2022)',
    'Tang & Werner et al (2023)',
    save_path=r'results/maus_tang_comparison_polygons.png',
)
polygons_areas


### Create indigenous table

In [None]:
# Load the community-level indigenous lands polygons (global shapefile) ...
indigenous_gdf = gpd.read_file(r'data/Sources/Indigenous_lands/Indigenous_comm_lands_v202411/Indigenous_comm_lands_v202411/CommunityLevelData_poly_202411.shp')
# ... and keep only the rows whose 'Country' attribute is exactly 'Canada'
indigenous_can_gdf = indigenous_gdf[indigenous_gdf['Country'] == 'Canada']
# Last expression of the cell: rendered as the cell output
indigenous_can_gdf

In [None]:
col_to_keep = ['Name', 'Category', 'Data_Src', 'Data_Date', 'geometry']
# .copy() detaches the selection from the filtered frame it was sliced from, so the
# in-place geometry assignments in the following cells do not raise SettingWithCopyWarning.
indigenous_can_gdf = indigenous_can_gdf[col_to_keep].copy()

In [None]:
# Simplify geometries while keeping essential shape
tolerance = 100  # 100 meters simplification tolerance

# simplify() interprets the tolerance in the units of the layer's CRS. On a
# geographic CRS (degrees) a value of 100 would flatten every shape, so such a
# layer is moved to a metric CRS (EPSG:3978) for the simplification and then
# returned to its original CRS. Layers already in a projected CRS, or without
# a CRS, are simplified as they are.
source_crs = indigenous_can_gdf.crs
needs_metric_crs = source_crs is not None and source_crs.is_geographic

simplified_gdf = indigenous_can_gdf.to_crs("EPSG:3978") if needs_metric_crs else indigenous_can_gdf.copy()
# Vectorised call: unlike a row-wise lambda it tolerates missing geometries
simplified_gdf["geometry"] = simplified_gdf.geometry.simplify(tolerance, preserve_topology=True)
indigenous_can_gdf = simplified_gdf.to_crs(source_crs) if needs_metric_crs else simplified_gdf

In [None]:
# Fix invalid geometries after simplification
# buffer(0) is a zero-distance buffer; it is used here as a repair step that rebuilds each polygon
indigenous_can_gdf["geometry"] = indigenous_can_gdf["geometry"].buffer(0)

In [None]:
# Project helper: fills the "indigenous_land_id" column using the 'IND' prefix.
# This column is the key used later for point-to-polygon matching.
indigenous_can_gdf = assign_id(
    indigenous_can_gdf,
    canada_provinces,
    id_column = "indigenous_land_id",
    prefix='IND'
)

In [None]:
indigenous_can_gdf

In [None]:
indigenous_can_gdf.to_file("data/Tables/GeoJSON/indigenous_can_gdf.geojson", driver="GeoJSON")

### Create protected areas table

In [None]:
# The WDPA download is split into three folders (_0, _1, _2), each holding one
# polygon shapefile and one point shapefile. Paths are built with pathlib so the
# cell runs on any OS (backslash separators only resolve on Windows).
wdpa_stem = "WDPA_WDOECM_Mar2025_Public_CAN_shp"
wdpa_dir = Path("data/Sources/Protected_areas") / wdpa_stem

# Polygons
wpda_gdf_polygons_1 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_0" / f"{wdpa_stem}-polygons.shp")
wpda_gdf_polygons_2 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_1" / f"{wdpa_stem}-polygons.shp")
wpda_gdf_polygons_3 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_2" / f"{wdpa_stem}-polygons.shp")

# Points
wpda_gdf_points_1 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_0" / f"{wdpa_stem}-points.shp")
wpda_gdf_points_2 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_1" / f"{wdpa_stem}-points.shp")
wpda_gdf_points_3 = gpd.read_file(wdpa_dir / f"{wdpa_stem}_2" / f"{wdpa_stem}-points.shp")

In [None]:
# Stack the three parts of each geometry kind into one table with a fresh 0..n-1 index
polygon_parts = [wpda_gdf_polygons_1, wpda_gdf_polygons_2, wpda_gdf_polygons_3]
point_parts = [wpda_gdf_points_1, wpda_gdf_points_2, wpda_gdf_points_3]

wpda_gdf_polygons = gpd.GeoDataFrame(pd.concat(polygon_parts, ignore_index=True))

wpda_gdf_points = gpd.GeoDataFrame(pd.concat(point_parts, ignore_index=True))

In [None]:
wpda_gdf_polygons

In [None]:
wpda_gdf_points

In [None]:
# Convert MULTIPOINT geometries to POINT
def multipoint_to_point(geom):
    """Return the first member of a MultiPoint; pass any other geometry through.

    Parameters:
    - geom: A geometry exposing ``geom_type`` (and ``geoms`` for MultiPoints), or None.

    Returns:
    - The first point of a non-empty MultiPoint. Every other input — including
      None and an empty MultiPoint — is returned unchanged.

    Note: the remaining points of a MultiPoint are discarded.
    """
    if geom is None:
        # Missing geometries have no geom_type to inspect
        return geom
    if geom.geom_type == "MultiPoint":
        members = geom.geoms
        # An empty MultiPoint has no first member to extract
        return members[0] if len(members) > 0 else geom
    return geom  # Return as-is if it's already a Point

In [None]:
wpda_gdf_points["geometry"] = wpda_gdf_points["geometry"].apply(multipoint_to_point)

In [None]:
col_to_keep = ['WDPA_PID', 'NAME', 'DESIG', 'OWN_TYPE', 'MANG_AUTH', 'STATUS_YR', 'geometry']
# .copy() gives each table its own data, so the in-place geometry assignments
# in the following cells do not raise SettingWithCopyWarning.
wpda_gdf_polygons = wpda_gdf_polygons[col_to_keep].copy()
wpda_gdf_points = wpda_gdf_points[col_to_keep].copy()

In [None]:
# Simplify geometries while keeping essential shape
tolerance = 100  # 100 meters simplification tolerance

# simplify() interprets the tolerance in the units of the layer's CRS. On a
# geographic CRS (degrees) a value of 100 would flatten every shape, so such a
# layer is moved to a metric CRS (EPSG:3978) for the simplification and then
# returned to its original CRS. Layers already in a projected CRS, or without
# a CRS, are simplified as they are.
source_crs = wpda_gdf_polygons.crs
needs_metric_crs = source_crs is not None and source_crs.is_geographic

simplified_gdf = wpda_gdf_polygons.to_crs("EPSG:3978") if needs_metric_crs else wpda_gdf_polygons.copy()
# Vectorised call: unlike a row-wise lambda it tolerates missing geometries
simplified_gdf["geometry"] = simplified_gdf.geometry.simplify(tolerance, preserve_topology=True)
wpda_gdf_polygons = simplified_gdf.to_crs(source_crs) if needs_metric_crs else simplified_gdf

In [None]:
# Fix invalid geometries after simplification
# buffer(0) is a zero-distance buffer; it is used here as a repair step that rebuilds each polygon
wpda_gdf_polygons["geometry"] = wpda_gdf_polygons["geometry"].buffer(0)

In [None]:
# Project helper: fills the "wpda_id" column of the polygon table using the 'WPDA' prefix
wpda_gdf_polygons = assign_id(
    wpda_gdf_polygons,
    canada_provinces,
    id_column = "wpda_id",
    prefix='WPDA'
)

In [None]:
# Project helper: fills the "wpda_id" column of the point table using the 'WPDA' prefix.
# NOTE(review): the polygon table is given the same id_column and prefix in a separate
# call — confirm assign_id cannot hand the same wpda_id to both a polygon and a point.
wpda_gdf_points = assign_id(
    wpda_gdf_points,
    canada_provinces,
    id_column = "wpda_id",
    prefix='WPDA'
)

In [None]:
wpda_gdf_polygons

In [None]:
# Write the protected-area polygon and point tables to GeoJSON under data/Tables/GeoJSON/
wpda_gdf_polygons.to_file("data/Tables/GeoJSON/wpda_gdf_polygons.geojson", driver="GeoJSON")
wpda_gdf_points.to_file("data/Tables/GeoJSON/wpda_gdf_points.geojson", driver="GeoJSON")

## Children tables from GEE

### Create population table

In [None]:
from gee_extraction import extract_ghsl_population

In [None]:
# Independent copy of main_gdf restricted to the identifying and location columns
col_to_keep = ['main_id', 'name', 'longitude', 'latitude', 'geometry']
population_gdf = main_gdf[col_to_keep].copy()

In [None]:
population_gdf = extract_ghsl_population(population_gdf)

In [None]:
population_gdf

In [None]:
# Persist the population table twice: GeoJSON (with geometry) and a flat CSV without the index
population_gdf.to_file(r'data/Tables/GeoJSON/population_gdf.geojson', driver="GeoJSON")
population_gdf.to_csv(r'data/Tables/CSV/population_gdf.csv', index=False)

### Create Natural Potential Vegetation table

In [None]:
from gee_extraction import extract_npv

In [None]:
npv_gdf = extract_npv(main_gdf)

In [None]:
npv_gdf

In [None]:
from constants import biome_dict

In [None]:
# Translate 'biome_type' through biome_dict. A plain .map() is not re-runnable:
# on a second execution the already-translated values are no longer keys of the
# dict and the whole column becomes NaN. Values that are already one of the
# dict's outputs are therefore kept as they are; anything else is mapped
# exactly as before (unknown inputs still become NaN).
biome_raw = npv_gdf['biome_type']
biome_already_mapped = biome_raw.isin(list(biome_dict.values()))
npv_gdf['biome_type'] = biome_raw.map(biome_dict).where(~biome_already_mapped, biome_raw)
npv_gdf

In [None]:
npv_gdf.to_file("data/Tables/GeoJSON/npv_gdf.geojson", driver="GeoJSON")

### Create land cover table

In [None]:
from gee_extraction import extract_land_cover_type

In [None]:
land_cover_gdf = extract_land_cover_type(main_gdf)

In [None]:
land_cover_gdf

In [None]:
# Persist the land cover table twice: GeoJSON (with geometry) and a flat CSV without the index
land_cover_gdf.to_file("data/Tables/GeoJSON/land_cover_gdf.geojson", driver="GeoJSON")
land_cover_gdf.to_csv("data/Tables/CSV/land_cover_gdf.csv", index=False)

### Create Water risk table

In [None]:
from gee_extraction import extract_aqueduct

In [None]:
# Independent copy of main_gdf restricted to the identifying and location columns
col_to_keep = ['main_id', 'name', 'longitude', 'latitude', 'geometry']
water_risk_gdf = main_gdf[col_to_keep].copy()

In [None]:
water_risk_gdf = extract_aqueduct(water_risk_gdf)

In [None]:
from constants import water_risk_dict

In [None]:
# Translate 'indicator' through water_risk_dict. A plain .map() is not re-runnable:
# on a second execution the already-translated values are no longer keys of the
# dict and the whole column becomes NaN. Values that are already one of the
# dict's outputs are therefore kept as they are; anything else is mapped
# exactly as before (unknown inputs still become NaN).
indicator_raw = water_risk_gdf["indicator"]
indicator_already_mapped = indicator_raw.isin(list(water_risk_dict.values()))
water_risk_gdf["indicator"] = indicator_raw.map(water_risk_dict).where(~indicator_already_mapped, indicator_raw)

In [None]:
water_risk_gdf

In [None]:
# Persist the water risk table twice: GeoJSON (with geometry) and a flat CSV without the index
water_risk_gdf.to_file("data/Tables/GeoJSON/water_risk_gdf.geojson", driver="GeoJSON")
water_risk_gdf.to_csv("data/Tables/CSV/water_risk_gdf.csv", index=False)

# 2) Matching

In [None]:
from matching_functions import match_facilities
from matching_functions import match_facility_to_polygons_with_buffer
from matching_functions import cluster_sites_and_polygons

## Point to point matching

In [None]:
# Main facilities vs. tailings: IDs 'main_id'/'tailing_id', names 'name'/'mine'
matching_main_tailings = match_facilities(
    main_gdf, tailings_gdf,
    id_col1='main_id', id_col2='tailing_id',
    name_col1='name', name_col2='mine',
)

In [None]:
# Main facilities vs. Climate Trace: IDs 'main_id'/'climate_trace_id', names 'name'/'source_name'
matching_main_ctrace = match_facilities(
    main_gdf, ctrace_gdf,
    id_col1='main_id', id_col2='climate_trace_id',
    name_col1='name', name_col2='source_name',
)

In [None]:
# Main facilities vs. GHG facilities: IDs 'main_id'/'ghg_id', names 'name'/'facility_name_ghg'
matching_main_ghg = match_facilities(
    main_gdf, ghg_gdf,
    id_col1='main_id', id_col2='ghg_id',
    name_col1='name', name_col2='facility_name_ghg',
)

In [None]:
# Main facilities vs. NPRI pollutant records: IDs 'main_id'/'pollutant_id', names 'name'/'facility_name_npri'
matching_main_pollution = match_facilities(
    main_gdf, pollutant_gdf,
    id_col1='main_id', id_col2='pollutant_id',
    name_col1='name', name_col2='facility_name_npri',
)

In [None]:
# Main facilities vs. MinCan: IDs 'main_id'/'mincan_id', names 'name'/'namemine'
matching_main_mincan = match_facilities(
    main_gdf, min_can_gdf,
    id_col1='main_id', id_col2='mincan_id',
    name_col1='name', name_col2='namemine',
)

In [None]:
# A notebook cell only renders its LAST bare expression, so the first four
# tables were silently skipped. display() renders every one of them.
display(
    matching_main_tailings,
    matching_main_ctrace,
    matching_main_ghg,
    matching_main_pollution,
    matching_main_mincan,
)

## Point to polygon matching 

### Buffer approach

In [None]:
matching_main_tang = match_facility_to_polygons_with_buffer(main_gdf, tang_canada_gdf, 'main_id', 'tang_id')


In [None]:
# The protected-area polygon table built above is named wpda_gdf_polygons
# (it carries the 'wpda_id' column); 'wpda_polygons_gdf' is not defined in this notebook.
matching_main_wpda = match_facility_to_polygons_with_buffer(main_gdf, wpda_gdf_polygons, 'main_id', 'wpda_id')


In [None]:
# Use the Canadian table: it is the one that received 'indigenous_land_id' from assign_id.
# indigenous_gdf is the raw global layer and has no such column.
matching_main_indigenous = match_facility_to_polygons_with_buffer(main_gdf, indigenous_can_gdf, 'main_id', 'indigenous_land_id')


In [None]:
# Write the point-to-polygon match tables to CSV (index column omitted).
# NOTE(review): matching_main_tang is computed in this section but is not written here —
# confirm whether it should be exported as well.
matching_main_wpda.to_csv(r'data/Tables/Matching/Points_to_polygons/matching_main_wpda_polygons.csv', index=False)
matching_main_indigenous.to_csv(r'data/Tables/Matching/Points_to_polygons/matching_main_indigenous.csv', index=False)

In [None]:
def plot_polygons_per_facility(df, facility_id_col="main_id", polygon_id_col="wpda_id"):
    """
    Plots a histogram showing the distribution of the number of polygons per facility.

    Parameters:
    - df (DataFrame): Match table with one row per (facility, polygon) pair.
    - facility_id_col (str): Column identifying the facility (default: "main_id").
    - polygon_id_col (str): Column identifying the matched polygon (default: "wpda_id").

    Returns:
    - None. The figure is shown with plt.show(); nothing is drawn for an empty table.
    """
    # Distinct polygons matched to each facility
    polygons_per_facility = df.groupby(facility_id_col)[polygon_id_col].nunique()

    if polygons_per_facility.empty:
        # max() of an empty Series is NaN, which cannot be used to build the bins
        print("No facility/polygon matches to plot.")
        return

    # One bin per integer count, from 0 up to and including the maximum
    bins = range(0, int(polygons_per_facility.max()) + 2)

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(polygons_per_facility, bins=bins, kde=True, discrete=True, ax=ax)
    ax.set_xlabel("Number of Polygons per Facility")
    ax.set_ylabel("Count of Facilities")
    ax.set_title("Dispersion of Polygons per Facility (Properly Aligned Bars)")
    ax.set_xticks(list(bins))  # Ensure each integer is labeled
    plt.show()

In [None]:
plot_polygons_per_facility(matching_main_wpda)

### Clustering approach

In [None]:
# Fix the geometries, some are Point (nan, nan)
# Done in two steps on purpose: `&` does not short-circuit, so a row-wise
# coordinate check placed in the same mask would also run on null or non-Point
# geometries and fail on them.

# Step 1: keep rows that hold a real, non-empty Point
tailings_geom = tailings_gdf.geometry
tailings_is_point = (
    tailings_geom.notna()
    & ~tailings_geom.is_empty
    & (tailings_geom.geom_type == "Point")
)
tailings_gdf = tailings_gdf[tailings_is_point].copy()

# Step 2: every remaining geometry is a Point, so .x / .y are safe to read
tailings_has_coords = tailings_gdf.geometry.x.notna() & tailings_gdf.geometry.y.notna()
tailings_gdf = tailings_gdf[tailings_has_coords].copy()


In [None]:
def cluster_sites_and_polygons(
    facility_gdf, polygon_gdf, tailing_gdf,
    facility_id_col="main_id",
    polygon_id_col="tang_id",
    tailing_id_col="tailing_id",
    eps_km=10, min_samples=2, crs="EPSG:3978", boundary_step=5):
    """
    Cluster facilities, tailings, and polygons spatially using DBSCAN, and assign shared cluster IDs.

    Facilities and tailings enter the clustering as points; each polygon enters
    as a sample of its exterior-ring vertices. Each input row receives the
    'cluster_id' of its point(s); DBSCAN labels unclustered points -1.
    'check_manually' is True when a real cluster (not the -1 noise group)
    contains more than one facility or more than one tailing.

    Note: in this notebook the definition shadows the function of the same
    name imported from matching_functions.

    Parameters:
    - facility_gdf (GeoDataFrame): Point data of facilities with a unique ID column.
    - polygon_gdf (GeoDataFrame): Polygon data to match, with a unique ID column.
    - tailing_gdf (GeoDataFrame): Point data of tailings, also with unique ID column.
    - facility_id_col (str): Column name for unique facility ID.
    - polygon_id_col (str): Column name for unique polygon ID.
    - tailing_id_col (str): Column name for unique tailing ID.
    - eps_km (float): Max distance (in km) to cluster together (DBSCAN's `eps`).
    - min_samples (int): Minimum number of samples to form a cluster (DBSCAN).
    - crs (str): Projected CRS used for distances; its units must be metres
      (default: EPSG:3978 = NAD83 / Canada Atlas Lambert).
    - boundary_step (int): Sampling step for polygon boundary points (1 = no simplification).

    Returns:
    - facility_gdf (GeoDataFrame): Reprojected to `crs`, with added 'cluster_id'
      and 'check_manually' columns (NaN when the row had no usable geometry).
    - polygon_gdf (GeoDataFrame): Reprojected, multi-part geometries exploded into
      single parts, same two columns added. A part whose boundary samples fall
      into several clusters appears once per distinct cluster.
    - tailing_gdf (GeoDataFrame): Same as facility_gdf.
    """

    import pandas as pd
    import numpy as np
    import geopandas as gpd
    from shapely.geometry import Point, Polygon
    from sklearn.cluster import DBSCAN

    noise_label = -1  # label DBSCAN gives to points that belong to no cluster
    result_cols = ["cluster_id", "check_manually"]

    # Ensure consistent CRS. The index is reset after explode() because the
    # exploded parts otherwise share their parent's index label.
    facility_gdf = facility_gdf.to_crs(crs)
    tailing_gdf = tailing_gdf.to_crs(crs)
    polygon_gdf = polygon_gdf.to_crs(crs).explode(index_parts=False).reset_index(drop=True)

    # Clean Z-dimension from polygons (holes are kept, only the Z value is dropped)
    def ring_2d(ring):
        return [(c[0], c[1]) for c in ring.coords]

    def to_2d(geom):
        if isinstance(geom, Polygon) and not geom.is_empty:
            return Polygon(ring_2d(geom.exterior), [ring_2d(r) for r in geom.interiors])
        return geom

    polygon_gdf["geometry"] = polygon_gdf["geometry"].apply(to_2d)

    # Extract polygon boundary points: every `boundary_step`-th exterior vertex.
    # Rows that are not (non-empty) polygons have no exterior ring and are skipped.
    poly_ids, poly_pts = [], []
    for pid, geom in zip(polygon_gdf[polygon_id_col], polygon_gdf["geometry"]):
        if not isinstance(geom, Polygon) or geom.is_empty:
            continue
        for xy in list(geom.exterior.coords)[::boundary_step]:
            poly_ids.append(pid)
            poly_pts.append(Point(xy))

    poly_gdf = gpd.GeoDataFrame({polygon_id_col: poly_ids}, geometry=poly_pts, crs=crs)
    poly_gdf[facility_id_col] = None
    poly_gdf[tailing_id_col] = None

    # Prepare facility and tailing points
    facility_tmp = facility_gdf[[facility_id_col, "geometry"]].copy()
    facility_tmp[polygon_id_col] = None
    facility_tmp[tailing_id_col] = None

    tailing_tmp = tailing_gdf[[tailing_id_col, "geometry"]].copy()
    tailing_tmp[polygon_id_col] = None
    tailing_tmp[facility_id_col] = None

    # Combine all points for clustering; only real Points with finite
    # coordinates can be handed to DBSCAN.
    all_points = pd.concat([facility_tmp, tailing_tmp, poly_gdf], ignore_index=True)
    geoms = all_points.geometry
    usable = geoms.notna() & ~geoms.is_empty & (geoms.geom_type == "Point")
    all_points = all_points[usable].copy()

    coords = np.column_stack([all_points.geometry.x, all_points.geometry.y])
    finite = np.isfinite(coords).all(axis=1)
    all_points = all_points[finite].copy()
    coords = coords[finite]

    # Run clustering (eps is converted from km to the CRS unit, metres)
    if len(coords):
        labels = DBSCAN(eps=eps_km * 1000, min_samples=min_samples).fit(coords).labels_
    else:
        labels = np.empty(0, dtype=int)
    all_points["cluster_id"] = labels

    # Cluster statistics: count() returns the number of non-null IDs per cluster
    cluster_stats = (
        all_points.groupby("cluster_id")[[facility_id_col, tailing_id_col]]
        .count()
        .rename(columns={
            facility_id_col: "n_facilities",
            tailing_id_col: "n_tailings"
        })
        .reset_index()
    )

    # Noise points are by definition not grouped with anything, so the -1 group
    # is never flagged even though it usually holds many facilities.
    crowded = (cluster_stats["n_facilities"] > 1) | (cluster_stats["n_tailings"] > 1)
    cluster_stats["check_manually"] = crowded & (cluster_stats["cluster_id"] != noise_label)

    # Merge stats back
    all_points = all_points.merge(cluster_stats[result_cols], on="cluster_id", how="left")

    # Assign back to original GeoDataFrames
    def assign_cluster(df, id_col):
        # A polygon contributes many boundary points; without drop_duplicates()
        # the merge would repeat its row once per sampled point.
        cluster_info = (
            all_points[[id_col] + result_cols]
            .dropna(subset=[id_col])
            .drop_duplicates()
        )
        # Discard results of a previous run so the merge does not create _x/_y columns
        df = df.drop(columns=result_cols, errors="ignore")
        return df.merge(cluster_info, on=id_col, how="left")

    facility_gdf = assign_cluster(facility_gdf, facility_id_col)
    tailing_gdf = assign_cluster(tailing_gdf, tailing_id_col)
    polygon_gdf = assign_cluster(polygon_gdf, polygon_id_col)

    return facility_gdf, polygon_gdf, tailing_gdf

In [None]:
facility_test, polygon_test, tailing_test = cluster_sites_and_polygons(main_gdf, tang_canada_gdf, tailings_gdf)

In [None]:
facility_test

In [None]:
tailing_test

# 3) Exports 

## SQL format

In [None]:
import sqlite3

In [None]:
# Connect to SQLite database
conn = sqlite3.connect("database.sqlite")

# try/finally guarantees the connection is closed even if one export fails
try:
    # Export the main and pollutant tables (geometry dropped: it is not a SQL-storable column)
    main_gdf.drop(columns="geometry").to_sql("main", conn, if_exists="replace", index=False)
    pollutant_gdf.drop(columns="geometry").to_sql("pollution", conn, if_exists="replace", index=False)

    # Export match_table to SQLite
    matching_main_pollution.to_sql("match_table", conn, if_exists="replace", index=False)
finally:
    # Close connection
    conn.close()

## Maps

In [None]:
# Works, but the output is around 69 MB and lags a lot

In [None]:
import geopandas as gpd
import folium
import leafmap.foliumap as leafmap
import os


def create_map(
    facility_gdf,
    polygon_gdf,
    tailing_gdf,
    output_html="leafmap_clusters.html",
    screenshot_png="canada_map_screenshot.png",
    simplify_tolerance=0.0005,
    center=(61.0667, -107.9917),  # True Canada center
    zoom=4,
    take_screenshot=False
):
    """
    Creates a lightweight interactive HTML map with facilities, polygons, tailings.
    - Reprojects to EPSG:4326 (required for web mapping)
    - Simplifies polygons for performance
    - Adds Esri Satellite basemap
    - Optionally exports a PNG screenshot (centered over Canada)

    Parameters:
    - facility_gdf: Point layer drawn as blue markers; popup shows 'main_id' when present
    - polygon_gdf: Polygon layer drawn in yellow; tooltip shows its first two attribute columns
    - tailing_gdf: Point layer drawn as red markers; popup shows 'tailing_id' when present
    - output_html: Path to saved HTML map
    - screenshot_png: Path of the PNG written when take_screenshot is True (requires selenium)
    - simplify_tolerance: Polygon simplification tolerance, in degrees (applied after reprojection)
    - center: (lat, lon) the map is centred on
    - zoom: Initial zoom level
    - take_screenshot: Set to False to skip PNG export

    Returns:
    - The leafmap Map object.

    Raises:
    - ValueError: if one of the layers has no CRS (it cannot be reprojected).
    """

    def clean_gdf(gdf, simplify=False):
        gdf = gdf.copy()
        if gdf.crs is None:
            # to_crs() cannot transform a layer whose CRS is unknown
            raise ValueError("create_map needs layers with a CRS; call set_crs() on the layer first.")
        if gdf.crs.to_epsg() != 4326:
            gdf = gdf.to_crs("EPSG:4326")
        if simplify:
            gdf["geometry"] = gdf["geometry"].simplify(simplify_tolerance, preserve_topology=True)
        drawable = gdf.geometry.notna() & ~gdf.geometry.is_empty
        return gdf[drawable].reset_index(drop=True)

    def add_point_markers(layer, id_col, label, color):
        # One circle marker per Point row; other geometry types have no single x/y
        for _, row in layer.iterrows():
            point = row.geometry
            if point.geom_type != "Point":
                continue
            folium.CircleMarker(
                location=[point.y, point.x],
                radius=4,
                color=color,
                fill=True,
                fill_opacity=0.8,
                popup=f"{label} ID: {row.get(id_col, 'N/A')}"
            ).add_to(m)

    # Clean & simplify
    polygon_clean = clean_gdf(polygon_gdf, simplify=True)
    facility_clean = clean_gdf(facility_gdf)
    tailing_clean = clean_gdf(tailing_gdf)

    # Create folium-based map with satellite basemap
    m = leafmap.Map(center=center, zoom=zoom)
    m.add_basemap("Esri.WorldImagery")

    # Tooltip shows the first two attribute columns; the geometry column is not a feature property
    geometry_col = polygon_clean.geometry.name
    tooltip_fields = [col for col in polygon_clean.columns if col != geometry_col][:2]
    tooltip = folium.GeoJsonTooltip(fields=tooltip_fields) if tooltip_fields else None

    # Add polygons
    folium.GeoJson(
        polygon_clean,
        name="Polygons",
        style_function=lambda x: {
            "fillColor": "yellow",
            "color": "black",
            "weight": 1,
            "fillOpacity": 0.4,
        },
        tooltip=tooltip
    ).add_to(m)

    # Add facilities and tailings
    add_point_markers(facility_clean, "main_id", "Facility", "blue")
    add_point_markers(tailing_clean, "tailing_id", "Tailing", "red")

    # Export to HTML
    folium.LayerControl().add_to(m)
    m.to_html(output_html)
    print(f"✅ Map exported to: {output_html}")

    # Optional screenshot with selenium (best effort: failures are reported, not raised)
    if take_screenshot:
        try:
            from pathlib import Path
            from selenium import webdriver
            from selenium.webdriver.chrome.options import Options
            import time

            options = Options()
            options.add_argument("--headless=new")
            options.add_argument("--window-size=1920,1080")
            driver = webdriver.Chrome(options=options)
            try:
                # as_uri() builds a valid file:// URL on every OS
                driver.get(Path(output_html).resolve().as_uri())
                time.sleep(3)  # Wait for tiles to load
                driver.save_screenshot(screenshot_png)
            finally:
                # Always release the browser process, even when loading or saving fails
                driver.quit()
            print(f"📸 Screenshot saved to: {screenshot_png}")
        except Exception as e:
            print("⚠️ Screenshot failed:", e)

    return m

In [None]:
create_map(facility_test, polygon_test, tailing_test)