In [4]:
import pandas as pd
import numpy as np
import geopandas as gpd


# Imports

In [5]:
# From datasets see dataset_processing.ipynb
main_gdf = gpd.read_file(r'data/Tables/GeoJSON/main_gdf.geojson')
tailings_gdf = gpd.read_file(r'data/Tables/GeoJSON/tailings_gdf.geojson')
min_can_gdf = gpd.read_file(r'data/Tables/GeoJSON/min_can_gdf.geojson')
ctrace_gdf = gpd.read_file(r'data/Tables/GeoJSON/ctrace_gdf.geojson')
conflict_gdf = gpd.read_file(r'data/Tables/GeoJSON/conflict_gdf.geojson')
ghg_gdf = gpd.read_file(r'data/Tables/GeoJSON/ghg_gdf.geojson')
pollutant_gdf = gpd.read_file(r'data/Tables/GeoJSON/pollutant_gdf.geojson')
climate_category_gdf = gpd.read_file(r'data/Tables/GeoJSON/climate_category_gdf.geojson')
weather_gdf = gpd.read_file(r'data/Tables/GeoJSON/weather_gdf.geojson')
peatland_gdf = gpd.read_file(r'data/Tables/GeoJSON/peatland_gdf.geojson')
prioritization_area_gdf = gpd.read_file(r'data/Tables/GeoJSON/prioritization_area_gdf.geojson')
population_gdf = gpd.read_file(r'data/Tables/GeoJSON/population_gdf.geojson')
water_risk_gdf = gpd.read_file(r'data/Tables/GeoJSON/water_risk_gdf.geojson')
land_cover_gdf = gpd.read_file(r'data/Tables/GeoJSON/land_cover_gdf.geojson')
npv_gdf = gpd.read_file(r'data/Tables/GeoJSON/npv_gdf.geojson')
indigenous_can_gdf = gpd.read_file(r'data/Tables/GeoJSON/indigenous_can_gdf.geojson')
wpda_gdf_polygons = gpd.read_file(r'data/Tables/GeoJSON/wpda_gdf_polygons.geojson')
tang_canada_gdf = gpd.read_file(r'data/Tables/GeoJSON/tang_canada_gdf.geojson')

In [86]:
main_gdf.drop(columns=["longitude", "latitude"], inplace=True)

In [86]:
# From manually collected data
production_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Production')
reserves_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Production')
energy_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Energy')
env_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Env')
env_int_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Env_int')
archetypes_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Archetype')

In [56]:
from db_creation_function import assign_row_id

In [96]:
def assign_row_id_to_collected_data(
    df,
    facility_id_col="main_id",
    row_id_col="row_id",
    prefix="ROW",
    year_col=None,
    scenario_col=None,
    fallback_cols=["facility_group_id", "company_id"]
):
    df = df.copy()

    def is_valid(val):
        return pd.notna(val) and str(val).strip() not in ["", "-"]

    def resolve_facility_id(row):
        val = row.get(facility_id_col)
        if is_valid(val):
            return str(val).strip()
        for col in fallback_cols:
            fallback_val = row.get(col)
            if is_valid(fallback_val):
                return str(fallback_val).strip()
        return "UNKNOWN"

    def extract_hash(fac_id):
        s = str(fac_id)
        parts = s.split("-")
        if len(parts) >= 3:
            return parts[-1]
        else:
            return s

    df["_facility_resolved"] = df.apply(resolve_facility_id, axis=1)
    df["_hash"] = df["_facility_resolved"].apply(extract_hash)

    group_cols = ["_facility_resolved"]
    if year_col:
        group_cols.append(year_col)
    if scenario_col:
        group_cols.append(scenario_col)

    df["_row_index"] = df.groupby(group_cols).cumcount() + 1

    def build_id(row):
        parts = [prefix, row["_hash"]]
        if year_col:
            parts.append(str(row[year_col]))
        if scenario_col:
            parts.append(str(row[scenario_col]))
        parts.append(str(row["_row_index"]))
        return "-".join(parts)

    df[row_id_col] = df.apply(build_id, axis=1)
    df.drop(columns=["_row_index", "_hash", "_facility_resolved"], inplace=True)

    cols = [row_id_col] + [col for col in df.columns if col != row_id_col]
    return df[cols]

In [98]:
# We need to assign row_ids to the manually collected data
production_df = assign_row_id_to_collected_data(production_df, 
                  facility_id_col='main_id',
                  prefix='PROD',
                  year_col='year'           
                  )

reserves_df = assign_row_id_to_collected_data(reserves_df, 
                  facility_id_col='main_id',
                  prefix='RES',
                  year_col='year'           
                  )

energy_df = assign_row_id_to_collected_data(energy_df, 
                  facility_id_col='main_id',
                  prefix='NRJ',
                  year_col='year'           
                  )

env_df = assign_row_id_to_collected_data(env_df, 
                  facility_id_col='main_id',
                  prefix='ENV',
                  year_col='year'           
                  )

env_int_df = assign_row_id_to_collected_data(env_int_df, 
                  facility_id_col='main_id',
                  prefix='ENV_INT',
                  year_col='year'           
                  )

archetypes_df = assign_row_id_to_collected_data(archetypes_df, 
                  facility_id_col='main_id',
                  prefix='ARCH',
                  )

# Create auxiliary tables

## Source table

In [51]:
# For the main table
source_map = {
    "project": "https://open.canada.ca/data/en/dataset/22b2db8a-dc12-47f2-9737-99d3da921751",
    "mining": "https://open.canada.ca/data/en/dataset/000183ed-8864-42f0-ae43-c4313a860720",
    "manufacturing": "https://open.canada.ca/data/en/dataset/000183ed-8864-42f0-ae43-c4313a860720"
}
main_gdf["source_id"] = main_gdf["facility_type"].map(source_map)
main_gdf

Unnamed: 0,main_id,facility_name,facility_group_name,facility_group_id,reported_company,company_name_folder,company_id,longitude,latitude,city,...,status,activity_status,development_stage,facility_type,mining_processing_type,commodity_group,primary_commodity,commodities,source_id,geometry
0,QC-MAIN-5801b453,Abcourt-Barvue,,,Abcourt Mines Inc.,Abcourt Mines Inc,CMP-0a6a3062,-77.679010,48.521540,,...,project_facility,Active,Advanced project,project,,,Zinc,"Zinc, silver",https://open.canada.ca/data/en/dataset/22b2db8...,POINT (-77.67901 48.52154)
1,QC-MAIN-d29e0839,Sleeping Giant,,,Abcourt Mines Inc.,Abcourt Mines Inc,CMP-0a6a3062,-77.974340,49.132590,north of Amos,...,active_facility,,,mining,Concentrator,Precious metals,Gold,"Gold, silver",https://open.canada.ca/data/en/dataset/000183e...,POINT (-77.97434 49.13259)
2,QC-MAIN-f1ff4920,Akasaba West,,,Agnico Eagle Mines Limited,Agnico Eagle Mines Limited,CMP-6265c407,-77.580000,48.040000,,...,project_facility,Active,Advanced project,project,,,Gold,"Gold, copper",https://open.canada.ca/data/en/dataset/22b2db8...,POINT (-77.58000 48.04000)
3,NU-MAIN-4ea8ac9d,Amaruq,Meadowbank complex,GRP-0a2c0d69,Agnico Eagle Mines Limited,Agnico Eagle Mines Limited,CMP-6265c407,-96.697000,65.415000,Baker Lake,...,active_facility,,,mining,"Open-pit, underground",Precious metals,Gold,Gold,https://open.canada.ca/data/en/dataset/000183e...,POINT (-96.69700 65.41500)
4,QC-MAIN-e7e6a960,Canadian Malartic,,,Agnico Eagle Mines Limited,Agnico Eagle Mines Limited,CMP-6265c407,-78.130824,48.122223,Malartic,...,active_facility,,,mining,"Open-pit, concentrator",Precious metals,Gold,"Gold, silver",https://open.canada.ca/data/en/dataset/000183e...,POINT (-78.13082 48.12222)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,ON-MAIN-c5fefb01,Mishi,,,Wesdome Gold Mines Ltd.,Wesdome Gold Mines Ltd,CMP-3d2c4955,-85.452500,48.110280,Wawa,...,active_facility,,,mining,Open-pit,Precious metals,Gold,Gold,https://open.canada.ca/data/en/dataset/000183e...,POINT (-85.45250 48.11028)
266,BC-MAIN-feb13c30,Record Ridge,,,West High Yield Resources Ltd.,West High Yield Resources Ltd,CMP-1023f9c0,-117.884000,49.082000,,...,project_facility,Active,Advanced project,project,,,Magnesium,Magnesium,https://open.canada.ca/data/en/dataset/22b2db8...,POINT (-117.88400 49.08200)
267,YT-MAIN-5e0d8b48,Casino,,,Western Copper and Gold Corporation,Western Copper and Gold Corporation,CMP-2dac9896,-138.833330,62.733330,,...,project_facility,Active,Advanced project,project,,,Copper,"Copper, gold, molybdenum, silver",https://open.canada.ca/data/en/dataset/22b2db8...,POINT (-138.83333 62.73333)
268,ON-MAIN-12c68d49,Eagle's Nest,,,Wyloo,Wyloo,CMP-65b36728,-86.303800,52.742020,,...,project_facility,Active,Advanced project,project,,,Nickel,"Nickel, copper, gold, platinum, palladium",https://open.canada.ca/data/en/dataset/22b2db8...,POINT (-86.30380 52.74202)


In [52]:
# For the datasets
tailings_gdf["source_id"] = 'https://tailing.grida.no/'
min_can_gdf["source_id"] = '10.1038/s41597-024-03116-3'
ctrace_gdf["source_id"] = 'Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory'
conflict_gdf["source_id"] = 'http://journals.librarypublishing.arizona.edu/jpe/article/id/1932/'
ghg_gdf["source_id"] = 'https://www.canada.ca/en/environment-climate-change/services/environmental-indicators/greenhouse-gas-emissions/large-facilities.html'
pollutant_gdf["source_id"] = 'https://www.canada.ca/en/environment-climate-change/services/national-pollutant-release-inventory/tools-resources-data/exploredata.html' 
prioritization_area_gdf['source_id'] = 'https://doi.org/10.5281/zenodo.5006332'
climate_category_gdf['source_id'] = 'https://doi.org/10.1038/s41597-023-02549-6'
weather_gdf['source_id'] = 'https://climate-scenarios.canada.ca/?page=CanDCS6-indices'
peatland_gdf['source_id'] = 'https://greifswaldmoor.de/global-peatland-database-en.html'
population_gdf['source_id'] = 'https://doi.org/10.2905/2FF68A52-5B5B-4A22-8F40-C41DA8332CFE' 
water_risk_gdf['source_id'] = 'https://www.wri.org/data/aqueduct-global-maps-40-data'
land_cover_gdf['source_id'] = 'https://doi.org/10.5067/MODIS/MCD12Q1.061 + https://doi.org/10.5281/zenodo.7254221'
npv_gdf['source_id'] = 'https://doi.org/10.7717/peerj.5457'
indigenous_can_gdf['source_id'] = 'http://www.landmarkmap.org'
wpda_gdf_polygons['source_id'] = 'https://www.protectedplanet.net'
tang_canada_gdf['source_id'] = 'https://doi.org/10.1038/s43247-023-00805-6'
#maus_canada_gdf['source_id'] = 'https://doi.org/10.1038/s41597-022-01547-4'

In [None]:
from db_creation_function import add_source_id_to_collected_data, create_source_table_from_datasets

In [61]:
def add_source_id_to_collected_data(df, company_col="company", facility_col="facility", source_col="source", source_id_col="source_id"):
    """
    Add a human-readable source_id column to a DataFrame using company, facility, and source file name.

    Parameters:
    - df: pandas DataFrame
    - company_col: column name for the company
    - facility_col: column name for the facility
    - source_col: column name for the source file (e.g. PDF, Excel)
    - source_id_col: name of the output column to be added

    Returns:
    - df with a new 'source_id' column (if not already present)
    """
    import pandas as pd
    from pathlib import Path

    if source_id_col not in df.columns:
        def create_source_id(row):
            company = str(row[company_col]).strip().replace(" ", "")
            file_stem = Path(str(row[source_col])).stem.strip().replace(" ", "")
            return f"SRC_{company}_{file_stem}"

        df[source_id_col] = df.apply(create_source_id, axis=1)
    else:
        print(f"ℹ️ '{source_id_col}' already exists. No changes made.")

    return df

In [62]:
reserves_df

Unnamed: 0,row_id,year,level,geography,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodities_nrcan,commodity,reference_point,material_type,data_type,unit,value,comment,source
0,RES-1f126a43-2023-1,2023,Site-specific,Canada,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Ore,2,Ore processed,Production,t,1.959493e+07,,2023-Sustainability-Performance_data
1,RES-1f126a44-2023-1,2023,Site-specific,Canada,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a44,-,,mining,"Underground, concentrator","Gold, silver",Gold,4,Intermediate metal,Production,oz,6.846400e+05,"""Production reflects Agnico Eagle's 50% intere...",2023-Sustainability-Performance_data
2,RES-1f126a45-2023-1,2023,Site-specific,Canada,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a45,-,,mining,"Underground, concentrator","Gold, silver",Silver,4,Intermediate metal,Production,koz,3.100000e+02,"""Production reflects Agnico Eagle's 50% intere...",2023-Sustainability-Performance_data
3,RES-e7e6a960-2023-1,2023,Site-specific,Canada,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Gold,2,Ore processed,Production,t,2.543485e+07,,2023-Sustainability-Performance_data
4,RES-e7e6a963-2023-1,2023,Site-specific,Canada,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a963,-,,mining,"Open-pit, concentrator",Gold,Gold,4,Intermediate metal,Production,oz,6.774460e+05,Like intermediate metal,2023-Sustainability-Performance_data
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
338,RES-c5fefb01-2023-1,2023,Site-specific,Canada,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Gold,2,Head grade,Technical attribute,g/t,5.900000e+00,,2024-MDA-FINAL
339,RES-c5fefb01-2023-2,2023,Site-specific,Canada,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Gold,2,Recovery rate,Technical attribute,%,9.830000e+01,,2024-MDA-FINAL
340,RES-c5fefb01-2023-3,2023,Site-specific,Canada,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Gold,4,Intermediate metal,Production,Oz,3.553700e+04,,2024-MDA-FINAL
341,RES-c5fefb01-2023-4,2023,Site-specific,Canada,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Gold,2,Ore milled,Production,t,1.911480e+05,,2024-MDA-FINAL


In [63]:
production_df = add_source_id_to_collected_data(production_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

reserves_df = add_source_id_to_collected_data(reserves_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

energy_df = add_source_id_to_collected_data(energy_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

env_df = add_source_id_to_collected_data(env_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

env_int_df = add_source_id_to_collected_data(env_int_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

archetypes_df = add_source_id_to_collected_data(archetypes_df, 
                                    company_col="reported_company_nrcan", 
                                    facility_col="name", 
                                    source_col="source")

In [65]:
def create_source_table_from_datasets(dataset_dict, manually_collected_dfs, source_col="source", company_col="company", facility_col="facility"):
    """
    Create a consolidated source table from multiple GeoDataFrames and manually collected data.

    Parameters:
    - dataset_dict: dict mapping GeoDataFrame name (str) to actual df with a single source_id value
    - manually_collected_dfs: list of dataframes that use the 'add_source_id' function
    - source_col, company_col, facility_col: column names in manually collected dfs

    Returns:
    - A pandas DataFrame with columns: source_id, source_provenance, source_name
    """

    import pandas as pd
    from pathlib import Path

    # Step 1: Add known datasets
    dataset_sources = []
    for name, df in dataset_dict.items():
        unique_ids = df['source_id'].dropna().unique()
        for sid in unique_ids:
            dataset_sources.append({
                "source_id": sid.strip(),
                "source_provenance": "dataset",
                "source_name": name
            })

    # Step 2: Add manually collected sources
    manual_sources = []
    for df in manually_collected_dfs:
        if "source_id" not in df.columns:
            continue  # skip if not processed yet
        for _, row in df.dropna(subset=["source_id"]).drop_duplicates(subset=["source_id"]).iterrows():
            source_id = row["source_id"]
            company = str(row.get(company_col, "")).strip()
            facility = str(row.get(facility_col, "")).strip()
            file_path = Path(str(row.get(source_col, "")))
            file_name = file_path.name
            source_name = f"{company} – {facility} ({file_name})".strip(" –()")
            manual_sources.append({
                "source_id": source_id,
                "source_provenance": "report",
                "source_name": source_name
            })

    # Combine, deduplicate
    full_source_table = pd.DataFrame(dataset_sources + manual_sources)
    full_source_table = full_source_table.drop_duplicates(subset=["source_id"]).sort_values("source_provenance")

    return full_source_table


In [68]:
dataset_dict = {
    "Global Tailings Portal": tailings_gdf,
    "A Comprehensive Historical and Geolocalized Database of Mining Activities in Canada": min_can_gdf,
    "ClimateTRACE mineral extraction sector": ctrace_gdf,
    "Environmental Justice Atlas": conflict_gdf,
    "GHG from large facilities": ghg_gdf,
    "National Pollutant Inventory Report (NPRI)": pollutant_gdf,
    "Areas of global importance for conserving terrestrial biodiversity, carbon and water": prioritization_area_gdf,
    "High-Resolution (1 km) Koppen-Geiger Maps for 1901–2099 Based on Constrained CMIP6 Projections": climate_category_gdf,
    "CMIP6 statistically downscaled climate indices (CanDCS-M6)": weather_gdf,
    "Global Peatland Database": peatland_gdf,
    "Global Human Settlement Layer (GHSL)": population_gdf,
    "Aqueduct 4.0": water_risk_gdf,
    "Modis Land Cover and ESA World Cover": land_cover_gdf,
    "Potential Natural Vegetation (NPV)": npv_gdf,
    "Indigenous Peoples’ and Local Community Lands and Territories": indigenous_can_gdf,
    "World Database on Protected Areas (WDPA)": wpda_gdf_polygons,
    "Global Mining Footprint Mapped from High-Resolution Satellite Imagery": tang_canada_gdf,
}

manual_dfs = [production_df, reserves_df, energy_df, env_df, env_int_df, archetypes_df] 

source_table = create_source_table_from_datasets(dataset_dict, manual_dfs)

In [69]:
source_table

Unnamed: 0,source_id,source_provenance,source_name
0,https://tailing.grida.no/,dataset,Global Tailings Portal
16,https://doi.org/10.1038/s43247-023-00805-6,dataset,Global Mining Footprint Mapped from High-Resol...
15,https://www.protectedplanet.net,dataset,World Database on Protected Areas (WDPA)
14,http://www.landmarkmap.org,dataset,Indigenous Peoples’ and Local Community Lands ...
13,https://doi.org/10.7717/peerj.5457,dataset,Potential Natural Vegetation (NPV)
...,...,...,...
70,SRC_SilverLakeResourcesLtd_MDI42C14SE00005,report,MDI42C14SE00005
71,SRC_SilverLakeResourcesLtd_MDI42C14SE00006,report,MDI42C14SE00006
72,SRC_SSRMiningInc_FY2024,report,FY2024
65,SRC_RioTinto_rt-fact-book-addendum,report,rt-fact-book-addendum.xlsx


## Substance table

In [None]:
from db_creation_function import create_substance_table

In [32]:
def create_substance_table(pollutant_gdf, env_df):
    """
    Combine and clean substance names from NPRI and manually collected datasets,
    apply harmonized naming, assign stable substance IDs, and return a master substance table.
    """

    # Step 1: Concatenate and tag provenance
    npri_sub = pollutant_gdf[['substance_name_npri']].copy()
    npri_sub['source'] = 'NPRI'
    npri_sub = npri_sub.rename(columns={'substance_name_npri': 'original_name'})

    manual_sub = env_df[['substance_name']].copy()
    manual_sub['source'] = 'Manual'
    manual_sub = manual_sub.rename(columns={'substance_name': 'original_name'})

    all_substances = pd.concat([npri_sub, manual_sub], ignore_index=True)

    # Step 2: Clean and exclude irrelevant entries
    all_substances['original_name'] = all_substances['original_name'].astype(str).str.strip()
    all_substances = all_substances[all_substances['original_name'] != '-']
    all_substances = all_substances[all_substances['original_name'] != 'nan']

    # Step 3: Harmonization dictionary
    harmonized_map = {
        "Ammonia (total)": "Ammonia",
        "Antimony (and its compounds)": "Antimony",
        "Arsenic (and its compounds)": "Arsenic",
        "Lead (and its compounds)": "Lead",
        "Mercury (and its compounds)": "Mercury",
        "PM10 - Particulate Matter <= 10 Micrometers": "PM10",
        "PM2.5 - Particulate Matter <= 2.5 Micrometers": "PM2.5",
        "Volatile Organic Compounds (Total)": "VOCs",
    }

    all_substances['harmonized_name'] = all_substances['original_name'].replace(harmonized_map)

    # Step 4: Drop duplicates
    substance_table = (
        all_substances
        .drop_duplicates(subset=['harmonized_name'])
        .sort_values('harmonized_name')
        .reset_index(drop=True)
    )

    # Step 5: Generate stable substance_id using SHA1 hash
    def make_id(row):
        raw = f"{row['harmonized_name']}"
        return "SUB" + hashlib.sha1(raw.encode('utf-8')).hexdigest()[:10]

    substance_table['substance_id'] = substance_table.apply(make_id, axis=1)

    # Step 6: Reorder columns
    substance_table = substance_table[[
        'substance_id', 'harmonized_name', 'original_name', 'source'
    ]]

    return substance_table


In [33]:
substance_table = create_substance_table(pollutant_gdf, env_df)

In [34]:
substance_table

Unnamed: 0,substance_id,harmonized_name,original_name,source
0,SUB07971a0766,"1,2,4-Trimethylbenzene","1,2,4-Trimethylbenzene",NPRI
1,SUBbdfdffeb95,1-Nitropyrene,1-Nitropyrene,NPRI
2,SUB0f01e7c1d0,2-Butoxyethanol,2-Butoxyethanol,NPRI
3,SUB324959500a,"7,12-Dimethylbenz[a]anthracene","7,12-Dimethylbenz[a]anthracene",NPRI
4,SUBfee6b26acd,"7H-Dibenzo[c,g]carbazole","7H-Dibenzo[c,g]carbazole",NPRI
...,...,...,...,...
169,SUB787e5ac1a2,Vanadium (and its compounds),Vanadium (and its compounds),NPRI
170,SUBe849a2ff09,Xylene (all isomers),Xylene (all isomers),NPRI
171,SUBae95939c3f,Zinc (and its compounds),Zinc (and its compounds),NPRI
172,SUB090a1237c1,i-Butyl alcohol,i-Butyl alcohol,NPRI


In [40]:
pollutant_gdf_a = pollutant_gdf.merge(
    substance_table[['substance_id', 'original_name']],
    how='left',
    left_on='substance_name_npri',
    right_on='original_name'
).drop(columns='original_name')



In [41]:
env_df

Unnamed: 0,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,compartment,flow_type,subflow_type,substance_name,unit,value,comment,source
0,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,315,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
1,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,NOx,t,4802,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
2,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,SOx,t,486,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
3,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,NOx,t,7414,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
4,2023,Site-specific,Agnico Eagle Mines Limited,Goldex,NU-MAIN-8b0264c9,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,38,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2023,Site-specific,Wesdome Gold Mines Ltd,Eagle River,QC-MAIN-9de9bb0d,-,,mining,"Underground, concentrator",Gold,Water,water,Total water discharged,-,m3,324347,,2023-ESG-Data-Tables.xlsx
746,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal,-,m3,168115,,2023-ESG-Data-Tables.xlsx
747,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal|Freshwater withdrawn,-,m3,168115,,2023-ESG-Data-Tables.xlsx
748,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Total water consumption,-,m3,82144,,2023-ESG-Data-Tables.xlsx


In [42]:
substance_table

Unnamed: 0,substance_id,harmonized_name,original_name,source
0,SUB07971a0766,"1,2,4-Trimethylbenzene","1,2,4-Trimethylbenzene",NPRI
1,SUBbdfdffeb95,1-Nitropyrene,1-Nitropyrene,NPRI
2,SUB0f01e7c1d0,2-Butoxyethanol,2-Butoxyethanol,NPRI
3,SUB324959500a,"7,12-Dimethylbenz[a]anthracene","7,12-Dimethylbenz[a]anthracene",NPRI
4,SUBfee6b26acd,"7H-Dibenzo[c,g]carbazole","7H-Dibenzo[c,g]carbazole",NPRI
...,...,...,...,...
169,SUB787e5ac1a2,Vanadium (and its compounds),Vanadium (and its compounds),NPRI
170,SUBe849a2ff09,Xylene (all isomers),Xylene (all isomers),NPRI
171,SUBae95939c3f,Zinc (and its compounds),Zinc (and its compounds),NPRI
172,SUB090a1237c1,i-Butyl alcohol,i-Butyl alcohol,NPRI


In [43]:
env_df.merge(
    substance_table[['substance_id', 'original_name']],
    how='left',
    left_on='substance_name',
    right_on='original_name'
).drop(columns='original_name')

Unnamed: 0,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,compartment,flow_type,subflow_type,substance_name,unit,value,comment,source,substance_id
0,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,315,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,SUB2c1715b8f1
1,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,NOx,t,4802,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,SUB515573c582
2,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,SOx,t,486,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,SUB2c1715b8f1
3,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,NOx,t,7414,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,SUB515573c582
4,2023,Site-specific,Agnico Eagle Mines Limited,Goldex,NU-MAIN-8b0264c9,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,38,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,SUB2c1715b8f1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2023,Site-specific,Wesdome Gold Mines Ltd,Eagle River,QC-MAIN-9de9bb0d,-,,mining,"Underground, concentrator",Gold,Water,water,Total water discharged,-,m3,324347,,2023-ESG-Data-Tables.xlsx,
746,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal,-,m3,168115,,2023-ESG-Data-Tables.xlsx,
747,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal|Freshwater withdrawn,-,m3,168115,,2023-ESG-Data-Tables.xlsx,
748,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Total water consumption,-,m3,82144,,2023-ESG-Data-Tables.xlsx,


In [92]:
substance_table.to_csv(r'data/Tables/Auxiliary_tables/substance_table.csv', index=False)

## Compartment table

In [18]:
pollutant_gdf[pollutant_gdf['emission_type'] == 'Water Releases / Rejets à l\'eau']['emission_subtype'].unique()

array(['Direct Discharge', 'Total', 'Receiving Waterbody (English)',
       'Receiving Waterbody (French)', 'Spills'], dtype=object)

In [19]:
# Change the value to 'None' for emission_subtype when emission_type = 'Water Releases / Rejets à l'eau' and 'emission_subtype' is 'Receiving Waterbody (English)' or 'Receiving Waterbody (French)'
pollutant_gdf.loc[
    (pollutant_gdf['emission_type'] == 'Water Releases / Rejets à l\'eau') & 
    (pollutant_gdf['emission_subtype'].isin(['Receiving Waterbody (English)', 'Receiving Waterbody (French)'])),
    'emission_subtype'
] = None

In [None]:
from db_creation_function import create_compartment_table

In [39]:
import pandas as pd
import hashlib

def create_compartment_table(pollutant_gdf, env_df):
    """
    Build a harmonized compartment table with traceable raw labels for merging.
    """

    # --- Define mapping from emission_type to (compartment, pathway)
    compartment_mapping_clean = {
        "Air Emissions / Émissions à l'air": ("Air", "Unspecified"),
        "Water Releases / Rejets à l'eau": ("Water", "Unspecified"),
        "Land Releases /  Rejets au sol": ("Land", "Unspecified"),
        "On-Site Disposal / Élimination sur le site": ("Land", "On-site disposal"),
        "Off-Site Disposal / Élimination hors site": ("Land", "Off-site disposal"),
        "Total Releases / Rejets totaux": (None, "Aggregate"),
        "Transfers for Treatment / Transferts pour traitement": (None, "Transfer for treatment"),
        "Transfers for Recycling / Transferts pour recyclage": (None, "Transfer for recycling"),
        "Total On-Site, Off-Site and Treatment Disposal /\n Élimination sur le site, hors site et pour traitement totale": (None, "Aggregate"),
        "Grand Total": (None, "Grand total")
    }

    # --- NPRI data
    npri_comp = pollutant_gdf[['emission_type', 'emission_subtype']].dropna(subset=['emission_type']).copy()
    npri_comp['source'] = 'NPRI'
    npri_comp[['compartment', 'default_pathway']] = npri_comp['emission_type'].map(compartment_mapping_clean).apply(pd.Series)
    npri_comp['compartment_pathway'] = npri_comp['emission_subtype'].fillna('').str.strip()
    npri_comp.loc[npri_comp['compartment_pathway'] == '', 'compartment_pathway'] = npri_comp['default_pathway']
    npri_comp['raw_compartment_label'] = npri_comp['emission_type']
    npri_comp['raw_pathway_label'] = npri_comp['emission_subtype'].fillna('Unspecified')
    npri_comp = npri_comp.drop(columns=['default_pathway'])

    # --- Manual data
    env_comp = env_df[['compartment']].dropna().copy()
    env_comp = env_comp[env_comp['compartment'] != '-']
    env_comp['compartment_pathway'] = 'Unspecified'
    env_comp['raw_compartment_label'] = env_comp['compartment']
    env_comp['raw_pathway_label'] = 'Unspecified'
    env_comp['source'] = 'Manually collected data'

    # --- Align columns
    env_comp['compartment'] = env_comp['compartment']
    manual_cols = ['compartment', 'compartment_pathway', 'raw_compartment_label', 'raw_pathway_label', 'source']
    npri_cols = manual_cols

    # --- Combine all
    all_comps = pd.concat([
        npri_comp[npri_cols],
        env_comp[manual_cols]
    ], ignore_index=True).drop_duplicates().reset_index(drop=True)

    # --- Create hashed ID
    def make_id(row):
        raw = f"{row['compartment']}_{row['compartment_pathway']}"
        return "CMP" + hashlib.sha1(raw.encode('utf-8')).hexdigest()[:10]

    all_comps['compartment_id'] = all_comps.apply(make_id, axis=1)

    # --- Final structure
    return all_comps[[
        'compartment_id', 'compartment', 'compartment_pathway',
        'raw_compartment_label', 'raw_pathway_label', 'source'
    ]].sort_values(['compartment', 'compartment_pathway'])



In [40]:
compartment_table = create_compartment_table(pollutant_gdf, env_df)

In [42]:
pollutant_gdf_a = pollutant_gdf.merge(
    compartment_table[[
        'compartment_id', 'raw_compartment_label', 'raw_pathway_label'
    ]],
    how='left',
    left_on=['emission_type', 'emission_subtype'],
    right_on=['raw_compartment_label', 'raw_pathway_label']
).drop(columns=['raw_compartment_label', 'raw_pathway_label'])


In [43]:
pollutant_gdf_a

Unnamed: 0,row_id,year,npri_id,facility_name_npri,company_name_npri,facility_type,longitude,latitude,terrestrial_ecozone,watershed,substance_name_npri,substance_name_ecoinvent,substance_unit,emission_type,emission_subtype,value,source_id,substance_id,geometry,compartment_id
0,npri-1568-2023-1,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Ammonia (total),Ammonia,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636,https://www.canada.ca/en/environment-climate-c...,SUBa2c74fdce8,POINT (-80.08734 48.13032),CMP1718f4466d
1,npri-1568-2023-2,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Arsenic (and its compounds),Arsenic,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116,https://www.canada.ca/en/environment-climate-c...,SUB526607d180,POINT (-80.08734 48.13032),CMP1718f4466d
2,npri-1568-2023-3,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Carbon monoxide,"Carbon monoxide, fossil",tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208,https://www.canada.ca/en/environment-climate-c...,SUB92570cece4,POINT (-80.08734 48.13032),CMP1718f4466d
3,npri-1568-2023-4,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Chromium (and its compounds),Chromium,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105,https://www.canada.ca/en/environment-climate-c...,SUB075f575023,POINT (-80.08734 48.13032),CMP1718f4466d
4,npri-1568-2023-5,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Cobalt (and its compounds),Cobalt,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18,https://www.canada.ca/en/environment-climate-c...,SUB481f3889a8,POINT (-80.08734 48.13032),CMP1718f4466d
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24125,npri-5697-2023-6,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",25.1008,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889,POINT (-79.49600 43.80800),CMP281acc6045
24126,npri-5698-2023-6,2023,5698,Bowes Manufacturing,,manufacturing,-79.49480,43.80360,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",23.418,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889,POINT (-79.49480 43.80360),CMP281acc6045
24127,npri-34697-2023-2,2023,34697,541 Bowes Warehouse,,manufacturing,-79.49707,43.81726,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",16.543746,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889,POINT (-79.49707 43.81726),CMP281acc6045
24128,npri-5697-2023-7,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Volatile Organic Compounds (Total),,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",12.5823,https://www.canada.ca/en/environment-climate-c...,SUB0d60783c59,POINT (-79.49600 43.80800),CMP281acc6045


In [44]:
env_df['compartment_pathway'] = 'Unspecified'
env_df = env_df.merge(
    compartment_table[[
        'compartment_id', 'raw_compartment_label', 'raw_pathway_label'
    ]],
    how='left',
    left_on=['compartment', 'compartment_pathway'],
    right_on=['raw_compartment_label', 'raw_pathway_label']
).drop(columns=['raw_compartment_label', 'raw_pathway_label'])


In [45]:
env_df

Unnamed: 0,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,compartment,flow_type,subflow_type,substance_name,unit,value,comment,source,compartment_pathway,compartment_id
0,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,315,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,Unspecified,CMPf3265e9fca
1,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,NOx,t,4802,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,Unspecified,CMPf3265e9fca
2,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,SOx,t,486,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,Unspecified,CMPf3265e9fca
3,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,NOx,t,7414,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,Unspecified,CMPf3265e9fca
4,2023,Site-specific,Agnico Eagle Mines Limited,Goldex,NU-MAIN-8b0264c9,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,SOx,t,38,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx,Unspecified,CMPf3265e9fca
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,2023,Site-specific,Wesdome Gold Mines Ltd,Eagle River,QC-MAIN-9de9bb0d,-,,mining,"Underground, concentrator",Gold,Water,water,Total water discharged,-,m3,324347,,2023-ESG-Data-Tables.xlsx,Unspecified,CMP68bd334352
746,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal,-,m3,168115,,2023-ESG-Data-Tables.xlsx,Unspecified,CMP68bd334352
747,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal|Freshwater withdrawn,-,m3,168115,,2023-ESG-Data-Tables.xlsx,Unspecified,CMP68bd334352
748,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Total water consumption,-,m3,82144,,2023-ESG-Data-Tables.xlsx,Unspecified,CMP68bd334352


# Matching

## Point to point matching

In [31]:
from matching_functions import match_facilities, one_to_many_relationships

### MinCan

Some mines in MinCan dataset have very slightly different name (e.g. LaRonde - Zone 5 and LaRonde), or several companies for the same mines.

In [7]:
lt_main_mincan = match_facilities(main_gdf, min_can_gdf,
                     id_col1='main_id', id_col2='mincan_id',
                     name_col1='facility_name', name_col2='namemine'
                     )

In [8]:
matches_main_mincan = one_to_many_relationships(
    lt_main_mincan,
    id_main_col='main_id',
    id_sat_col='mincan_id',
    distance_threshold_m=2000,
    similarity_threshold=80,
    similarity_metric="token_set"  
)

In [9]:
min_can_gdf = min_can_gdf.merge(
    matches_main_mincan,
    on="mincan_id", 
    how="left"
)

In [11]:
# For SQL
min_can_gdf = min_can_gdf[min_can_gdf["main_id"].notna()].copy()

In [14]:
col_to_keep = ["mincan_id", "main_id", "mine_status", "operation_periods"]
min_can_gdf = min_can_gdf[col_to_keep]
min_can_gdf

Unnamed: 0,mincan_id,main_id,mine_status,operation_periods
4,QC-MINCAN-d1e8b6b1,QC-MAIN-30c1828c,Inactive,1927–1976
53,NL-MINCAN-01316355,NL-MAIN-842ba1b4,Inactive,1997–2010
55,QC-MINCAN-494c7715,QC-MAIN-9de9bb0d,Active,1936–1996; 2005–2013; 2021–open
117,BC-MINCAN-8cc41b53,BC-MAIN-6b4800fe,Active,1972–1998; 2004–open
136,QC-MINCAN-8321e85e,QC-MAIN-e51eda66,Active,1988–open
...,...,...,...,...
871,NL-MINCAN-a34e9244,NL-MAIN-c139de6d,Inactive,1971–2019
872,BC-MINCAN-3e33555f,BC-MAIN-b1fe389a,Inactive,1971–2008
897,SK-MINCAN-b20ab2db,SK-MAIN-bb89158f,Inactive,1983–2018
906,ON-MINCAN-7b29d7b3,ON-MAIN-bfbcd72e,Active,1987–1994; 2012–open


### CTrace 

In [79]:
lt_main_ctrace = match_facilities(main_gdf, ctrace_gdf,
                     id_col1='main_id', id_col2='ctrace_id',
                     name_col1='facility_name', name_col2='source_name'
                     )

In [80]:
matches_main_ctrace = one_to_many_relationships(
    lt_main_ctrace,
    id_main_col='main_id',
    id_sat_col='ctrace_id',
    distance_threshold_m=2000,
    similarity_threshold=80,
    similarity_metric="token_set"  
)

In [81]:
ctrace_gdf = ctrace_gdf.merge(
    matches_main_ctrace,
    on="ctrace_id", 
    how="left"
)

In [82]:
# For SQL
ctrace_gdf = ctrace_gdf[ctrace_gdf["main_id"].notna()].copy()

In [83]:
# We separate emissions and production data
ctrace_ghg_gdf = ctrace_gdf.drop(columns=['activity', 'activity_units', 'capacity', 'capacity_units'])
ctrace_prod_gdf = ctrace_gdf.drop(columns=['emissions_quantity', 'gas'])

### Conflict

In [15]:
lt_main_conflict = match_facilities(main_gdf, conflict_gdf,
                     id_col1='main_id', id_col2='conflict_id',
                     name_col1='facility_name', name_col2='facility_name_ejatlas'
                     )

In [16]:
matches_main_conflict = one_to_many_relationships(
    lt_main_conflict,
    id_main_col='main_id',
    id_sat_col='conflict_id',
    distance_threshold_m=20000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [17]:
conflict_gdf = conflict_gdf.merge(
    matches_main_conflict,
    on="conflict_id",  
    how="left"
)

In [18]:
# For SQL
conflict_gdf = conflict_gdf[conflict_gdf["main_id"].notna()].copy()

In [21]:
conflict_gdf

Unnamed: 0,conflict_id,ej_atlas_id,case_name,facility_name_ejatlas,province,longitude,latitude,geolocation_accuracy,conflict_description,conflict_details,population_affected,conflict_intensity,start_date,end_date,project_status,reference,geometry,main_id
9,NL-CONFLICT-fd990d3e,4135,"Vale Mining in Voisey's Bay, Labrador, Canada",Voisey's Bay,Newfoundland Labrador,-62.103056,56.334722,MEDIUM (Regional level),<p>Voiseyâ€™s Bay nickel mine in Labrador was ...,"Voiseyâ€™s Bay is a â€œ6,000 tonnes-per-day fa...",Rural,"MEDIUM (street protests, visible mobilization)",11/1994,,In operation,"(Mining Data Online, n.d.)\nhttps://miningdata...",POINT (-62.10306 56.33472),NL-MAIN-2d8801d6
10,ON-CONFLICT-928fd562,4145,Port Colborne Class Action Lawsuit Against Val...,Port Colborne,Ontario,-79.25,42.883333,MEDIUM (Regional level),"<p>In Pearson (later Smith) v. Inco, the large...",Raw materials from Valeâ€™s Sudbury mining ope...,Urban,LOW (some local organising),06/2000,04/2012,In operation,,POINT (-79.25000 42.88333),ON-MAIN-2e13aaa0
15,QC-CONFLICT-93cab570,5599,Nouveau Mondeâ€™s Matawinie graphite mine in Q...,Municipality of Saint-Michel-des-Saints in Ma...,Quebec,-73.958848,46.633467,MEDIUM (Regional level),<p>Five kilometers southwest of the municipali...,The project includes 319 mining concessions (c...,Rural,"MEDIUM (street protests, visible mobilization)",01/01/2017,,Under construction,,POINT (-73.95885 46.63347),QC-MAIN-d6748d40


In [22]:
col_to_keep = ["ej_atlas_id",
  "main_id",
  "case_name",
  "start_date",
  "end_date",
  "conflict_description",
  "conflict_details",
  "population_affected",
  "conflict_intensity",
  "project_status"
               ]
conflict_gdf = conflict_gdf[col_to_keep]
conflict_gdf

Unnamed: 0,ej_atlas_id,main_id,case_name,start_date,end_date,conflict_description,conflict_details,population_affected,conflict_intensity,project_status
9,4135,NL-MAIN-2d8801d6,"Vale Mining in Voisey's Bay, Labrador, Canada",11/1994,,<p>Voiseyâ€™s Bay nickel mine in Labrador was ...,"Voiseyâ€™s Bay is a â€œ6,000 tonnes-per-day fa...",Rural,"MEDIUM (street protests, visible mobilization)",In operation
10,4145,ON-MAIN-2e13aaa0,Port Colborne Class Action Lawsuit Against Val...,06/2000,04/2012,"<p>In Pearson (later Smith) v. Inco, the large...",Raw materials from Valeâ€™s Sudbury mining ope...,Urban,LOW (some local organising),In operation
15,5599,QC-MAIN-d6748d40,Nouveau Mondeâ€™s Matawinie graphite mine in Q...,01/01/2017,,<p>Five kilometers southwest of the municipali...,The project includes 319 mining concessions (c...,Rural,"MEDIUM (street protests, visible mobilization)",Under construction


### Tailings

In [24]:
lt_main_tailings = match_facilities(main_gdf, tailings_gdf,
                     id_col1='main_id', id_col2='tailing_id',
                     name_col1='facility_name', name_col2='related_mine'
                     )

In [25]:
matches_main_tailings = one_to_many_relationships(
    lt_main_tailings,
    id_main_col='main_id',
    id_sat_col='tailing_id',
    distance_threshold_m=20000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [26]:
tailings_gdf = tailings_gdf.merge(
    matches_main_tailings,
    on="tailing_id",  
    how="left"
)

In [27]:
# For SQL
tailings_gdf = tailings_gdf[tailings_gdf["main_id"].notna()].copy()

In [28]:
col_to_keep = ["row_id",
  "main_id",
  "year",
  "tailing_id",
  "tsf_name",
  "status",
  "construction_year",
  "raise_type",
  "current_maximum_height",
  "current_tailings_storage",
  "planned_storage_5_years",
  "hazard_categorization",
  "classification_system",
  "geometry"
               ]
tailings_gdf = tailings_gdf[col_to_keep]
tailings_gdf

Unnamed: 0,row_id,main_id,year,tailing_id,tsf_name,status,construction_year,raise_type,current_maximum_height,current_tailings_storage,planned_storage_5_years,hazard_categorization,classification_system,geometry
0,TAIL-12.0-2019-1,NU-MAIN-730aefe3,2019,12.0,North Cell,Active,2009.0,"Downstream, Upstream",31.0,14400000,14400000,Medium,Canadian Dam Association,POINT (-96.04333 65.02156)
1,TAIL-13.0-2019-1,NU-MAIN-730aefe3,2019,13.0,South Cell,Active,2012.0,Downstream,49.0,10420000,10800000,Medium,Canadian Dam Association,POINT (-96.03339 65.01378)
2,TAIL-14.0-2019-1,NU-MAIN-730aefe3,2019,14.0,In pit,Active,2009.0,,,850000,12500000,Low,,POINT (-96.03254 65.00190)
3,TAIL-15.0-2019-1,NU-MAIN-8b0264c9,2019,15.0,Meliadine TSF,Active,2019.0,,5.0,89000,4354000,Medium,Canadian Dam Association,POINT (-92.14034 63.02083)
4,TAIL-16.0-2019-1,QC-MAIN-c0660aec,2019,16.0,South TSF,Active,2007.0,Not Raised,5.0,1664000,2500000,Medium,Canadian Dam Association,POINT (-77.84198 48.05434)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,TAIL-1126.0-2019-1,ON-MAIN-f080c409,2019,1126.0,R1 Tailings,Active,1985.0,Compacted\nOuter Shell,14.0,2900000,4000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
216,TAIL-1127.0-2019-1,ON-MAIN-f080c409,2019,1127.0,R2 Tailings,Active,1985.0,Compacted\nOuter Shell,13.0,18000000,18000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
217,TAIL-1128.0-2019-1,ON-MAIN-f080c409,2019,1128.0,R3 Tailings,Active,1985.0,Compacted\nOuter Shell,27.0,39000000,42000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
218,TAIL-1129.0-2019-1,ON-MAIN-f080c409,2019,1129.0,R4 Tailings,Active,1990.0,Compacted\nOuter Shell,22.0,59000000,63000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)


### GHG

In [11]:
lt_main_ghg = match_facilities(main_gdf, ghg_gdf,
                     id_col1='main_id', id_col2='ghg_id',
                     name_col1='facility_name', name_col2='facility_name_ghg'
                     )

In [12]:
matches_main_ghg = one_to_many_relationships(
    lt_main_ghg,
    id_main_col='main_id',
    id_sat_col='ghg_id',
    distance_threshold_m=10000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [13]:
ghg_gdf = ghg_gdf.merge(
    matches_main_ghg,
    on="ghg_id",  
    how="left"
)

In [14]:
# For SQL
ghg_gdf = ghg_gdf[ghg_gdf["main_id"].notna()].copy()

In [15]:
col_to_keep = ["row_id",
  "ghg_id",             
  "main_id",
  "year",
  "sector",
  "value",
  "unit"
               ]
ghg_gdf = ghg_gdf[col_to_keep]
ghg_gdf

Unnamed: 0,row_id,ghg_id,main_id,year,sector,value,unit
1,GHG-10101-2022-1,10101,NT-MAIN-6c1d6ee6,2022,Diamond Mining,164.88,kilotonnes of carbon dioxide equivalents (kt C...
4,GHG-10188-2022-1,10188,QC-MAIN-084bd95c,2022,Iron Ore Mining,293.10,kilotonnes of carbon dioxide equivalents (kt C...
7,GHG-10240-2022-1,10240,QC-MAIN-a97821c0,2022,Nickel-Copper Ore Mining,129.26,kilotonnes of carbon dioxide equivalents (kt C...
8,GHG-10299-2022-1,10299,QC-MAIN-e25eed27,2022,Iron Ore Mining,909.89,kilotonnes of carbon dioxide equivalents (kt C...
9,GHG-10350-2022-1,10350,NL-MAIN-b64bae7a,2022,Iron Ore Mining,110.93,kilotonnes of carbon dioxide equivalents (kt C...
...,...,...,...,...,...,...,...
158,GHG-11042-2022-1,11042,ON-MAIN-b466361f,2022,"Aluminum Rolling, Drawing, Extruding and Alloying",14.92,kilotonnes of carbon dioxide equivalents (kt C...
169,GHG-11723-2022-1,11723,ON-MAIN-2e13aaa0,2022,Non-Ferrous Metal (except Aluminum) Smelting a...,14.17,kilotonnes of carbon dioxide equivalents (kt C...
170,GHG-11730-2022-1,11730,QC-MAIN-92628f16,2022,Primary Production of Alumina and Aluminum,380.82,kilotonnes of carbon dioxide equivalents (kt C...
178,GHG-12298-2022-1,12298,ON-MAIN-63b394c3,2022,Non-Ferrous Metal (except Aluminum) Smelting a...,330.67,kilotonnes of carbon dioxide equivalents (kt C...


### Pollution

In [20]:
pollutant_gdf

Unnamed: 0,row_id,year,npri_id,facility_name_npri,company_name_npri,facility_type,longitude,latitude,terrestrial_ecozone,watershed,substance_name_npri,substance_name_ecoinvent,substance_unit,emission_type,emission_subtype,value,source_df,geometry
0,npri-1568-2023-1,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Ammonia (total),Ammonia,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636,npri_metal_mining_df,POINT (-80.08734 48.13032)
1,npri-1568-2023-2,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Arsenic (and its compounds),Arsenic,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116,npri_metal_mining_df,POINT (-80.08734 48.13032)
2,npri-1568-2023-3,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Carbon monoxide,"Carbon monoxide, fossil",tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208,npri_metal_mining_df,POINT (-80.08734 48.13032)
3,npri-1568-2023-4,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Chromium (and its compounds),Chromium,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105,npri_metal_mining_df,POINT (-80.08734 48.13032)
4,npri-1568-2023-5,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Cobalt (and its compounds),Cobalt,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18,npri_metal_mining_df,POINT (-80.08734 48.13032)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24125,npri-5697-2023-6,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",25.1008,npri_metal_manufacturing_df,POINT (-79.49600 43.80800)
24126,npri-5698-2023-6,2023,5698,Bowes Manufacturing,,manufacturing,-79.49480,43.80360,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",23.418,npri_metal_manufacturing_df,POINT (-79.49480 43.80360)
24127,npri-34697-2023-2,2023,34697,541 Bowes Warehouse,,manufacturing,-79.49707,43.81726,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",16.543746,npri_metal_manufacturing_df,POINT (-79.49707 43.81726)
24128,npri-5697-2023-7,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Volatile Organic Compounds (Total),,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",12.5823,npri_metal_manufacturing_df,POINT (-79.49600 43.80800)


In [21]:
lt_main_pollution = match_facilities(main_gdf, pollutant_gdf,
                     id_col1='main_id', id_col2='npri_id',
                     name_col1='facility_name', name_col2='facility_name_npri'
                     )

In [22]:
matches_main_pollution = one_to_many_relationships(
    lt_main_pollution,
    id_main_col='main_id',
    id_sat_col='npri_id',
    distance_threshold_m=10000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [24]:
pollutant_gdf = pollutant_gdf.merge(
    matches_main_pollution,
    on="npri_id",  
    how="left"
)

In [25]:
# For SQL
pollutant_gdf = pollutant_gdf[pollutant_gdf["main_id"].notna()].copy()

In [26]:
col_to_keep = ["row_id",
  "npri_id",
  "main_id",
  "year",
  "terrestrial_ecozone",
  "watershed",
  "substance_name_npri",
  "substance_name_ecoinvent",
  "substance_unit",
  "emission_type",
  "emission_subtype",
  "value"
 ]
pollutant_gdf = pollutant_gdf[col_to_keep]
pollutant_gdf

Unnamed: 0,row_id,npri_id,main_id,year,terrestrial_ecozone,watershed,substance_name_npri,substance_name_ecoinvent,substance_unit,emission_type,emission_subtype,value
0,npri-1568-2023-1,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Ammonia (total),Ammonia,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636
1,npri-1568-2023-2,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Arsenic (and its compounds),Arsenic,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116
2,npri-1568-2023-3,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Carbon monoxide,"Carbon monoxide, fossil",tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208
3,npri-1568-2023-4,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Chromium (and its compounds),Chromium,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105
4,npri-1568-2023-5,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Cobalt (and its compounds),Cobalt,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18
...,...,...,...,...,...,...,...,...,...,...,...,...
24116,npri-28761-2023-117,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Sulphur dioxide,Sulfur dioxide,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",4.405
24117,npri-28761-2023-118,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Sulphuric acid,Sulfuric acid,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",0.2475
24118,npri-28761-2023-119,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Total particulate matter,No match found,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",29.277299999999997
24119,npri-28761-2023-120,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Total reduced sulphur (expressed as hydrogen s...,No match found,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",0.485


## Point to polygon matching

### Protected lands datasets 

In [156]:
from matching_functions import associate_facilities_near_polygons

In [157]:
wpda_gdf_polygons

Unnamed: 0,WDPA_PID,NAME,DESIG,OWN_TYPE,MANG_AUTH,STATUS_YR,source_id,geometry
0,611_A,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,https://www.protectedplanet.net,"POLYGON ((-112.87253 59.99877, -111.53006 59.7..."
1,611_B,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,https://www.protectedplanet.net,"POLYGON ((-112.87253 59.99877, -115.58450 59.9..."
2,612,Kluane National Park Reserve Of Canada,National Park,State,Parks Canada Agency,1972,https://www.protectedplanet.net,"POLYGON ((-141.00187 60.30634, -137.43466 60.7..."
3,613_B,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,https://www.protectedplanet.net,"MULTIPOLYGON (((-64.62292 67.13706, -63.92605 ..."
4,613_A,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,https://www.protectedplanet.net,"MULTIPOLYGON (((-68.23194 67.76712, -67.56846 ..."
...,...,...,...,...,...,...,...,...
13463,555785733,Grosse Île and the Irish Memorial National His...,Lieu historique national,State,Parks Canada Agency,2023,https://www.protectedplanet.net,"MULTIPOLYGON (((-70.67843 47.01639, -70.67803 ..."
13464,555785734,Navy Island National Historic Site,National Historic Site,State,Parks Canada Agency,2023,https://www.protectedplanet.net,"POLYGON ((-79.01037 43.04900, -79.00871 43.060..."
13465,555785735,Fort George National Historic Site (Paradise G...,National Historic Site,State,Parks Canada Agency,2023,https://www.protectedplanet.net,"POLYGON ((-79.06782 43.24368, -79.05901 43.248..."
13466,555785736,Rideau Canal National Historic Site,National Historic Site,State,Parks Canada Agency,2023,https://www.protectedplanet.net,"MULTIPOLYGON (((-76.24341 44.90190, -76.19385 ..."


In [158]:
lt_wpda = associate_facilities_near_polygons(
    main_gdf,
    wpda_gdf_polygons,
    facility_id_col="main_id",
    polygon_id_col="WDPA_PID",
    buffer_km=50,
    crs="EPSG:3978"
)

In [159]:
wpda_gdf_polygons = wpda_gdf_polygons.merge(lt_wpda, on="WDPA_PID", how="left")

In [160]:
# For SQL
wpda_gdf_polygons = wpda_gdf_polygons[wpda_gdf_polygons["main_id"].notna()].copy()

In [161]:
lt_indigenous = associate_facilities_near_polygons(
    main_gdf,
    indigenous_can_gdf,
    facility_id_col="main_id",
    polygon_id_col="indigenous_land_id",
    buffer_km=50,
    crs="EPSG:3978"
)

In [162]:
indigenous_can_gdf = indigenous_can_gdf.merge(lt_indigenous, on="indigenous_land_id", how="left")

In [163]:
# For SQL
indigenous_can_gdf = indigenous_can_gdf[indigenous_can_gdf["main_id"].notna()].copy()

### Land occupation footprint datasets

In [None]:
from matching_functions import assign_polygons_to_points

In [69]:
assigned_df = assign_polygons(
    main_gdf,
    tailings_gdf,
    tang_canada_gdf,
    max_dist_km=10
)


In [70]:
assigned_df

Unnamed: 0,main_id,tailing_id,tang_id,distance_km,relation_type
0,NS-MAIN-c9c77a59,,45954,0.000000,one-to-many
1,NS-MAIN-c9c77a59,,45955,1.186584,one-to-many
2,NL-MAIN-d9036091,,7631,0.811781,one-to-many
3,NL-MAIN-d9036091,,7641,4.647047,one-to-many
4,NL-MAIN-d9036091,,7633,0.000000,many-to-one
...,...,...,...,...,...
789,NT-MAIN-6c1d6ee6,,26078,2.857250,one-to-many
790,NT-MAIN-6c1d6ee6,,43740,0.833606,one-to-many
791,NT-MAIN-6c1d6ee6,,3479,0.000000,one-to-many
792,NT-MAIN-6c1d6ee6,,50385,2.150861,one-to-many


In [71]:
assigned_df['relation_type'].value_counts()

relation_type
one-to-many    707
many-to-one     56
one-to-one      31
Name: count, dtype: int64

In [72]:
polygon_gdf = tang_canada_gdf.merge(assigned_df, on="tang_id", how="left")

In [78]:
polygon_gdf

Unnamed: 0,tang_id,area_km2,geometry,main_id,tailing_id,distance_km,relation_type
0,6383,0.026130,"POLYGON Z ((1795479.979 -83564.531 0.000, 1795...",,,,
1,6388,0.671399,"POLYGON Z ((1796123.448 -80625.526 0.000, 1796...",,,,
2,6390,0.050909,"POLYGON Z ((1796791.634 -80270.697 0.000, 1796...",,,,
3,45954,2.622800,"POLYGON Z ((2466485.124 185586.547 0.000, 2466...",NS-MAIN-c9c77a59,,0.000000,one-to-many
4,45955,0.441869,"POLYGON Z ((2465809.682 185862.507 0.000, 2465...",NS-MAIN-c9c77a59,,1.186584,one-to-many
...,...,...,...,...,...,...,...
1915,3399,0.541024,"POLYGON Z ((-717210.659 1915423.857 0.000, -71...",,,,
1916,3401,1.053173,"POLYGON Z ((-715648.337 1916385.463 0.000, -71...",,,,
1917,26364,0.079894,"POLYGON Z ((-717332.782 1916999.286 0.000, -71...",,,,
1918,43739,1.519488,"POLYGON Z ((-713032.430 1919630.086 0.000, -71...",,,,


# Create final tables

## Environment table

In [None]:
env_id
year
commodity ## do we need it?
compartment_id
flow_type == ghg 
substance_id
unit
value
comment

### CTrace dataset

In [212]:
ctrace_ghg_gdf

Unnamed: 0,ctrace_id,year,source_name,emissions_quantity,gas,latitude,longitude,source_id,geometry,main_id
6,1754055,2023,Coleman Mine,5610.0,co2e_100yr,46.674583,-81.34308,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-81.34308 46.67458),ON-MAIN-6e9b2449
7,1754056,2023,Copper Cliff Mine,6403.0,co2e_100yr,46.48096,-81.052207,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-81.05221 46.48096),ON-MAIN-63b394c3
8,1754057,2023,Copper Mountain Mine,229916.0,co2e_100yr,49.331253,-120.521629,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-120.52163 49.33125),BC-MAIN-599152a0
9,1754058,2023,Creighton Mine,2639.0,co2e_100yr,46.462958,-81.176038,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-81.17604 46.46296),ON-MAIN-52224e1e
11,1754060,2023,Garson Mine,4225.0,co2e_100yr,46.56758,-80.857509,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-80.85751 46.56758),ON-MAIN-48fe2205
12,1754061,2023,Gibraltar Mine,131638.0,co2e_100yr,52.519137,-122.26926,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-122.26926 52.51914),BC-MAIN-6b4800fe
16,1754066,2023,Highland Valley Mine,2412130.0,co2e_100yr,50.486244,-121.042759,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-121.04276 50.48624),BC-MAIN-bf503b6b
17,1754067,2023,Horne Mine,0.0,co2e_100yr,48.254296,-79.010706,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-79.01071 48.25430),QC-MAIN-30c1828c
25,1754077,2023,LaRonde Mine,67116.0,co2e_100yr,48.249139,-78.441434,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-78.44143 48.24914),QC-MAIN-e51eda66
29,1754081,2023,McCreedy West Mine,7782.0,co2e_100yr,46.637729,-81.398089,"Jolleys, M. et al (2024). Mineral Extraction s...",POINT (-81.39809 46.63773),ON-MAIN-7001a391


In [213]:
ctrace_ghg_gdf.rename(columns={
    'ctrace_id': 'env_id',
    'emissions_quantity': 'value',
    'gas': 'unit'}, inplace=True)

ctrace_ghg_gdf.drop(columns=['source_name', 'latitude', 'longitude', 'geometry'], inplace=True)

In [214]:
ctrace_ghg_gdf

Unnamed: 0,env_id,year,value,unit,source_id,main_id
6,1754055,2023,5610.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",ON-MAIN-6e9b2449
7,1754056,2023,6403.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",ON-MAIN-63b394c3
8,1754057,2023,229916.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",BC-MAIN-599152a0
9,1754058,2023,2639.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",ON-MAIN-52224e1e
11,1754060,2023,4225.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",ON-MAIN-48fe2205
12,1754061,2023,131638.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",BC-MAIN-6b4800fe
16,1754066,2023,2412130.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",BC-MAIN-bf503b6b
17,1754067,2023,0.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",QC-MAIN-30c1828c
25,1754077,2023,67116.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",QC-MAIN-e51eda66
29,1754081,2023,7782.0,co2e_100yr,"Jolleys, M. et al (2024). Mineral Extraction s...",ON-MAIN-7001a391


In [None]:
## We miss commodity, flow_type, compartment id and substance id
#ctrace_ghg_gdf['compartment_id'] = 
#ctrace_ghg_gdf['flow_type'] =
#ctrace_ghg_gdf['substance_id'] =

### NPRI dataset

In [219]:
pollutant_gdf.rename(columns={
    'row_id': 'env_id',
    'substance_unit': 'unit'}, inplace=True)

pollutant_gdf.drop(columns=['npri_id', 'facility_name_npri', 'company_name_npri',
       'emission_type', 'emission_subtype',
       'facility_type', 'longitude', 'latitude', 'terrestrial_ecozone',
       'watershed', 'substance_name_npri', 'substance_name_ecoinvent', 'geometry'], inplace=True)

In [220]:
pollutant_gdf

Unnamed: 0,env_id,year,unit,emission_type,emission_subtype,value,source_id,substance_id
0,npri-1568-2023-1,2023,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636,https://www.canada.ca/en/environment-climate-c...,SUBa2c74fdce8
1,npri-1568-2023-2,2023,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116,https://www.canada.ca/en/environment-climate-c...,SUB526607d180
2,npri-1568-2023-3,2023,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208,https://www.canada.ca/en/environment-climate-c...,SUB92570cece4
3,npri-1568-2023-4,2023,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105,https://www.canada.ca/en/environment-climate-c...,SUB075f575023
4,npri-1568-2023-5,2023,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18,https://www.canada.ca/en/environment-climate-c...,SUB481f3889a8
...,...,...,...,...,...,...,...,...
24125,npri-5697-2023-6,2023,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",25.1008,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889
24126,npri-5698-2023-6,2023,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",23.418,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889
24127,npri-34697-2023-2,2023,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",16.543746,https://www.canada.ca/en/environment-climate-c...,SUB4a14c4c889
24128,npri-5697-2023-7,2023,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",12.5823,https://www.canada.ca/en/environment-climate-c...,SUB0d60783c59


In [216]:
pollutant_gdf.columns

Index(['row_id', 'year', 'npri_id', 'facility_name_npri', 'company_name_npri',
       'facility_type', 'longitude', 'latitude', 'terrestrial_ecozone',
       'watershed', 'substance_name_npri', 'substance_name_ecoinvent',
       'substance_unit', 'emission_type', 'emission_subtype', 'value',
       'source_id', 'substance_id', 'geometry'],
      dtype='object')

### GHG dataset

In [221]:
ghg_gdf

Unnamed: 0,row_id,year,ghg_id,facility_name_ghg,longitude,latitude,city,province,sector,value,unit,facility_url,source_id,geometry
0,GHG-10052-2022-1,2022,10052,Carol Project,-66.90020,52.95200,Labrador City,Newfoundland and Labrador,Iron Ore Mining,826.03,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-66.90020 52.95200)
1,GHG-10101-2022-1,2022,10101,Ekati Diamond Mine,-110.62010,64.71950,Yellowknife,Northwest Territories,Diamond Mining,164.88,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-110.62010 64.71950)
2,GHG-10140-2022-1,2022,10140,HBMS Flin Flon Metallurgical Complex,-101.88400,54.77100,Flin Flon,Manitoba,Copper-Zinc Ore Mining,29.73,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-101.88400 54.77100)
3,GHG-10164-2022-1,2022,10164,Kidd Metallurgical Site,-81.09115,48.57373,Timmins District of Cochrane,Ontario,Copper-Zinc Ore Mining,30.82,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-81.09115 48.57373)
4,GHG-10188-2022-1,2022,10188,Mine de Mont-Wright,-67.33306,52.76639,Fermont,Quebec,Iron Ore Mining,293.10,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-67.33306 52.76639)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,GHG-12301-2022-1,2022,12301,GRIFFIN CANADA - WINNIPEG,-97.00250,49.91300,Winnipeg,Manitoba,Steel Foundries,,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-97.00250 49.91300)
180,GHG-12554-2022-1,2022,12554,Molycop Canada Plant 2,-120.09703,50.65598,Kamloops,British Columbia,Iron Foundries,,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-120.09703 50.65598)
181,GHG-12555-2022-1,2022,12555,Moly-Cop Canada,-120.09649,50.65726,Kamloops,British Columbia,Iron Foundries,25.12,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-120.09649 50.65726)
182,GHG-12579-2022-1,2022,12579,Fonderie Laperle,-73.13870,45.89470,Saint Ours,Quebec,Iron Foundries,0.51,kilotonnes of carbon dioxide equivalents (kt C...,https://climate-change.canada.ca/facility-emis...,https://www.canada.ca/en/environment-climate-c...,POINT (-73.13870 45.89470)


In [223]:
ghg_gdf.rename(columns={'row_id': 'env_id'}, inplace=True)
ghg_gdf.drop(columns=['ghg_id', 'facility_name_ghg', 'longitude',
       'latitude', 'city', 'province', 'sector', 'facility_url', 'geometry'], inplace=True)

In [224]:
ghg_gdf['compartment_id'] = 
ghg_gdf['flow_type'] = 'ghg, scope 1'
ghg_gdf['substance_id'] = # nothing

Unnamed: 0,env_id,year,value,unit,source_id
0,GHG-10052-2022-1,2022,826.03,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
1,GHG-10101-2022-1,2022,164.88,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
2,GHG-10140-2022-1,2022,29.73,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
3,GHG-10164-2022-1,2022,30.82,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
4,GHG-10188-2022-1,2022,293.10,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
...,...,...,...,...,...
179,GHG-12301-2022-1,2022,,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
180,GHG-12554-2022-1,2022,,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
181,GHG-12555-2022-1,2022,25.12,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...
182,GHG-12579-2022-1,2022,0.51,kilotonnes of carbon dioxide equivalents (kt C...,https://www.canada.ca/en/environment-climate-c...


### Manually collected data 

In [225]:
env_df

Unnamed: 0,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,compartment,flow_type,subflow_type,substance_name,unit,value,comment,source
0,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,Sox,t,315,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
1,2023,Site-specific,Agnico Eagle Mines Limited,Canadian Malartic,ON-MAIN-1f126a43,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,Nox,t,4802,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
2,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,Sox,t,486,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
3,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Air,air_pollution,-,Nox,t,7414,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
4,2023,Site-specific,Agnico Eagle Mines Limited,Goldex,NU-MAIN-8b0264c9,-,,mining,"Underground, concentrator","Gold, silver",Air,air_pollution,-,Sox,t,38,"""In absence of site-level monitoring emission ...",2023-Sustainability-Performance_data.xlsx
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,2023,Site-specific,Wesdome Gold Mines Ltd,Eagle River,QC-MAIN-9de9bb0d,-,,mining,"Underground, concentrator",Gold,Water,water,Total water discharged,-,m3,324347,,2023-ESG-Data-Tables.xlsx
747,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal,-,m3,168115,,2023-ESG-Data-Tables.xlsx
748,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Water withdrawal|Freshwater withdrawn,-,m3,168115,,2023-ESG-Data-Tables.xlsx
749,2023,Site-specific,Wesdome Gold Mines Ltd,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Water,water,Total water consumption,-,m3,82144,,2023-ESG-Data-Tables.xlsx


In [226]:
env_df.columns

Index(['year', 'reporting_level', 'reported_company_nrcan', 'name', 'main_id',
       'facility_group_id', 'company_id', 'facility_type',
       'mining_processing_type', 'commodity', 'compartment', 'flow_type',
       'subflow_type', 'substance_name', 'unit', 'value', 'comment', 'source'],
      dtype='object')

In [None]:
env_df.drop(columns=['reporting_level', 'reported_company_nrcan', 'name', 'facility_type',
       'mining_processing_type', 'commodity' ])

## Production table

In [97]:
production_df.rename(columns={'row_id': 'prod_id'}, inplace=True)

In [98]:
production_df.drop(columns=['level', 'reported_company_nrcan', 'name', 
                            'facility_type', 'mining_processing_type', 'commodities_nrcan'], inplace=True)

In [84]:
ctrace_prod_gdf.rename(columns={'ctrace_id': 'prod_id'}, inplace=True)

In [91]:
# Save units before melting
activity_units = ctrace_prod_gdf['activity_units']
capacity_units = ctrace_prod_gdf['capacity_units']

# Melt
ctrace_prod_gdf = pd.melt(
    ctrace_prod_gdf,
    id_vars=['prod_id', 'year', 'source_name', 'latitude', 'longitude', 'source_id', 'geometry', 'main_id'],
    value_vars=['activity', 'capacity'],
    var_name='data_type',
    value_name='value'
)

# Set data_type labels
ctrace_prod_gdf['data_type'] = ctrace_prod_gdf['data_type'].replace({
    'activity': 'Production',
    'capacity': 'Capacity'
})

# Concatenate units in correct order (activity first, then capacity)
ctrace_prod_gdf['unit'] = list(activity_units) + list(capacity_units)

In [93]:
ctrace_prod_gdf.drop(columns=['source_name', 'latitude', 'longitude', 'geometry'], inplace=True)

In [103]:
ctrace_prod_gdf['geography'] = 'Canada'
ctrace_prod_gdf['reference_point'] = 'Ore processed'

In [104]:
# Concatenate production and ctrace data 
production_table = pd.concat([production_df, ctrace_prod_gdf], ignore_index=True)

In [105]:
production_table

Unnamed: 0,prod_id,year,geography,main_id,facility_group_id,company_id,commodity,reference_point,material_type,data_type,unit,value,comment,source,source_id
0,PROD-1f126a43-2023-1,2023,Canada,ON-MAIN-1f126a43,-,,Ore,2,Ore processed,Production,t,1.959493e+07,,2023-Sustainability-Performance_data,
1,PROD-1f126a44-2023-1,2023,Canada,ON-MAIN-1f126a44,-,,Gold,4,Intermediate metal,Production,oz,6.846400e+05,"""Production reflects Agnico Eagle's 50% intere...",2023-Sustainability-Performance_data,
2,PROD-1f126a45-2023-1,2023,Canada,ON-MAIN-1f126a45,-,,Silver,4,Intermediate metal,Production,koz,3.100000e+02,"""Production reflects Agnico Eagle's 50% intere...",2023-Sustainability-Performance_data,
3,PROD-e7e6a960-2023-1,2023,Canada,QC-MAIN-e7e6a960,-,,Gold,2,Ore processed,Production,t,2.543485e+07,,2023-Sustainability-Performance_data,
4,PROD-e7e6a963-2023-1,2023,Canada,QC-MAIN-e7e6a963,-,,Gold,4,Intermediate metal,Production,oz,6.774460e+05,Like intermediate metal,2023-Sustainability-Performance_data,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384,1754104,2023,Canada,ON-MAIN-28f3f0fc,,,,Ore processed,,Capacity,tonnes,2.453000e+06,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
385,1754105,2023,Canada,QC-MAIN-b2148882,,,,Ore processed,,Capacity,tonnes,8.787959e+07,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
386,25486533,2023,Canada,ON-MAIN-f8313ebd,,,,Ore processed,,Capacity,tonnes,2.500000e+06,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
387,25486534,2023,Canada,ON-MAIN-a3c56a83,,,,Ore processed,,Capacity,tonnes,2.200000e+07,,,"Jolleys, M. et al (2024). Mineral Extraction s..."


In [None]:
production_table['value_tonnes'] =

## Protected & Indigenous table

In [164]:
protected_mapping = {
    "WDPA_PID": "protected_area_id",
    "NAME": "land_name",
    "DESIG": "land_type",
    "STATUS_YR": "date"
}

indigenous_mapping = {
    "indigenous_land_id": "protected_area_id",
    "Name": "land_name",
    "Category": "land_type",
    "Data_Date": "date"
}

In [165]:
protected_clean = wpda_gdf_polygons.rename(columns=protected_mapping)[["protected_area_id", "land_name", "land_type", "distance_km", "geometry", "main_id"]].copy()

In [166]:
indigenous_clean = indigenous_can_gdf.rename(columns=indigenous_mapping)[["protected_area_id", "land_name", "land_type", "distance_km", "geometry", "main_id"]].copy()

In [167]:
target_crs = "EPSG:4326"
protected_clean = protected_clean.to_crs(target_crs)
indigenous_clean = indigenous_clean.to_crs(target_crs)

In [168]:
protected_land_table = pd.concat([protected_clean, indigenous_clean], ignore_index=True)
protected_land_table = gpd.GeoDataFrame(protected_land_table, geometry="geometry", crs=wpda_gdf_polygons.crs)

In [169]:
protected_land_table

Unnamed: 0,protected_area_id,land_name,land_type,distance_km,geometry,main_id
0,611_B,Wood Buffalo National Park Of Canada,National Park,27.397,"POLYGON ((-112.87253 59.99877, -115.58450 59.9...",NT-MAIN-b99d4cb2
1,615,Banff National Park Of Canada,National Park,2.153,"POLYGON ((-117.30592 52.07417, -115.17460 51.2...",AB-MAIN-8434dce0
2,616,Nahanni National Park Reserve Of Canada,National Park,0.000,"MULTIPOLYGON (((-124.02111 60.89056, -128.7980...",NT-MAIN-b3681e03
3,616,Nahanni National Park Reserve Of Canada,National Park,31.820,"MULTIPOLYGON (((-124.02111 60.89056, -128.7980...",YT-MAIN-03a159d7
4,620,Pukaskwa National Park Of Canada,National Park,19.001,"MULTIPOLYGON (((-86.25133 48.57798, -85.62889 ...",ON-MAIN-6e9be24e
...,...,...,...,...,...,...
8790,ZZ-IND-40a2ea30,Six Nations Indian Reserve No. 40,Indian Reserve,25.954,"MULTIPOLYGON Z (((-79.99308 43.08630 0.00003, ...",ON-MAIN-020a683c
8791,ZZ-IND-40a2ea30,Six Nations Indian Reserve No. 40,Indian Reserve,20.924,"MULTIPOLYGON Z (((-79.99308 43.08630 0.00003, ...",ON-MAIN-b2e5c9bb
8792,ZZ-IND-4e299f50,Mashteuiatsh,Indian Reserve,39.343,"MULTIPOLYGON Z (((-72.27116 48.58552 0.00003, ...",QC-MAIN-3aa117d8
8793,ZZ-IND-74ac0e34,Nisga'a,First Nations Treaty Lands,16.458,"MULTIPOLYGON Z (((-129.07569 55.20853 0.00003,...",BC-MAIN-3ef4f421


## Land cover table

In [207]:
land_cover_table = land_cover_gdf.copy()
land_cover_table['npv_biome_type'] = npv_gdf['biome_type']
land_cover_table['source_id'] = land_cover_table['source_id'].astype(str) + ';' + npv_gdf['source_id'].astype(str)

In [208]:
land_cover_table = assign_row_id(land_cover_table,
                                 facility_id_col='main_id', 
                                 prefix='land_cover_id')

In [209]:
land_cover_table.drop(columns=['name', 'geometry'], inplace=True)
land_cover_table.rename(columns={'row_id': 'land_cover_id'}, inplace=True)
land_cover_table = land_cover_table[['land_cover_id', 'main_id', 'year', 'modis_land_cover',
       'esa_land_cover', 'npv_biome_type', 'source_id']]

In [210]:
land_cover_table

Unnamed: 0,land_cover_id,main_id,year,modis_land_cover,esa_land_cover,npv_biome_type,source_id
0,land_cover_id-5801b453-1,QC-MAIN-5801b453,2021,Woody Savannas,Built-up,,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
1,land_cover_id-d29e0839-1,QC-MAIN-d29e0839,2021,Woody Savannas,Bare/Sparse Vegetation,Cool Mixed Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
2,land_cover_id-f1ff4920-1,QC-MAIN-f1ff4920,2021,Woody Savannas,Tree Cover,Cool Mixed Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
3,land_cover_id-4ea8ac9d-1,NU-MAIN-4ea8ac9d,2021,Grasslands,Moss & Lichen,,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
4,land_cover_id-e7e6a960-1,QC-MAIN-e7e6a960,2021,Grasslands,Bare/Sparse Vegetation,Cool Mixed Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
...,...,...,...,...,...,...,...
265,land_cover_id-c5fefb01-1,ON-MAIN-c5fefb01,2021,Woody Savannas,Permanent Water Bodies,Cool Mixed Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
266,land_cover_id-feb13c30-1,BC-MAIN-feb13c30,2021,Woody Savannas,Tree Cover,Cool Evergreen Needleleaf Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
267,land_cover_id-5e0d8b48-1,YT-MAIN-5e0d8b48,2021,Open Shrublands,Grassland,Cold Evergreen Needleleaf Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...
268,land_cover_id-12c68d49-1,ON-MAIN-12c68d49,2021,Woody Savannas,Grassland,Cool Mixed Forest,https://doi.org/10.5067/MODIS/MCD12Q1.061 + ht...


# Exports 

## Simple format

In [31]:
from matching_functions import export_sqlite_db

In [32]:
export_sqlite_db(
    "data/Tables/metallican.sqlite", 
    tables_dict={
    "main": main_gdf,
    "tailings": tailings_gdf,
    "mincan": min_can_gdf,
    "conflict": conflict_gdf,   
    "ghg": ghg_gdf,
    "pollution": pollutant_gdf,
    "climate_categories": climate_category_gdf,
    "peatland": peatland_gdf,
    "weather": weather_gdf,
    "population": population_gdf,
    "natural_potential_vegetation": npv_gdf,
    "land_cover": land_cover_gdf,
    "water_risk": water_risk_gdf,
    "land_occupation": tang_canada_gdf,
    "indigenous_land": indigenous_can_gdf,
    "protected_land": wpda_gdf_polygons,
    "lt_protected_land": matching_main_wpda,
    "lt_indigenous_land": matching_main_indigenous,
    "lt_clusters": lt_land_occupation
},
    keep_geometry_tables=['main', 'tailings'], 
    csv_dir='data/Tables/csv'
)

✅ Exported 'main' → SQLite + CSV
✅ Exported 'tailings' → SQLite + CSV


  df_export["geometry"] = df_export.geometry.to_wkt()
  df_export["geometry"] = df_export.geometry.to_wkt()


✅ Exported 'mincan' → SQLite + CSV
✅ Exported 'conflict' → SQLite + CSV
✅ Exported 'ghg' → SQLite + CSV
✅ Exported 'pollution' → SQLite + CSV
✅ Exported 'climate_categories' → SQLite + CSV
✅ Exported 'peatland' → SQLite + CSV
✅ Exported 'weather' → SQLite + CSV
✅ Exported 'population' → SQLite + CSV
✅ Exported 'natural_potential_vegetation' → SQLite + CSV
✅ Exported 'land_cover' → SQLite + CSV
✅ Exported 'water_risk' → SQLite + CSV
✅ Exported 'land_occupation' → SQLite + CSV
✅ Exported 'indigenous_land' → SQLite + CSV
✅ Exported 'protected_land' → SQLite + CSV
✅ Exported 'lt_protected_land' → SQLite + CSV
✅ Exported 'lt_indigenous_land' → SQLite + CSV
✅ Exported 'lt_clusters' → SQLite + CSV
✅ All exports completed to SQLite and CSVs in: data/Tables/csv


## More advanced setting

In [None]:
from matching_functions import create_and_populate_database

In [37]:
tables_dict = {
    "main": main_gdf,
    "tailings": tailings_gdf,
    "mincan": min_can_gdf,
    "conflict": conflict_gdf,
    "ghg": ghg_gdf,
    "pollution": pollutant_gdf,
    "climate_categories": climate_category_gdf,
    "weather": weather_gdf,
    "peatland": peatland_gdf,
    "population": population_gdf,
    "water_risk": water_risk_gdf,
    "land_cover": land_cover_gdf,
    "natural_potential_vegetation": npv_gdf,
    "indigenous_land": indigenous_can_gdf,
    "protected_land": wpda_gdf_polygons,
    "land_occupation": tang_canada_gdf
}

# Run the function
create_and_populate_database(
    db_path="data/Tables/metallican.sqlite",
    schema_path="schema.sql",
    tables_dict=tables_dict,
    keep_geometry_tables=["main", "tailings"]
)


ℹ️ No old database found at 'data/Tables/metallican.sqlite'
✅ Empty database structure created at 'data/Tables/metallican.sqlite'
🔄 Converting geometries...


  df["geometry"] = df["geometry"].to_wkt()


✅ Geometries handled (kept only where needed)
✅ New connection opened
✅ Inserted 305 rows into 'main'
✅ Inserted 227 rows into 'tailings'
✅ Inserted 939 rows into 'mincan'
✅ Inserted 20 rows into 'conflict'
✅ Inserted 184 rows into 'ghg'
✅ Inserted 24130 rows into 'pollution'
✅ Inserted 2745 rows into 'climate_categories'
✅ Inserted 39040 rows into 'weather'
✅ Inserted 305 rows into 'peatland'
✅ Inserted 1220 rows into 'population'
✅ Inserted 9760 rows into 'water_risk'
✅ Inserted 305 rows into 'land_cover'
✅ Inserted 305 rows into 'natural_potential_vegetation'
✅ Inserted 3258 rows into 'indigenous_land'
✅ Inserted 13468 rows into 'protected_land'
✅ Inserted 1885 rows into 'land_occupation'
✅ No foreign key problems found!
✅ Connection closed properly
