In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
import uuid
from db_creation_function import 

# Import and (first) clean sources

## Import 

In [2]:
# NRCan datasets
nrcan_mining_df = pd.read_excel(
    r'data/NRCan/Principal Mineral Areas, Producing Mines, and Oil and Gas Fields (900A)/producing_mines.xlsx')
nrcan_manufacturing_df = pd.read_excel(
    r'data/NRCan/Principal Mineral Areas, Producing Mines, and Oil and Gas Fields (900A)/metal_work.xlsx')

In [3]:
# MinCan datasets
min_can_df = pd.read_excel(
    r'data/Sources/MinCan _Past and Present Productive Mines of Canada, 1950-2022_March2024.xlsx', sheet_name='Data')

In [148]:
# NPRI datasets
npri_df = pd.read_excel(r'data/Emissions_accounts/Raw_data/NPRI/NPRI-INRP_DataDonnées_2023.xlsx', sheet_name='INRP-NPRI 2023', skiprows=3)

In [5]:
# GHG from large facilities datasets
ghg_facility_df = pd.read_csv(
    r'data/Emissions_accounts/Raw_data/GHG_large_facilities/Greenhouse gas emissions from large facilities.csv',
    encoding='ISO-8859-1')

## Clean

### MinCan

In [6]:
def is_mine_active(row):
    '''
    Function to define if the mine is currently active, 
    based on the Changes in Status of Production
    '''

    current_year = 2022

    # Convert year values to integers, ignoring non-numeric values
    def to_int(value):
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    # Convert all year columns to integers
    open1 = to_int(row['open1'])
    close1 = to_int(row['close1'])
    open2 = to_int(row['open2'])
    close2 = to_int(row['close2'])
    open3 = to_int(row['open3'])
    close3 = to_int(row['close3'])

    # Check if any of the 'close' columns have the value 'open'
    if row['close1'] == 'open' or row['close2'] == 'open' or row['close3'] == 'open':
        return 'Active'

    # Find the latest year among open and close columns
    years = [open1, close1, open2, close2, open3, close3]
    years = [year for year in years if year is not None]

    if not years:
        return 'Unknown'

    latest_year = max(years)

    # If the latest year is a 'close' year, the mine is inactive
    if latest_year in [close1, close2, close3]:
        return 'Inactive'

    # If the latest year is an 'open' year and it's the current year or later, consider it active
    if latest_year in [open1, open2, open3] and latest_year >= current_year:
        return 'Active'

    # For all other cases, consider it inactive
    return 'Inactive'


# Apply the function to create a new column 'mine_status'
min_can_df['mine_status'] = min_can_df.apply(is_mine_active, axis=1)

In [7]:
min_can_df

Unnamed: 0,company1,company2,company3,company4,company5,company6,namemine,town,province,latitude,...,commodity7,commodity8,information,source1,source2,source3,link1,link2,link3,mine_status
0,Quebec Copper Corp. Ltd,,,,,,,,Quebec,45.263786,...,,,,0,1,0,,,,Inactive
1,Noland Mines Ltd.,,,,,,Spruce Creek,Atlin,British Columbia,59.382720,...,,,,0,1,1,https://www.mindat.org/feature-6088912.html,https://www.mindat.org/loc-257276.html,https://mrdata.usgs.gov/ardf/show-ardf.php?ard...,Inactive
2,Falconbridge Nickel Mines Ltd.,,,,,,Wesfrob (Tasu),Moresby Island,British Columbia,52.758340,...,,,,0,1,1,https://www.mindat.org/loc-26169.html,,,Inactive
3,Greenwood Coal Co. Ltd.,,,,,,,,Nova Scotia,45.550219,...,,,The currently entered coordinates are for the ...,0,1,1,https://www.mindat.org/loc-290980.html,,,Inactive
4,Noranda Mines Ltd,Normetal Mining Corporation Ltd.,Falconbridge Ltd.,Xstrata,,,Horne,Noranda,Quebec,48.254270,...,,,Alternative names: Horne Mine (1975) The mine ...,0,1,1,https://www.mindat.org/loc-609.html,,,Inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,"Imperial Metals Corporation, Mitsubishi Materi...",Teck Resources Limited,Huckleberry Mines Ltd.,,,,Huckleberry,Kemano,British Columbia,53.681110,...,,,,1,0,0,,,,Inactive
943,BHP Diamonds Inc.,Dia Met Minerals Ltd.,Charles Fipke and Stewart Blusson,Arctic Canadian Diamond Company,,,Panda (Ekati Project),Lac de Gras,Northwest Territories,64.716667,...,,,Brief closure between 2020 and 2021,1,0,0,,,,Active
944,Magnola Magnesium Inc.,Society generale de financement du Quebec,,,,,Ardorbec,Danville,Quebec,45.748462,...,,,,1,0,0,,,,Active
945,Canadian Johns-Manville Co. Ltd.,Asbestos Inc.,,,,,Jeffrey,Asbestos (Val-des-Sources),Quebec,45.775100,...,,,"FR: En 1992, la mine est l'objet d'une importa...",0,0,1,https://miningwatch.ca/sites/default/files/asb...,https://niche-canada.org/2015/11/19/workers-as...,https://www.researchgate.net/publication/28664...,Inactive


### GHG facility

In [8]:
# We need to sort it by NAICS list, since they are not properly listed by "hierarchy", e.g. no parent nor code
# So we define classification lists based on NAICS subsectors, e.g. 5-6 digits code
metal_ore_mining_naics = [
    "Metal ore mining",
    "Iron ore mining",
    "Gold and silver ore mining",
    "Copper, nickel, lead and zinc ore mining",
    "Lead-zinc ore mining",
    "Nickel-copper ore mining",
    "Copper-zinc ore mining",
    "Other metal ore mining",
    "Uranium ore mining",
    "All other metal ore mining",
    "Non-metallic mineral mining and quarrying",
    "Stone mining and quarrying",
    "Granite mining and quarrying",
    "Limestone mining and quarrying",
    "Marble mining and quarrying",
    "Sandstone mining and quarrying",
    "Sand, gravel, clay, and ceramic and refractory minerals mining and quarrying",
    "Sand and gravel mining and quarrying",
    "Shale, clay and refractory mineral mining and quarrying",
    "Other non-metallic mineral mining and quarrying",
    "Diamond mining",
    "Salt mining",
    "Asbestos mining",
    "Gypsum mining",
    "Potash mining",
    "Peat extraction",
    "All other non-metallic mineral mining and quarrying",
    "Support activities for mining, and oil and gas extraction",
    "Oil and gas contract drilling",
    "Contract drilling (except oil and gas)",
    "Services to oil and gas extraction",
    "Other support activities for mining"
]

metal_manufacturing_naics = [
    "Primary metal manufacturing",
    "Iron and steel mills and ferro-alloy manufacturing",
    "Steel product manufacturing from purchased steel",
    "Iron and steel pipes and tubes manufacturing from purchased steel",
    "Rolling and drawing of purchased steel",
    "Cold-rolled steel shape manufacturing",
    "Steel wire drawing",
    "Alumina and aluminum production and processing",
    "Primary production of alumina and aluminum",
    "Aluminum rolling, drawing, extruding and alloying",
    "Non-ferrous metal (except aluminum) production and processing",
    "Non-ferrous metal (except aluminum) smelting and refining",
    "Copper rolling, drawing, extruding and alloying",
    "Non-ferrous metal (except copper and aluminum) rolling, drawing, extruding and alloying",
    "Foundries",
    "Ferrous metal foundries",
    "Iron foundries",
    "Steel foundries",
    "Non-ferrous metal foundries",
    "Non-ferrous metal die-casting foundries",
    "Non-ferrous metal foundries (except die-casting)"
]

In [9]:
def filter_ghg_facility_naics(df, classifications):
    # Normalize the classifications to lowercase for case-insensitive comparison
    classifications_lower = [cls.lower() for cls in classifications]
    
    df_copy = df.copy()
    df_copy['NAICS_Lower'] = df_copy['Industry classification'].str.lower()
    filtered_df = df_copy[df_copy['NAICS_Lower'].isin(classifications_lower)]
    filtered_df = filtered_df.drop(columns=['NAICS_Lower'])
    filtered_df = filtered_df.reset_index(drop=True)
    
    return filtered_df

In [10]:
ghg_facility_metal_mining_df = filter_ghg_facility_naics(ghg_facility_df, metal_ore_mining_naics)
ghg_facility_metal_manufacturing_df = filter_ghg_facility_naics(ghg_facility_df, metal_manufacturing_naics)

### NPRI

In [149]:
# Create mapping NPRI-EI through IW+ mapping
# Import existing concordances 
mapping_npri_iw = pd.read_excel(r'data/Concordances/openIO_IW_EI_concordance.xlsx', sheet_name='NPRI_to_IW21')
mapping_ei_iw = pd.read_excel(r'data/Concordances/openIO_IW_EI_concordance.xlsx', sheet_name='EI_to_IW+')

In [150]:
# Standardize column names for merging
mapping_npri_iw.rename(columns={'IMPACT World+ flows': 'iw_name'}, inplace=True)
mapping_ei_iw.rename(columns={'iw name': 'iw_name'}, inplace=True)

In [151]:
# Add the 'EI' column to df_npri_iw based on matching iw_name with df_ei_iw
mapping_npri_iw['EI'] = mapping_npri_iw['iw_name'].map(
    lambda iw: mapping_ei_iw[mapping_ei_iw['iw_name'] == iw]['ecoinvent name'].iloc[0]
    if iw in mapping_ei_iw['iw_name'].values else 'No match found'
)


In [152]:
mapping_npri_iw

Unnamed: 0,OpenIO flows,iw_name,EI
0,Carbon dioxide,"Carbon dioxide, fossil","Carbon dioxide, fossil"
1,Methane,"Methane, fossil",Methane
2,Dinitrogen monoxide,Dinitrogen monoxide,Dinitrogen monoxide
3,CF4,"Methane, tetrafluoro-, CFC-14","Methane, tetrafluoro-, R-14"
4,C2F6,"Ethane, hexafluoro-, HFC-116","Ethane, hexafluoro-, HFC-116"
...,...,...,...
348,Propylene oxide,Propylene oxide,Propylene oxide
349,Thiourea,Thiourea,No match found
350,2-Ethoxyethyl acetate,2-Ethoxyethyl acetate,No match found
351,Azo disperse dyes,,No match found


In [153]:
#mapping_npri_iw.to_excel(r'data/Concordances/NPRI_EI_concordance.xlsx', index=False)

In [154]:
# We need to rename it to merge it to NPRI data
mapping_npri_iw.rename(columns={'OpenIO flows': 'Substance Name (English)'}, inplace=True)

In [155]:
# We add the Ecoinvent name based on the mapping 
npri_df = npri_df.merge(mapping_npri_iw[['Substance Name (English)', 'EI']],
              on='Substance Name (English)',
              how='left')

In [156]:
npri_df

Unnamed: 0,Year,NPRI ID,Company Name,Facility Name,City,CSD,CA or CMA,Economic Region,Province,Postal Code,...,Metal Recovery,Inorganic Compound Recovery,Acid or Base Recovery,Catalyst Recovery,Pollution Abatement Residue Recovery,Used Oil Recovery,Other.2,Total.5,"Total Releases, Disposals and Transfers for Recycling",EI
0,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000020,Acrolein
1,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.004000,Ammonia
2,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.097000,"Carbon monoxide, fossil"
3,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000090,Formaldehyde
4,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000000,Manganese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63869,2023,34009,ZF Group Oakville,ZF Group Oakville,Oakville,Oakville,Toronto,Toronto,ON,,...,,,,,,,,,12.735000,
63870,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,No match found
63871,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,"Particulates, < 2.5 um"
63872,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,No match found


In [157]:
def clean_npri(df, category_mapping):
    """
    Renames columns in a DataFrame based on a category mapping.
    Each column name is prefixed with the category.

    Args:
        df (pd.DataFrame): The DataFrame whose columns need renaming.
        category_mapping (dict): A dictionary mapping categories to their subcategories.

    Returns:
        pd.DataFrame: The DataFrame with renamed columns.
    """
    # Create a mapping of old column names to new column names
    column_renaming = {}
    
    for category, subcategories in category_mapping.items():
        for subcategory in subcategories:
            if subcategory in df.columns:
                # Create a new column name with the format 'category_subcategory'
                new_column_name = f"{category.lower().replace(' ', '_')}_{subcategory.lower().replace(' ', '_')}"
                column_renaming[subcategory] = new_column_name

    # Rename columns in the DataFrame
    df = df.rename(columns=column_renaming)
    return df

In [158]:

# Your provided category mapping
category_mapping = {
    'Air Emissions': ['Stack Emissions', 'Storage / Handling', 'Fugitive Emissions', 'Spills', 'Other', 'Total'],
    'Water Releases': ['Direct Discharge', 'Spills.1', 'Leaks', 'Total.1', 'Receiving Waterbody (English)', 'Receiving Waterbody (French)'],
    'Land Releases': ['Spills.2', 'Leaks.1', 'Other.1', 'Total.2'],
    'Total Releases': ['Total Releases (Excluding Road Dust)', 'Road Dust Emissions', 'Total Releases Including Road Dust'],
    'On-Site Disposal': ['Land Fill.1', 'Land Treatment.1', 'Underground Injection.1', 'Tailings.1', 'Waste Rock.1', 'Total On-Site'],
    'Off-Site Disposal': ['Land Fill', 'Land Treatment', 'Underground Injection', 'Storage', 'Tailings', 'Waste Rock', 'Total Off-Site'],
    'Transfers for Treatment': ['Physical Treatment', 'Chemical Treatment', 'Biological Treatment', 'Incineration', 'Transfer to a Municipal Sewage Treatment Plant', 'Total.3'],
    'Total On/Off Treatment': ['Total.4'],
    'Transfers for Recycling': [
        'Energy Recovery', 'Solvent Recovery', 'Organic Compound Recovery', 'Metal Recovery',
        'Inorganic Compound Recovery', 'Acid or Base Recovery', 'Catalyst Recovery',
        'Pollution Abatement Residue Recovery', 'Used Oil Recovery', 'Other.2', 'Total.5'
    ],
    'Grand Total': ['Total Releases, Disposals and Transfers for Recycling']
}

# Rename columns
npri_df = clean_npri(npri_df, category_mapping)
npri_df

Unnamed: 0,Year,NPRI ID,Company Name,Facility Name,City,CSD,CA or CMA,Economic Region,Province,Postal Code,...,transfers_for_recycling_metal_recovery,transfers_for_recycling_inorganic_compound_recovery,transfers_for_recycling_acid_or_base_recovery,transfers_for_recycling_catalyst_recovery,transfers_for_recycling_pollution_abatement_residue_recovery,transfers_for_recycling_used_oil_recovery,transfers_for_recycling_other.2,transfers_for_recycling_total.5,"grand_total_total_releases,_disposals_and_transfers_for_recycling",EI
0,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000020,Acrolein
1,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.004000,Ammonia
2,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.097000,"Carbon monoxide, fossil"
3,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000090,Formaldehyde
4,2023,30809,•\tEmboutissage Takumi Canada Inc./Takumi Stam...,Takumi Stamping St. Thomas,St. Thomas,St. Thomas,London,London,ON,N5P 0B6,...,,,,,,,,,0.000000,Manganese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63869,2023,34009,ZF Group Oakville,ZF Group Oakville,Oakville,Oakville,Toronto,Toronto,ON,,...,,,,,,,,,12.735000,
63870,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,No match found
63871,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,"Particulates, < 2.5 um"
63872,2023,7095,Zochem ULC,Zochem,Brampton,Brampton,Toronto,Toronto,ON,L6T 3T4,...,,,,,,,,,0.356000,No match found


In [205]:
npri_metal_mining_df = npri_df[npri_df["NAICS 4 Sector Name (English)"] == "Metal ore mining"]
npri_metal_manufacturing_df = npri_df[npri_df["NAICS 4 Code"].astype(str).str[:3] == "331"]

# 1 - Create facility table 

In [11]:
# We first create an empty df with the columns we want to have in the facility_df
facility_columns = {
    "facility_id": "text",
    "facility_name": "text",
    "sub_site_name": "text",
    "latitude": "float",
    "longitude": "float",
    "geometry": "geometry",
    "city": "text",
    "province": "text",
    "facility_type": "text",
    "mining_processing_type": "text",
    "primary_commodity": "text",
    "commodities": "text",
    "source": "text"  
}

facility_df = pd.DataFrame(columns=facility_columns.keys())
facility_df

Unnamed: 0,facility_id,facility_name,sub_site_name,latitude,longitude,geometry,city,province,facility_type,mining_processing_type,primary_commodity,commodities,source


In [12]:
# Based on concordances tables
column_mapping_facility = {
    "nrcan_mining_df": {
        "OPERATIO_E": "facility_name",
        "LATITUDE": "latitude",
        "LONGITUDE": "longitude",
        "CITY_E": "city",
        "PROVINCE_E": "province",
        "FAC_DESC_E": "mining_processing_type",
        #"COM_DESC_E": "primary_commodity",
        "COM_DESC_E": "commodities"
        
    },
    "nrcan_manufacturing_df": {
        "OPERATIO_E": "facility_name",
        "LATITUDE": "latitude",
        "LONGITUDE": "longitude",
        #"xx": "city", # Not provided 
        "PROVINCE_E": "province",
        "FAC_TYPE_E": "mining_processing_type",
        #"COM_GROU_E": "primary_commodity",
        "COM_DESC_E": "commodities"
    }, 
    }

In [14]:
def populate_facility_df(column_mapping, facility_df, dynamic_columns=None, source_dfs=None):
    """
    Populate a facility DataFrame based on a column mapping and optional dynamic column values.

    Parameters:
        column_mapping (dict): A dictionary where keys are DataFrame names (as strings) and values are mappings
                               of source column names to target column names.
        facility_df (pd.DataFrame): The target facility DataFrame to populate.
        dynamic_columns (dict, optional): A dictionary where keys are target column names and values are mappings
                                          of DataFrame names to specific values (e.g., facility type).
        source_dfs (dict): A dictionary where keys are DataFrame names (as strings) and values are the actual DataFrames.

    Returns:
        pd.DataFrame: The populated facility DataFrame.
    """
    # Debug: Ensure facility_df starts empty or with expected rows
    print(f"Initial facility_df rows: {len(facility_df)}")

    for source_name, mapping in column_mapping.items():
        print(f"Processing DataFrame: {source_name}")

        df = source_dfs.get(source_name)
        if df is None:
            print(f"Warning: DataFrame '{source_name}' not found.")
            continue

        # Create a temporary DataFrame for the current source
        temp_df = pd.DataFrame()

        for src_col, target_col in mapping.items():
            if target_col in facility_df.columns and src_col in df.columns:
                # Map the source column to the target column
                temp_df[target_col] = df[src_col]

        # Add dynamic columns if provided
        if dynamic_columns:
            for dynamic_col, source_values in dynamic_columns.items():
                if dynamic_col in facility_df.columns and source_name in source_values:
                    temp_df[dynamic_col] = source_values[source_name]

        # Add a 'source' column for provenance tracking
        temp_df["source"] = source_name

        # Ensure temp_df aligns with facility_df
        missing_columns = set(facility_df.columns) - set(temp_df.columns)
        for col in missing_columns:
            temp_df[col] = pd.NA

        # Debug: Print temp_df shape before appending
        print(f"Temp DF rows to append: {len(temp_df)}")

        # Append temp_df to facility_df
        facility_df = pd.concat([facility_df, temp_df], ignore_index=True)

        # Debug: Print facility_df shape after appending
        print(f"Rows in facility_df after appending {source_name}: {len(facility_df)}")

    # Final debug: Ensure the final facility_df shape is correct
    print(f"Final facility_df rows: {len(facility_df)}")
    return facility_df


In [15]:
facility_df = populate_facility_df(column_mapping_facility, 
                                   facility_df, 
                                   dynamic_columns = {"facility_type": 
                                                          {"nrcan_mining_df": "mining",
                                                           "nrcan_manufacturing_df": "manufacturing"}
                                                      }, 
                                   source_dfs = {"nrcan_mining_df": nrcan_mining_df,
                                                  "nrcan_manufacturing_df": nrcan_manufacturing_df}
                                   )

Initial facility_df rows: 0
Processing DataFrame: nrcan_mining_df
Temp DF rows to append: 199
Rows in facility_df after appending nrcan_mining_df: 199
Processing DataFrame: nrcan_manufacturing_df
Temp DF rows to append: 73
Rows in facility_df after appending nrcan_manufacturing_df: 272
Final facility_df rows: 272


  facility_df = pd.concat([facility_df, temp_df], ignore_index=True)


In [16]:
# Compute primary_commodity from commodities
if "commodities" in facility_df.columns:
    facility_df["primary_commodity"] = facility_df["commodities"].apply(
            lambda x: x.split(",")[0].strip() if isinstance(x, str) and x else None
        )

In [18]:
facility_df

Unnamed: 0,facility_id,facility_name,sub_site_name,latitude,longitude,geometry,city,province,facility_type,mining_processing_type,primary_commodity,commodities,source
0,,Sleeping Giant,,49.132590,-77.974340,,north of Amos,Quebec,mining,Concentrator,Gold,"Gold, silver",nrcan_mining_df
1,,Amaruq,,65.415000,-96.697000,,Baker Lake,Nunavut,mining,"Open-pit, underground",Gold,Gold,nrcan_mining_df
2,,Canadian Malartic,,48.122223,-78.130824,,Malartic,Quebec,mining,"Open-pit, concentrator",Gold,"Gold, silver",nrcan_mining_df
3,,Detour Lake,,50.018143,-79.717631,,Northeast of Cochrane,Ontario,mining,"Open-pit, concentrator",Gold,Gold,nrcan_mining_df
4,,Goldex,,48.092600,-77.873300,,Val-d’Or,Quebec,mining,"Underground, concentrator",Gold,"Gold, silver",nrcan_mining_df
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,,Sault Ste. Marie Division,,46.524326,-84.395602,,,Ontario,manufacturing,,,,nrcan_manufacturing_df
268,,Welland Mill,,42.998265,-79.235065,,,Ontario,manufacturing,Electric arc furnace,Steel,Steel,nrcan_manufacturing_df
269,,Copper Cliff Complex,,46.480100,-81.056900,,,Ontario,manufacturing,"Smelter, refinery, plant",Nickel (oxide sinter,"Nickel (oxide sinter, pellets, powder, sulfide...",nrcan_manufacturing_df
270,,Port Colborne,,42.883000,-79.240000,,,Ontario,manufacturing,Refinery,Electrolytic cobalt,"Electrolytic cobalt, platinum group metals (in...",nrcan_manufacturing_df


In [19]:
def assign_ids(facility_df, id_column="facility_id"):
    """
    Assign deterministic facility IDs that include province codes and facility type prefixes.

    Parameters:
        facility_df (pd.DataFrame): The DataFrame to which the IDs will be assigned.
        id_column (str): The name of the column for the unique IDs.

    Returns:
        pd.DataFrame: The DataFrame with assigned facility IDs.
    """
    if id_column not in facility_df.columns:
        facility_df[id_column] = None  # Create the ID column if it doesn't exist

    # Dictionary mapping provinces to their codes
    province_codes = {
        "Ontario": "ON",
        "Quebec": "QC",
        "British Columbia": "BC",
        "Alberta": "AB",
        "Manitoba": "MB",
        "Saskatchewan": "SK",
        "Newfoundland and Labrador": "NL",
        "New Brunswick": "NB",
        "Nova Scotia": "NS",
        "Prince Edward Island": "PE",
        "Northwest Territories": "NT",
        "Yukon": "YT",
        "Nunavut": "NU"
    }

    def generate_id(row):
        # Get the province code
        province = row.get("province", "Unknown")
        province_code = province_codes.get(province, "ZZ")  # Use 'ZZ' for unknown provinces

        # Determine prefix based on facility type
        facility_type = row.get("facility_type", "UNKNOWN").upper()
        prefix = {
            "MINING": "MIN",
            "MANUFACTURING": "MAN",
            "PROCESSING": "PRO"
        }.get(facility_type, "OTH")  # Default prefix is 'OTH' for other types

        # Create a unique hash from facility_name, latitude, longitude, and type
        unique_hash = hash((facility_type, row["facility_name"], row["latitude"], row["longitude"])) & 0xFFFFFFFF

        # Combine province code, prefix, and hash to form the ID
        return f"{province_code}-{prefix}-{unique_hash:08d}"  # Ensures a fixed 8-digit hash

    facility_df[id_column] = facility_df.apply(generate_id, axis=1)

    return facility_df

In [20]:
facility_df = assign_ids(facility_df)

In [21]:
facility_df

Unnamed: 0,facility_id,facility_name,sub_site_name,latitude,longitude,geometry,city,province,facility_type,mining_processing_type,primary_commodity,commodities,source
0,QC-MIN-4214229325,Sleeping Giant,,49.132590,-77.974340,,north of Amos,Quebec,mining,Concentrator,Gold,"Gold, silver",nrcan_mining_df
1,NU-MIN-1605859119,Amaruq,,65.415000,-96.697000,,Baker Lake,Nunavut,mining,"Open-pit, underground",Gold,Gold,nrcan_mining_df
2,QC-MIN-3151125605,Canadian Malartic,,48.122223,-78.130824,,Malartic,Quebec,mining,"Open-pit, concentrator",Gold,"Gold, silver",nrcan_mining_df
3,ON-MIN-3512602579,Detour Lake,,50.018143,-79.717631,,Northeast of Cochrane,Ontario,mining,"Open-pit, concentrator",Gold,Gold,nrcan_mining_df
4,QC-MIN-1807873265,Goldex,,48.092600,-77.873300,,Val-d’Or,Quebec,mining,"Underground, concentrator",Gold,"Gold, silver",nrcan_mining_df
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,ON-MAN-3747348111,Sault Ste. Marie Division,,46.524326,-84.395602,,,Ontario,manufacturing,,,,nrcan_manufacturing_df
268,ON-MAN-2997132702,Welland Mill,,42.998265,-79.235065,,,Ontario,manufacturing,Electric arc furnace,Steel,Steel,nrcan_manufacturing_df
269,ON-MAN-3269259229,Copper Cliff Complex,,46.480100,-81.056900,,,Ontario,manufacturing,"Smelter, refinery, plant",Nickel (oxide sinter,"Nickel (oxide sinter, pellets, powder, sulfide...",nrcan_manufacturing_df
270,ON-MAN-1510270017,Port Colborne,,42.883000,-79.240000,,,Ontario,manufacturing,Refinery,Electrolytic cobalt,"Electrolytic cobalt, platinum group metals (in...",nrcan_manufacturing_df


## Join other dfs based on geospatial merging

In [118]:
def add_geospatial_info(facility_df, other_df, matching_columns, buffer_distance=1000, crs="EPSG:4326"):
    """
    Add information from another DataFrame to facility_df based on geospatial matching.

    Parameters:
        facility_df (pd.DataFrame): The main facility DataFrame.
        other_df (pd.DataFrame): The secondary DataFrame with additional information.
        matching_columns (dict): Columns to add from other_df. Format: {"source_column": "target_column"}.
        buffer_distance (float): Buffer distance in meters for proximity matching.
        crs (str): Coordinate Reference System, default is WGS 84 (EPSG:4326).

    Returns:
        pd.DataFrame: The updated facility_df with added information.
    """
    # Convert facility_df and other_df to GeoDataFrames
    facility_gdf = gpd.GeoDataFrame(
        facility_df,
        geometry=gpd.points_from_xy(facility_df["longitude"], facility_df["latitude"]),
        crs=crs,
    )
    other_gdf = gpd.GeoDataFrame(
        other_df,
        geometry=gpd.points_from_xy(other_df["longitude"], other_df["latitude"]),
        crs=crs,
    )

    # Reproject to a projected CRS for accurate buffering
    facility_gdf = facility_gdf.to_crs("EPSG:3857")
    other_gdf = other_gdf.to_crs("EPSG:3857")

    # Create a buffer around each facility
    facility_gdf["geometry"] = facility_gdf["geometry"].buffer(buffer_distance)

    # Perform a spatial join to find matches within the buffer
    joined_gdf = gpd.sjoin(other_gdf, facility_gdf, how="inner", predicate="within")

    # Drop duplicate matches and aggregate if necessary
    joined_gdf = joined_gdf.groupby("index_right").first()

    # Add the matching columns to facility_gdf
    for source_col, target_col in matching_columns.items():
        if source_col in other_gdf.columns:
            facility_gdf[target_col] = joined_gdf[source_col]

    # Reproject back to the original CRS
    facility_gdf = facility_gdf.to_crs(crs)

    # Drop buffer geometry for clean output
    facility_gdf = facility_gdf.drop(columns="geometry")

    return pd.DataFrame(facility_gdf)


In [119]:
# Define matching columns
matching_columns = {"link1": "link1"}

# Add geospatial information
facility_merged_df = add_geospatial_info(facility_df, min_can_df, matching_columns, buffer_distance=1000)

In [120]:
facility_df

Unnamed: 0,facility_id,facility_name,sub_site_name,latitude,longitude,city,province,facility_type,mining_processing_type,primary_commodity,commodities,source,link1
0,QC-MIN-2279011331,Sleeping Giant,,49.132590,-77.974340,,Quebec,mining,,,,nrcan_mining_df,https://miningdataonline.com/property/1589/Sle...
1,NU-MIN-2554473818,Amaruq,,65.415000,-96.697000,,Nunavut,mining,,,,nrcan_mining_df,
2,QC-MIN-34795366,Canadian Malartic,,48.122223,-78.130824,,Quebec,mining,,,,nrcan_mining_df,
3,ON-MIN-474125210,Detour Lake,,50.018143,-79.717631,,Ontario,mining,,,,nrcan_mining_df,https://www.northernontariobusiness.com/indust...
4,QC-MIN-2303249714,Goldex,,48.092600,-77.873300,,Quebec,mining,,,,nrcan_mining_df,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
267,ON-MAN-222987211,Sault Ste. Marie Division,,46.524326,-84.395602,,Ontario,manufacturing,,,,nrcan_manufacturing_df,
268,ON-MAN-1047519766,Welland Mill,,42.998265,-79.235065,,Ontario,manufacturing,,,,nrcan_manufacturing_df,
269,ON-MAN-1265141535,Copper Cliff Complex,,46.480100,-81.056900,,Ontario,manufacturing,,,,nrcan_manufacturing_df,
270,ON-MAN-829784927,Port Colborne,,42.883000,-79.240000,,Ontario,manufacturing,,,,nrcan_manufacturing_df,


# 2 - Create GHG table based on GHG from large facilities dataset

In [84]:
# We first create an empty df with the columns we want to have in the ghg_df
ghg_columns = {
    "id": "text",
    #"facility_id": "text", better to create it later with the geographical function
    "latitude": "float",
    "longitude": "float",   
    "geometry": "geometry",
    "year": "integer",
    "ghge_ktCO2eq": "float", # column 
    "facility_url" : "url" # column Facility information
}

ghg_df = pd.DataFrame(columns=ghg_columns.keys())
ghg_df

Unnamed: 0,id,latitude,longitude,geometry,year,ghge_ktCO2eq,facility_url


In [85]:
# Based on concordances tables
column_mapping_ghg = {
    "ghg_facility_metal_mining_df": {
        "Latitude": "latitude",
        "Longitude": "longitude",
        #"City": "city",
        #"Province": "province",
        "Report year": "year",
        "Total emissions": "ghge_ktCO2eq",
        "Facility information": "facility_url",
 
    },
    
    "ghg_facility_metal_manufacturing_df": {
        "Latitude": "latitude",
        "Longitude": "longitude",
        #"City": "city",
        #"Province": "province",
        "Report year": "year",
        "Total emissions": "ghge_ktCO2eq",
        "Facility information": "facility_url",
    }, 
    }

In [86]:
ghg_df = populate_facility_df(column_mapping_ghg, ghg_df, 
                              source_dfs = {
                                  "ghg_facility_metal_mining_df": ghg_facility_metal_mining_df,
                                  "ghg_facility_metal_manufacturing_df": ghg_facility_metal_manufacturing_df})

Initial facility_df rows: 0
Processing DataFrame: ghg_facility_metal_mining_df
Temp DF rows to append: 111
Rows in facility_df after appending ghg_facility_metal_mining_df: 111
Processing DataFrame: ghg_facility_metal_manufacturing_df
Temp DF rows to append: 73
Rows in facility_df after appending ghg_facility_metal_manufacturing_df: 184
Final facility_df rows: 184


  facility_df = pd.concat([facility_df, temp_df], ignore_index=True)


In [87]:
ghg_df

Unnamed: 0,id,latitude,longitude,geometry,year,ghge_ktCO2eq,facility_url,source
0,,52.95200,-66.90020,,2022,826.03,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_mining_df
1,,64.71950,-110.62010,,2022,164.88,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_mining_df
2,,54.77100,-101.88400,,2022,29.73,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_mining_df
3,,48.57373,-81.09115,,2022,30.82,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_mining_df
4,,52.76639,-67.33306,,2022,293.10,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_mining_df
...,...,...,...,...,...,...,...,...
179,,49.91300,-97.00250,,2022,,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_manufacturing_df
180,,50.65598,-120.09703,,2022,,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_manufacturing_df
181,,50.65726,-120.09649,,2022,25.12,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_manufacturing_df
182,,45.89470,-73.13870,,2022,0.51,https://climate-change.canada.ca/facility-emis...,ghg_facility_metal_manufacturing_df


In [102]:
def assign_facility_ids_by_location(facility_df, ghg_df, proximity_threshold=1000):
    """
    Assigns facility IDs from `facility_df` to `ghg_df` based on geographical proximity.
    
    Args:
        facility_df (pd.DataFrame): DataFrame with facility IDs, latitude, and longitude.
        ghg_df (pd.DataFrame): DataFrame with latitude and longitude, where facility IDs will be assigned.
        proximity_threshold (float): Maximum distance for matching (in meters).
        
    Returns:
        gpd.GeoDataFrame: Updated `ghg_df` with a new `facility_id` column as the second column.
    """
    # Step 1: Convert facility_df and ghg_df to GeoDataFrames
    if not isinstance(facility_df, gpd.GeoDataFrame):
        facility_df = gpd.GeoDataFrame(
            facility_df,
            geometry=gpd.points_from_xy(facility_df.longitude, facility_df.latitude),
            crs="EPSG:4326"  # WGS 84 Geographic Coordinate System
        )
    
    if not isinstance(ghg_df, gpd.GeoDataFrame):
        ghg_df = gpd.GeoDataFrame(
            ghg_df,
            geometry=gpd.points_from_xy(ghg_df.longitude, ghg_df.latitude),
            crs="EPSG:4326"  # WGS 84 Geographic Coordinate System
        )
    
    # Step 2: Re-project to a projected CRS for spatial operations
    facility_df_proj = facility_df.to_crs(epsg=3857)  # Web Mercator
    ghg_df_proj = ghg_df.to_crs(epsg=3857)

    # Step 3: Apply buffer to facility_df geometries based on proximity_threshold (in meters)
    facility_df_proj['geometry_buffered'] = facility_df_proj.geometry.buffer(proximity_threshold)

    # Step 4: Perform spatial join
    # Use the buffered geometry to find matches within the threshold
    joined = gpd.sjoin(
        ghg_df_proj,
        facility_df_proj[['facility_id', 'geometry_buffered']].rename(columns={'geometry_buffered': 'geometry'}),
        how="left",
        predicate="within"  # Matches points within the buffered geometries
    )
    
    # Step 5: Handle duplicates (if multiple facilities match the same GHG point)
    deduplicated = (
        joined[['facility_id']]
        .groupby(joined.index)  # Group by the original index of ghg_df
        .first()               # Take the first match (can also use min, max, etc.)
    )

    # Step 6: Assign facility_id to ghg_df
    ghg_df['facility_id'] = deduplicated['facility_id']

    # Step 7: Reorder columns to make facility_id the second column
    columns = ghg_df.columns.to_list()
    reordered_columns = [columns[0], 'facility_id'] + [col for col in columns if col not in ['facility_id', columns[0]]]
    ghg_df = ghg_df[reordered_columns]
    
    # Step 8: Calculate and print the percentage of rows with None/NaN in facility_id
    total_rows = len(ghg_df)
    unmatched_rows = ghg_df['facility_id'].isna().sum()
    unmatched_percentage = (unmatched_rows / total_rows) * 100
    print(f"Percentage of unmatched rows (facility_id = None): {unmatched_percentage:.2f}%")
    
    # Step 9: Return the updated GeoDataFrame in its original CRS
    return ghg_df

In [103]:
ghg_df_updated = assign_facility_ids_by_location(facility_df, ghg_df, proximity_threshold=10000)

Percentage of unmatched rows (facility_id = None): 34.78%


# 3 - Create pollutant table based on NPRI 

In [217]:
# We first create an empty df with the columns we want to have in the ghg_df
pollutant_columns = {
    "id": "text",
    "facility_name_npri": "text",
    "facility_type": "text",
    "latitude": "float",
    "longitude": "float",
    "geometry": "geometry",
    "year": "integer",
    "terrestrial_ecozone": "text",
    "watershed": "text",
    "substance_name_npri": "text",
    "substance_name_ecoinvent": "text",
    "substance_unit": "text",
    "air_emissions_stack_emissions": "float",
    "air_emissions_storage_/_handling": "float",
    "air_emissions_fugitive_emissions": "float",
    "air_emissions_spills": "float",
    "air_emissions_other": "float",
    "air_emissions_total": "float",
    "water_releases_direct_discharge": "float",
    "water_releases_spills": "float",
    "water_releases_leaks": "float",
    "water_releases_total": "float",
    "water_releases_receiving_waterbody_(english)": "float",
    "water_releases_receiving_waterbody_(french)": "float",
    "land_releases_spills": "float",
    "land_releases_leaks": "float",
    "land_releases_other": "float",
    "land_releases_total": "float",
    "total_releases_total_releases_(excluding_road_dust)": "float",
    "total_releases_road_dust_emissions": "float",
    "total_releases_total_releases_including_road_dust": "float",
    "off-site_disposal_land_fill": "float",
    "off-site_disposal_land_treatment": "float",
    "off-site_disposal_underground_injection": "float",
    "off-site_disposal_tailings": "float",
    "off-site_disposal_waste_rock": "float",
    "on-site_disposal_total_on-site": "float",
    "on-site_disposal_land_fill": "float",
    "on-site_disposal_land_treatment": "float",
    "on-site_disposal_underground_injection": "float",
    "off-site_disposal_storage": "float",
    "on-site_disposal_tailings": "float",
    "on-site_disposal_waste_rock": "float",
    "off-site_disposal_total_off-site": "float",
    "transfers_for_treatment_physical_treatment": "float",
    "transfers_for_treatment_chemical_treatment": "float",
    "transfers_for_treatment_biological_treatment": "float",
    "transfers_for_treatment_incineration": "float",
    "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant": "float",
    "transfers_for_treatment_total": "float",
    "total_on/off_treatment_total": "float",
    "transfers_for_recycling_energy_recovery": "float",
    "transfers_for_recycling_solvent_recovery": "float",
    "transfers_for_recycling_organic_compound_recovery": "float",
    "transfers_for_recycling_metal_recovery": "float",
    "transfers_for_recycling_inorganic_compound_recovery": "float",
    "transfers_for_recycling_acid_or_base_recovery": "float",
    "transfers_for_recycling_catalyst_recovery": "float",
    "transfers_for_recycling_pollution_abatement_residue_recovery": "float",
    "transfers_for_recycling_used_oil_recovery": "float",
    "transfers_for_recycling_other": "float",
    "transfers_for_recycling_total": "float",
    "grand_total_total_releases,_disposals_and_transfers_for_recycling": "float",
}

# Create an empty DataFrame with the specified columns
pollutant_df = pd.DataFrame(columns=pollutant_columns.keys())

In [218]:
pollutant_df

Unnamed: 0,id,facility_name_npri,facility_type,latitude,longitude,geometry,year,terrestrial_ecozone,watershed,substance_name_npri,...,transfers_for_recycling_organic_compound_recovery,transfers_for_recycling_metal_recovery,transfers_for_recycling_inorganic_compound_recovery,transfers_for_recycling_acid_or_base_recovery,transfers_for_recycling_catalyst_recovery,transfers_for_recycling_pollution_abatement_residue_recovery,transfers_for_recycling_used_oil_recovery,transfers_for_recycling_other,transfers_for_recycling_total,"grand_total_total_releases,_disposals_and_transfers_for_recycling"


In [219]:
column_mapping_pollutant = {
    "npri_metal_mining_df": {
        "Facility Name": "facility_name_npri",
        "Latitude": "latitude",
        "Longitude": "longitude",
        "Year": "year",
        "Terrestrial ecozone (English) ": "terrestrial_ecozone", # space 
        "Watershed (English)": "watershed",
        "Substance Name (English)": "substance_name_npri",
        "EI": "substance_name_ecoinvent",
        "Units": "substance_unit",
        "air_emissions_stack_emissions": "air_emissions_stack_emissions",
        "air_emissions_storage_/_handling": "air_emissions_storage_/_handling",
        "air_emissions_fugitive_emissions": "air_emissions_fugitive_emissions",
        "air_emissions_spills": "air_emissions_spills",
        "air_emissions_other": "air_emissions_other",
        "air_emissions_total": "air_emissions_total",
        "water_releases_direct_discharge": "water_releases_direct_discharge",
        "water_releases_spills": "water_releases_spills",
        "water_releases_leaks": "water_releases_leaks",
        "water_releases_total": "water_releases_total",
        "water_releases_receiving_waterbody_(english)": "water_releases_receiving_waterbody_(english)",
        "water_releases_receiving_waterbody_(french)": "water_releases_receiving_waterbody_(french)",
        "land_releases_spills": "land_releases_spills",
        "land_releases_leaks": "land_releases_leaks",
        "land_releases_other": "land_releases_other",
        "land_releases_total": "land_releases_total",
        "total_releases_total_releases_(excluding_road_dust)": "total_releases_total_releases_(excluding_road_dust)",
        "total_releases_road_dust_emissions": "total_releases_road_dust_emissions",
        "total_releases_total_releases_including_road_dust": "total_releases_total_releases_including_road_dust",
        "off-site_disposal_land_fill": "off-site_disposal_land_fill",
        "off-site_disposal_land_treatment": "off-site_disposal_land_treatment",
        "off-site_disposal_underground_injection": "off-site_disposal_underground_injection",
        "off-site_disposal_tailings": "off-site_disposal_tailings",
        "off-site_disposal_waste_rock": "off-site_disposal_waste_rock",
        "on-site_disposal_total_on-site": "on-site_disposal_total_on-site",
        "on-site_disposal_land_fill": "on-site_disposal_land_fill",
        "on-site_disposal_land_treatment": "on-site_disposal_land_treatment",
        "on-site_disposal_underground_injection": "on-site_disposal_underground_injection",
        "off-site_disposal_storage": "off-site_disposal_storage",
        "on-site_disposal_tailings": "on-site_disposal_tailings",
        "on-site_disposal_waste_rock": "on-site_disposal_waste_rock",
        "off-site_disposal_total_off-site": "off-site_disposal_total_off-site",
        "transfers_for_treatment_physical_treatment": "transfers_for_treatment_physical_treatment",
        "transfers_for_treatment_chemical_treatment": "transfers_for_treatment_chemical_treatment",
        "transfers_for_treatment_biological_treatment": "transfers_for_treatment_biological_treatment",
        "transfers_for_treatment_incineration": "transfers_for_treatment_incineration",
        "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant": "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant",
        "transfers_for_treatment_total": "transfers_for_treatment_total",
        "total_on/off_treatment_total": "total_on/off_treatment_total",
        "transfers_for_recycling_energy_recovery": "transfers_for_recycling_energy_recovery",
        "transfers_for_recycling_solvent_recovery": "transfers_for_recycling_solvent_recovery",
        "transfers_for_recycling_organic_compound_recovery": "transfers_for_recycling_organic_compound_recovery",
        "transfers_for_recycling_metal_recovery": "transfers_for_recycling_metal_recovery",
        "transfers_for_recycling_inorganic_compound_recovery": "transfers_for_recycling_inorganic_compound_recovery",
        "transfers_for_recycling_acid_or_base_recovery": "transfers_for_recycling_acid_or_base_recovery",
        "transfers_for_recycling_catalyst_recovery": "transfers_for_recycling_catalyst_recovery",
        "transfers_for_recycling_pollution_abatement_residue_recovery": "transfers_for_recycling_pollution_abatement_residue_recovery",
        "transfers_for_recycling_used_oil_recovery": "transfers_for_recycling_used_oil_recovery",
        "transfers_for_recycling_other": "transfers_for_recycling_other",
        "transfers_for_recycling_total": "transfers_for_recycling_total",
        "grand_total_total_releases,_disposals_and_transfers_for_recycling": "grand_total_total_releases,_disposals_and_transfers_for_recycling",
    },
    
    "npri_metal_manufacturing_df": {
        "Facility Name": "facility_name_npri",
        "Latitude": "latitude",
        "Longitude": "longitude",
        "Year": "year",
        "Terrestrial ecozone (English) ": "terrestrial_ecozone", # space 
        "Watershed (English)": "watershed",
        "Substance Name (English)": "substance_name_npri",
        "EI": "substance_name_ecoinvent",
        "Units": "substance_unit",
        "air_emissions_stack_emissions": "air_emissions_stack_emissions",
        "air_emissions_storage_/_handling": "air_emissions_storage_/_handling",
        "air_emissions_fugitive_emissions": "air_emissions_fugitive_emissions",
        "air_emissions_spills": "air_emissions_spills",
        "air_emissions_other": "air_emissions_other",
        "air_emissions_total": "air_emissions_total",
        "water_releases_direct_discharge": "water_releases_direct_discharge",
        "water_releases_spills": "water_releases_spills",
        "water_releases_leaks": "water_releases_leaks",
        "water_releases_total": "water_releases_total",
        "water_releases_receiving_waterbody_(english)": "water_releases_receiving_waterbody_(english)",
        "water_releases_receiving_waterbody_(french)": "water_releases_receiving_waterbody_(french)",
        "land_releases_spills": "land_releases_spills",
        "land_releases_leaks": "land_releases_leaks",
        "land_releases_other": "land_releases_other",
        "land_releases_total": "land_releases_total",
        "total_releases_total_releases_(excluding_road_dust)": "total_releases_total_releases_(excluding_road_dust)",
        "total_releases_road_dust_emissions": "total_releases_road_dust_emissions",
        "total_releases_total_releases_including_road_dust": "total_releases_total_releases_including_road_dust",
        "off-site_disposal_land_fill": "off-site_disposal_land_fill",
        "off-site_disposal_land_treatment": "off-site_disposal_land_treatment",
        "off-site_disposal_underground_injection": "off-site_disposal_underground_injection",
        "off-site_disposal_tailings": "off-site_disposal_tailings",
        "off-site_disposal_waste_rock": "off-site_disposal_waste_rock",
        "on-site_disposal_total_on-site": "on-site_disposal_total_on-site",
        "on-site_disposal_land_fill": "on-site_disposal_land_fill",
        "on-site_disposal_land_treatment": "on-site_disposal_land_treatment",
        "on-site_disposal_underground_injection": "on-site_disposal_underground_injection",
        "off-site_disposal_storage": "off-site_disposal_storage",
        "on-site_disposal_tailings": "on-site_disposal_tailings",
        "on-site_disposal_waste_rock": "on-site_disposal_waste_rock",
        "off-site_disposal_total_off-site": "off-site_disposal_total_off-site",
        "transfers_for_treatment_physical_treatment": "transfers_for_treatment_physical_treatment",
        "transfers_for_treatment_chemical_treatment": "transfers_for_treatment_chemical_treatment",
        "transfers_for_treatment_biological_treatment": "transfers_for_treatment_biological_treatment",
        "transfers_for_treatment_incineration": "transfers_for_treatment_incineration",
        "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant": "transfers_for_treatment_transfer_to_a_municipal_sewage_treatment_plant",
        "transfers_for_treatment_total": "transfers_for_treatment_total",
        "total_on/off_treatment_total": "total_on/off_treatment_total",
        "transfers_for_recycling_energy_recovery": "transfers_for_recycling_energy_recovery",
        "transfers_for_recycling_solvent_recovery": "transfers_for_recycling_solvent_recovery",
        "transfers_for_recycling_organic_compound_recovery": "transfers_for_recycling_organic_compound_recovery",
        "transfers_for_recycling_metal_recovery": "transfers_for_recycling_metal_recovery",
        "transfers_for_recycling_inorganic_compound_recovery": "transfers_for_recycling_inorganic_compound_recovery",
        "transfers_for_recycling_acid_or_base_recovery": "transfers_for_recycling_acid_or_base_recovery",
        "transfers_for_recycling_catalyst_recovery": "transfers_for_recycling_catalyst_recovery",
        "transfers_for_recycling_pollution_abatement_residue_recovery": "transfers_for_recycling_pollution_abatement_residue_recovery",
        "transfers_for_recycling_used_oil_recovery": "transfers_for_recycling_used_oil_recovery",
        "transfers_for_recycling_other": "transfers_for_recycling_other",
        "transfers_for_recycling_total": "transfers_for_recycling_total",
        "grand_total_total_releases,_disposals_and_transfers_for_recycling": "grand_total_total_releases,_disposals_and_transfers_for_recycling",
    }
}


In [220]:
pollutant_df

Unnamed: 0,id,facility_name_npri,facility_type,latitude,longitude,geometry,year,terrestrial_ecozone,watershed,substance_name_npri,...,transfers_for_recycling_organic_compound_recovery,transfers_for_recycling_metal_recovery,transfers_for_recycling_inorganic_compound_recovery,transfers_for_recycling_acid_or_base_recovery,transfers_for_recycling_catalyst_recovery,transfers_for_recycling_pollution_abatement_residue_recovery,transfers_for_recycling_used_oil_recovery,transfers_for_recycling_other,transfers_for_recycling_total,"grand_total_total_releases,_disposals_and_transfers_for_recycling"


In [221]:
pollutant_df = populate_facility_df(column_mapping_pollutant, 
                                    pollutant_df,
                                    dynamic_columns = {
                                        "facility_type": {
                                            "npri_metal_mining_df": "mining",
                                            "npri_metal_manufacturing_df": "manufacturing"
                                        },
                                    },
                                    source_dfs = {
                                  "npri_metal_mining_df": npri_metal_mining_df,
                                  "npri_metal_manufacturing_df": npri_metal_manufacturing_df
                                    }
                                    )

Initial facility_df rows: 0
Processing DataFrame: npri_metal_mining_df
Temp DF rows to append: 1676
Rows in facility_df after appending npri_metal_mining_df: 1676
Processing DataFrame: npri_metal_manufacturing_df
Temp DF rows to append: 1856
Rows in facility_df after appending npri_metal_manufacturing_df: 3532
Final facility_df rows: 3532


  facility_df = pd.concat([facility_df, temp_df], ignore_index=True)
  facility_df = pd.concat([facility_df, temp_df], ignore_index=True)


In [223]:
pollutant_df = assign_facility_ids_by_location(facility_df, pollutant_df, proximity_threshold=10000)

Percentage of unmatched rows (facility_id = None): 24.01%


In [224]:
pollutant_df

Unnamed: 0,id,facility_id,facility_name_npri,facility_type,latitude,longitude,geometry,year,terrestrial_ecozone,watershed,...,transfers_for_recycling_metal_recovery,transfers_for_recycling_inorganic_compound_recovery,transfers_for_recycling_acid_or_base_recovery,transfers_for_recycling_catalyst_recovery,transfers_for_recycling_pollution_abatement_residue_recovery,transfers_for_recycling_used_oil_recovery,transfers_for_recycling_other,transfers_for_recycling_total,"grand_total_total_releases,_disposals_and_transfers_for_recycling",source
0,,,True North Gold Mine,mining,51.02180,-95.67950,POINT (-95.6795 51.0218),2023,Boreal Shield,Nelson River Drainage Area,...,,,,,,,,,0.000000,npri_metal_mining_df
1,,,True North Gold Mine,mining,51.02180,-95.67950,POINT (-95.6795 51.0218),2023,Boreal Shield,Nelson River Drainage Area,...,,,,,,,,,0.000000,npri_metal_mining_df
2,,,True North Gold Mine,mining,51.02180,-95.67950,POINT (-95.6795 51.0218),2023,Boreal Shield,Nelson River Drainage Area,...,,,,,,,,,0.000000,npri_metal_mining_df
3,,,True North Gold Mine,mining,51.02180,-95.67950,POINT (-95.6795 51.0218),2023,Boreal Shield,Nelson River Drainage Area,...,,,,,,,,,0.000000,npri_metal_mining_df
4,,,True North Gold Mine,mining,51.02180,-95.67950,POINT (-95.6795 51.0218),2023,Boreal Shield,Nelson River Drainage Area,...,,,,,,,,,0.208000,npri_metal_mining_df
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3527,,,Rayette Manufacturing,manufacturing,43.80800,-79.49600,POINT (-79.496 43.808),2023,Mixedwood Plain,St. Lawrence Drainage Area,...,,,,,,,,,25.100800,npri_metal_manufacturing_df
3528,,,Bowes Manufacturing,manufacturing,43.80360,-79.49480,POINT (-79.4948 43.8036),2023,Mixedwood Plain,St. Lawrence Drainage Area,...,,,,,,,,,23.418000,npri_metal_manufacturing_df
3529,,,541 Bowes Warehouse,manufacturing,43.81726,-79.49707,POINT (-79.49707 43.81726),2023,Mixedwood Plain,St. Lawrence Drainage Area,...,,,,,,,,,16.543746,npri_metal_manufacturing_df
3530,,,Rayette Manufacturing,manufacturing,43.80800,-79.49600,POINT (-79.496 43.808),2023,Mixedwood Plain,St. Lawrence Drainage Area,...,,,,,,,,,12.582300,npri_metal_manufacturing_df


# 4 - Create x table based on company and technical reports

# x - Create SUT energy accounts

In [52]:
sut_df = pd.read_csv(r'data/Emissions_accounts/Raw_data/Mining industries, energy consumption by NAICS - 16100029-eng/16100029.csv') 

In [53]:
sut_df

Unnamed: 0,REF_DATE,GEO,DGUID,North American Industry Classification System (NAICS),Energy types,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2019,Canada,2021A000011124,Metal ore mining [2122],"Total, energy expenses (thousands of dollars)",Thousands of dollars,284,units,0,v1354185599,1.1.1,1929480.0,,,,0
1,2019,Canada,2021A000011124,Metal ore mining [2122],"Coal, quantity purchased (tonnes)",Tonnes,287,units,0,v1354185610,1.1.2,207878.0,,,,0
2,2019,Canada,2021A000011124,Metal ore mining [2122],"Coal, expenses (thousands of dollars)",Thousands of dollars,284,units,0,v1354185621,1.1.3,35852.0,,,,0
3,2019,Canada,2021A000011124,Metal ore mining [2122],"Petroleum coke, quantity purchased (tonnes)",Tonnes,287,units,0,v1354185627,1.1.4,0.0,,,,0
4,2019,Canada,2021A000011124,Metal ore mining [2122],"Petroleum coke, expenses (thousands of dollars)",Thousands of dollars,284,units,0,v1354185628,1.1.5,0.0,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3667,2022,Canada,2021A000011124,"Shale, clay and refractory mineral mining and ...","Wood, quantity purchased (metric tonnes)",Metric tonnes,214,units,0,v1354186268,1.27.30,0.0,,,,0
3668,2022,Canada,2021A000011124,"Shale, clay and refractory mineral mining and ...","Wood, expenses (thousands of dollars)",Thousands of dollars,284,units,0,v1354186269,1.27.31,0.0,,,,0
3669,2022,Canada,2021A000011124,"Shale, clay and refractory mineral mining and ...","Water, quantity purchased (litres)",Litres,203,units,0,v1354186270,1.27.32,,x,,,0
3670,2022,Canada,2021A000011124,"Shale, clay and refractory mineral mining and ...","Water, expenses (thousands of dollars)",Thousands of dollars,284,units,0,v1354186271,1.27.33,,x,,,0


In [54]:
def clean_sut(df, year=None, naics=None, units_to_exclude=None):
    # Drop specified columns
    columns_to_remove = ['DGUID', 'UOM_ID', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS', 'SCALAR_FACTOR']
    df_cleaned = df.drop(columns=columns_to_remove, errors='ignore')
    
    # Filter by year if specified
    if year is not None:
        df_cleaned = df_cleaned[df_cleaned['REF_DATE'] == year]
    
    # Filter by NAICS if specified
    if naics is not None:
        df_cleaned = df_cleaned[df_cleaned['North American Industry Classification System (NAICS)'] == naics]
    
    # Exclude specified units from the UOM column if provided
    if units_to_exclude is not None:
        df_cleaned = df_cleaned[~df_cleaned['UOM'].isin(units_to_exclude)]
        
    # Remove rows where VALUE is 0 or NaN
    df_cleaned = df_cleaned[df_cleaned['VALUE'].notna() & (df_cleaned['VALUE'] != 0)]
    
    df_cleaned.reset_index(drop=True, inplace=True)
        
    return df_cleaned

In [55]:
sut_df_cleaned = clean_sut(sut_df, year=2022, units_to_exclude=["Thousands of dollars"])
sut_df_cleaned

Unnamed: 0,REF_DATE,GEO,North American Industry Classification System (NAICS),Energy types,UOM,VALUE
0,2022,Canada,Metal ore mining [2122],"Coal, quantity purchased (tonnes)",Tonnes,179231.0
1,2022,Canada,Metal ore mining [2122],"Natural gas, quantity purchased (cubic metres)",Cubic metres,152461339.0
2,2022,Canada,Metal ore mining [2122],"Gasoline - aviation, quantity purchased (litres)",Litres,11834517.0
3,2022,Canada,Metal ore mining [2122],"Gasoline - motor, quantity purchased (litres)",Litres,31567077.0
4,2022,Canada,Metal ore mining [2122],"Diesel fuel, quantity purchased (litres)",Litres,941997194.0
...,...,...,...,...,...,...
117,2022,Canada,"Sand and gravel, mining and quarrying [212323]","Light Fuel Oil, quantity purchased (litres)",Litres,160123.0
118,2022,Canada,"Sand and gravel, mining and quarrying [212323]","Heavy Fuel Oil, quantity purchased (litres)",Litres,424742.0
119,2022,Canada,"Sand and gravel, mining and quarrying [212323]","Propane, quantity purchased (litres)",Litres,1152653.0
120,2022,Canada,"Shale, clay and refractory mineral mining and ...","Diesel fuel, quantity purchased (litres)",Litres,577582.0


In [56]:
sut_df_cleaned['North American Industry Classification System (NAICS)'].value_counts()

North American Industry Classification System (NAICS)
Metal ore mining [2122]                                                                                        10
Sand, gravel, clay, and ceramic and refractory minerals mining and quarrying [21232]                            9
Other non-metallic mineral mining and quarrying [21239]                                                         8
Non-metallic mineral mining (excluding aggregates, clay and refractory mineral mining and quarrying) [2123]     8
Aggregates, clay and refractory mineral mining and quarrying (excluding non-metallic mineral mining) [2123]     8
Stone mining and quarrying [21231]                                                                              8
Copper, nickel, lead and zinc ore mining [21223]                                                                7
Copper-zinc ore mining [212233]                                                                                 7
Limestone mining and quarrying [21

In [57]:
def split_by_naics(df_cleaned):
    # Group the DataFrame by the 'NAICS' column
    naics_groups = df_cleaned.groupby('North American Industry Classification System (NAICS)')
    
    # Create a dictionary where keys are NAICS values and values are the corresponding DataFrames
    naics_dfs = {naics: group.reset_index(drop=True) for naics, group in naics_groups}
    
    return naics_dfs

In [58]:
# Apply the function to the cleaned DataFrame
naics_dfs = split_by_naics(sut_df_cleaned)

# Access a specific NAICS DataFrame
uranium_mining_df = naics_dfs['Uranium ore mining [212291]']

In [59]:
uranium_mining_df

Unnamed: 0,REF_DATE,GEO,North American Industry Classification System (NAICS),Energy types,UOM,VALUE
0,2022,Canada,Uranium ore mining [212291],"Diesel fuel, quantity purchased (litres)",Litres,2958363.0
1,2022,Canada,Uranium ore mining [212291],"Propane, quantity purchased (litres)",Litres,29071811.0
2,2022,Canada,Uranium ore mining [212291],"Electricity, quantity purchased (kilowatt-hour)",Kilowatt-hours,305354512.0


# 3 - Normalize emissions per facility

In [26]:
df_production = pd.read_excel(r'../data/canada_site_specific_data.xlsx')

PermissionError: [Errno 13] Permission denied: '../data/canada_site_specific_data.xlsx'

## Example 

In [38]:
npri_voisey = dfs_by_company_facility["Vale_Newfoundland_and_Labrador_Limited_Voisey's_Bay_Mine_Site"]
npri_voisey

Unnamed: 0,level_0,index,Year,NPRI ID,Company Name,Facility Name,City,CSD,CA or CMA,Economic Region,...,Metal Recovery,Inorganic Compound Recovery,Acid or Base Recovery,Catalyst Recovery,Pollution Abatement Residue Recovery,Used Oil Recovery,Other.2,Total.5,"Total Releases, Disposals and Transfers for Recycling",EI
4,32,1070,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,384.221,Ammonia
11,100,1714,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,1421.578,Arsenic
25,191,3009,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,1604.145,Cadmium
39,258,3672,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,335.124,"Carbon monoxide, fossil"
54,329,7468,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,563.493,Chromium
61,391,7957,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,241743.578,Cobalt
74,464,8340,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,1237.842,Copper
87,541,9147,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,0.0,No match found
89,553,9702,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,56.975,56.975,56.975,No match found
91,571,10468,2023,6093,Vale Newfoundland and Labrador Limited,Voisey's Bay Mine Site,Happy Valley-Goose Bay,"Division No. 10, Subd. E",,West Coast--Northern Peninsula--Labrador / Côt...,...,,,,,,,,,4.808,"Benzene, hexachloro-"


In [33]:
def allocate_emissions(
    df, production_data, metals, economic_values=None, method='mass',
    unit_column='Units', 
):
    """
    Updated function to exclude specific columns like 'EI' from conversion,
    while performing unit conversion, normalization, and allocation.
    """
    # Step 1: Trim the DataFrame to include only relevant columns
    start_col = 'Substance Name (English)'
    df = df.loc[:, start_col:].copy()

    # Step 2: Identify emission columns (exclude non-numeric columns like 'EI')
    start_col = 'Stack Emissions'
    end_col = 'Total Releases, Disposals and Transfers for Recycling'
    emission_columns = df.loc[:, start_col:end_col].select_dtypes(include='number').columns.tolist()

    # Step 3: Convert units to kg for emissions columns only
    conversion_factors = {'tonnes': 1000, 'kg': 1, 'grams': 0.001, 'g TEQ': 0.001}
    if unit_column in df.columns:
        for unit, factor in conversion_factors.items():
            mask = df[unit_column] == unit
            df.loc[mask, emission_columns] = df.loc[mask, emission_columns] * factor
        df[unit_column] = 'kg'

    # Step 4: Normalize emissions per kg of total production
    total_production = sum(production_data.values())
    for col in emission_columns:
        df[f'{col}_per_kg_Total'] = df[col] / total_production

    # Step 5: Allocate emissions (mass or economic)
    allocation_factors = {}
    for metal, production in production_data.items():
        if method == 'mass':
            allocation_factors[metal] = production / total_production
        elif method == 'economic':
            total_economic_value = sum(production_data[m] * economic_values[m] for m in metals)
            allocation_factors[metal] = (production * economic_values[metal]) / total_economic_value
        else:
            raise ValueError("Invalid method. Use 'mass' or 'economic'.")

    for col in emission_columns:
        for metal in metals:
            df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]

    return df

In [34]:
production_data = {'Nickel': 50000000, 'Copper': 32000000}  # kg of nickel and copper metal in concentrate
economic_values = {'Nickel': 13800, 'Copper': 6160}  # unit value in 2019 $/t given by USGS
metals = ['Nickel', 'Copper']

In [35]:
mass_df = allocate_emissions(
    npri_voisey, production_data, metals, economic_values=economic_values, method='mass', 
    unit_column='Units'
)

economic_df = allocate_emissions(
    npri_voisey, production_data, metals, economic_values=economic_values, method='economic', 
    unit_column='Units'
)

  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Total'] * allocation_factors[metal]
  df[f'{col}_per_kg_{metal}'] = df[f'{col}_per_kg_Tota

In [36]:
def organize_emissions_data(df, metals):
    """
    Organize the emissions data into a clean format with desired columns.
    """
    # Define category mapping
    category_mapping = {
        'Air Emissions': ['Stack Emissions', 'Storage / Handling', 'Fugitive Emissions', 'Spills', 'Other', 'Total'],
        'Water Releases': ['Direct Discharge', 'Spills', 'Leaks', 'Total'],
        'Land Releases': ['Spills', 'Leaks', 'Other', 'Total'],
        'Total Releases': ['Total Releases (Excluding Road Dust)', 'Road Dust Emissions', 'Total Releases Including Road Dust'],
        'On-Site Disposal': ['Land Fill', 'Land Treatment', 'Underground Injection', 'Tailings', 'Waste Rock', 'Total On-Site'],
        'Off-Site Disposal': ['Land Fill', 'Land Treatment', 'Underground Injection', 'Storage', 'Tailings', 'Waste Rock', 'Total Off-Site'],
        'Transfers for Recycling': ['Energy Recovery', 'Metal Recovery', 'Other', 'Total']
    }

    # Prepare rows for clean output
    rows = []
    for category, columns in category_mapping.items():
        for col in columns:
            total_col = f"{col}_per_kg_Total"
            metal_cols = [f"{col}_per_kg_{metal}" for metal in metals]

            if total_col in df.columns:
                for _, row in df.iterrows():
                    row_data = {
                        'Type': category,
                        'Category': col,
                        'Substance Name (English)': row['Substance Name (English)'],
                        'Ecoinvent Name': row['EI'] if 'EI' in df.columns else 'Unknown',
                        'Unit': row['Units'],
                        'Total Emissions (per kg)': row[total_col],
                    }
                    for metal, metal_col in zip(metals, metal_cols):
                        row_data[f'{metal} Emissions (per kg)'] = row[metal_col] if metal_col in df.columns else 0
                    rows.append(row_data)

    # Convert to DataFrame
    result_df = pd.DataFrame(rows)

    return result_df

In [37]:
mass_df = organize_emissions_data(mass_df, metals)
economic_df = organize_emissions_data(economic_df, metals)

In [38]:
with pd.ExcelWriter('allocation.xlsx', engine='xlsxwriter') as writer:
    mass_df.to_excel(writer, sheet_name='mass', index=False)
    economic_df.to_excel(writer, sheet_name='eco', index=False) 

# 4 - Create national consumption markets based on production data 