In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Custom functions
from core.data_manipulations import merge_without_suffixes, merge_main_and_group

In [3]:
# Root folder of the local MetalliCan database checkout.
# Set the METALLICAN_DB_PATH environment variable to use another location;
# the original hard-coded path stays as the fallback so existing setups keep working.
metallican_path = os.environ.get(
    'METALLICAN_DB_PATH',
    r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db',
)

# Import MetalliCan tables

In [4]:
def _read_metallican_csv(file_name):
    """Load one CSV file from the ``database/CSV`` folder under ``metallican_path``."""
    # os.path.join builds the path with the separator of the running OS; the
    # previous backslash concatenation only resolved on Windows.
    return pd.read_csv(os.path.join(metallican_path, 'database', 'CSV', file_name))


main_table = _read_metallican_csv('main_table.csv')
production_table = _read_metallican_csv('production_table.csv')
tech_attributes_table = _read_metallican_csv('tech_attributes_table.csv')
env_table = _read_metallican_csv('environmental_flows_table.csv')
technosphere_table = _read_metallican_csv('materials_energy_table.csv')
archetypes_table = _read_metallican_csv('archetypes_table.csv')
land_table = _read_metallican_csv('land_occupation_table.csv')
intensity_table = _read_metallican_csv('intensity_table.csv')
# Previously read with pd.read_table(..., sep=','), which is the same parser configuration.
waste_table = _read_metallican_csv('waste_table.csv')

In [5]:
# Substance reference table; os.path.join keeps the path valid on any OS
# (the backslash concatenation only resolved on Windows).
substances_table = pd.read_csv(
    os.path.join(metallican_path, 'database', 'CSV', 'substances_table.csv')
)

In [6]:
# Energy intensity records whose id does not start with 'INT-CMP'
# (presumably company-level entries — confirm against the database schema).
# na=False makes a missing intensity_id count as "no match", so the ~ inversion
# works on a clean boolean mask instead of failing on NaN.
intensity_table_nrj = intensity_table[
    (intensity_table['type'] == 'Energy')
    & ~intensity_table['intensity_id'].str.startswith('INT-CMP', na=False)
]

In [7]:
# Drop the production records whose source is the Climate TRACE inventory.
CLIMATE_TRACE_SOURCE_PREFIX = 'Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory'
# na=False: rows without a source_id are kept; without it a missing value makes
# str.startswith return NaN and the ~ inversion fails.
production_table = production_table[
    ~production_table['source_id'].str.startswith(CLIMATE_TRACE_SOURCE_PREFIX, na=False)
]

# Harmonization of environmental flow units

In [8]:
# Drop the environmental flows whose source is the Climate TRACE inventory.
CLIMATE_TRACE_SOURCE_PREFIX = 'Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory'
# na=False: rows without a source_id are kept; without it a missing value makes
# str.startswith return NaN and the ~ inversion fails.
# .copy(): env_table has its 'unit' and 'value' columns rewritten further down,
# so it must be an independent frame rather than a filtered slice.
env_table = env_table[
    ~env_table['source_id'].str.startswith(CLIMATE_TRACE_SOURCE_PREFIX, na=False)
].copy()

In [9]:
# Preview env_table after the Climate TRACE rows were filtered out.
env_table

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,facility_group_id,company_id,source_id
28,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,0.636,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
29,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,0.116,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
30,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,76.208,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
31,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,0.00105,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
32,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,0.18,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5147,ENV-9de9bb0d-2023-8,2023,Air,NA - M16,Emission,,t,0.99,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5148,ENV-9de9bb0d-2023-9,2023,Water,7732-18-5,Withdrawal,,m3,168115.0,Water withdrawal,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5149,ENV-9de9bb0d-2023-10,2023,Water,7732-18-5,Withdrawal,,m3,168115.0,Water withdrawal|Freshwater withdrawn,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5150,ENV-9de9bb0d-2023-11,2023,Water,7732-18-5,Consumption,,m3,82144.0,Total water consumption,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables


In [10]:
# Target unit for every substance_id whose flows are harmonized, written
# grouped by target unit and flattened into {substance_id: target_unit}.
unit_mapping = {
    substance_id: target_unit
    for target_unit, substance_ids in (
        ('m3', ('7732-18-5',)),
        ('tonnes', (
            'NA - M10', 'NA - M09', 'NA - M16', '630-08-0',
            'NA - NOx', 'NA - 10', 'NA - M08', 'NA - SOx',
        )),
        ('kg', ('NA - 08', 'NA - 02', 'NA - 12')),
        ('tCO2eq', ('NA - GHG',)),
    )
    for substance_id in substance_ids
}

In [11]:
# Multiplicative factors keyed by (source unit, target unit):
# value_in_target = value_in_source * factor.
conversion_factors = {
    # Mass ('t' and 'tonnes' are two spellings of the same unit)
    **{
        ('kg', 'tonnes'): 1/1000,
        ('kg', 't'): 1/1000,
        ('tonnes', 'kg'): 1000,
        ('t', 'kg'): 1000,
        ('tonnes', 't'): 1,
        ('t', 'tonnes'): 1,
    },
    # Volume
    # NOTE(review): 'Mm3' and 'km3' are handled as million / thousand m3, not as
    # cubic megametres / cubic kilometres (1 km^3 would be 1e9 m3) — confirm
    # that this matches how the sources report them.
    **{
        ('ML', 'm3'): 1000,
        ('m3', 'ML'): 1/1000,
        ('L', 'm3'): 1/1000,
        ('m3', 'L'): 1000,
        ('Mm3', 'm3'): 1000000,
        ('km3', 'm3'): 1000,
    },
    # Greenhouse gases
    **{
        ('ktCO2eq', 'tCO2eq'): 1000,
        ('tCO2eq', 'ktCO2eq'): 1/1000,
    },
}


In [12]:
def update_units_and_values(row, mapping=None, factors=None, relabel_unknown=True):
    """Express one flow row in the target unit of its substance.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'substance_id', 'unit' and 'value'.
    mapping : dict, optional
        {substance_id: target unit}. Defaults to the module-level ``unit_mapping``.
    factors : dict, optional
        {(source unit, target unit): multiplier}. Defaults to the module-level
        ``conversion_factors``.
    relabel_unknown : bool, default True
        What to do when the row's unit differs from the target unit and no
        factor is registered for the pair. True keeps the historical behaviour
        (value untouched, unit overwritten with the target unit); False leaves
        both the value and the unit untouched. A warning is emitted either way.

    Returns
    -------
    pandas.Series
        Entries 'unit' and 'value'. Rows whose substance is not in ``mapping``
        are returned unchanged.
    """
    import warnings

    mapping = unit_mapping if mapping is None else mapping
    factors = conversion_factors if factors is None else factors

    current_unit = row['unit']
    value = row['value']
    target_unit = mapping.get(row['substance_id'])

    if target_unit is not None and current_unit != target_unit:
        factor = factors.get((current_unit, target_unit))
        if factor is not None:
            value = value * factor
            current_unit = target_unit
        else:
            # No conversion is defined: the value is left unchanged. Surface it,
            # because relabelling the unit without converting can mislabel data.
            warnings.warn(
                f"No conversion factor from {current_unit!r} to {target_unit!r}; "
                "value left unchanged",
                stacklevel=2,
            )
            if relabel_unknown:
                current_unit = target_unit  # historical behaviour: update the unit anyway

    return pd.Series({'unit': current_unit, 'value': value})


# env_table was produced by boolean filtering; take an independent copy before
# writing columns so pandas does not raise SettingWithCopyWarning.
env_table = env_table.copy()
# Non-numeric entries become NaN instead of raising.
env_table['value'] = pd.to_numeric(env_table['value'], errors='coerce')
# Only substances that have a target unit are touched.
mask = env_table['substance_id'].isin(unit_mapping.keys())
if mask.any():  # nothing to assign when no row needs harmonizing
    env_table.loc[mask, ['unit', 'value']] = env_table.loc[mask].apply(update_units_and_values, axis=1)

# Harmonization of technosphere units

In [13]:
# Split the technosphere table into energy flows and material flows
energy_df = technosphere_table.loc[technosphere_table['flow_type'].eq('Energy')]
material_df = technosphere_table.loc[technosphere_table['flow_type'].eq('Material use')]

In [14]:
# Preview the technosphere rows whose flow_type is 'Energy'.
energy_df

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data
...,...,...,...,...,...,...,...,...,...,...,...
226,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023
227,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023
228,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023
229,TECH-7607a50e-2023-6,2023,Energy,Natural gas,GJ,221612.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023


## Energy flows

In [15]:
from core.conversion_functions import standardize_energy_to_MJ

In [16]:
# Standardize the energy flows with the project helper. In the output shown
# below, the result keeps the input columns and adds value_MJ, unit_source,
# assumption_note, unit_standard and needs_factor.
energy_df_sd = standardize_energy_to_MJ(energy_df)

In [17]:
# Preview the energy flows after standardization to MJ.
energy_df_sd

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data,1.847565e+04,direct_unit,gj→MJ factor=1000.0,MJ,False
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data,7.267611e+07,direct_unit,gj→MJ factor=1000.0,MJ,False
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data,2.870424e+08,direct_unit,gj→MJ factor=1000.0,MJ,False
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data,1.356845e+07,direct_unit,gj→MJ factor=1000.0,MJ,False
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,SRC_NewmontCorporation_2023-Performance-data,4.207104e+07,direct_unit,gj→MJ factor=1000.0,MJ,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,1.401000e+08,direct_unit,gj→MJ factor=1000.0,MJ,False
227,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,2.124000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False
228,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,6.344000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False
229,TECH-7607a50e-2023-6,2023,Energy,Natural gas,GJ,221612.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,2.216120e+08,direct_unit,gj→MJ factor=1000.0,MJ,False


## Material flows

In [18]:
from core.conversion_functions import standardize_materials_to_t

In [19]:
# Standardize the material flows with the project helper. In the output shown
# below, the result keeps the input columns and adds mass_t, mass_source,
# mass_note and needs_density.
material_df_sd = standardize_materials_to_t(material_df)

In [20]:
# Preview the material flows after standardization; in the displayed rows,
# entries flagged needs_density=True have no mass_t value.
material_df_sd

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,mass_t,mass_source,mass_note,needs_density
35,TECH-6dc537e6-2023-2,2023,Material use,Cement,t,27374.0,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,27374.0,t,reported in tonnes,False
39,TECH-6dc537e6-2023-6,2023,Material use,Lime,t,2229.0,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,2229.0,t,reported in tonnes,False
40,TECH-6dc537e6-2023-7,2023,Material use,Lubricants,kl,380.687,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,335.00456,L×density→t,kl→L × density kg/L; density=0.88,False
41,TECH-6dc537e6-2023-8,2023,Material use,Nitric acid (HNO3),l,26575.0,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,,missing_density,volume reported; no density mapping for this s...,True
43,TECH-6dc537e6-2023-10,2023,Material use,Sodium cyanide (NaCN),t,838.8,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,838.8,t,reported in tonnes,False
44,TECH-6dc537e6-2023-11,2023,Material use,Sulfuric acid (H2SO4),l,73.96,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,,missing_density,volume reported; no density mapping for this s...,True
45,TECH-6dc537e6-2023-12,2023,Material use,Tires,t,241.250567,,QC-MAIN-6dc537e6,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,241.250567,t,reported in tonnes,False
65,TECH-687b8c8d-2023-6,2023,Material use,Total sodium cyanide used,t,245.0,,ON-MAIN-687b8c8d,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,245.0,t,reported in tonnes,False
66,TECH-687b8c8d-2023-7,2023,Material use,Total blasting agents used e.g. ANFO,t,486.3,,ON-MAIN-687b8c8d,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,486.3,t,reported in tonnes,False
116,TECH-fefeaee4-2023-3,2023,Material use,Cement,t,9417.6,,ON-MAIN-fefeaee4,,CMP-4a434d72,SRC_NewmontCorporation_Newmont-2023-Performanc...,9417.6,t,reported in tonnes,False


In [22]:
# Save the standardized energy and material tables (index=False: the row index is not written).
energy_df_sd.to_excel(r'data/MetalliCan/cleaned_data/energy_df_sd.xlsx', index=False)
material_df_sd.to_excel(r'data/MetalliCan/cleaned_data/material_df_sd.xlsx', index=False)

# Normalization of technosphere and biosphere flows by production values

## Check production_data_available

In [23]:
def _unique_list(values):
    """Return the distinct entries of a Series as a plain list."""
    return values.unique().tolist()


# For each (main_id, facility_group_id) pair, list the commodities, reference
# points and material types that have production records.
# dropna=False keeps the groups in which one of the two keys is missing.
production_data_available = (
    production_table
    .groupby(['main_id', 'facility_group_id'], dropna=False)
    .agg(
        commodities=('commodity', _unique_list),
        reference_points=('reference_point', _unique_list),
        material_types=('material_type', _unique_list),
    )
    .reset_index()
)

In [25]:
# Combine the availability summary with main_table through the project helper
# (its merge logic lives in core.data_manipulations and is not visible here).
# NOTE(review): production_data_available already has a 'commodities' column from
# the aggregation above, and the same helper yields a 'commodities' column on the
# normalized tables later on — confirm how the name clash is resolved.
production_data_available = merge_main_and_group(production_data_available, main_table)

In [37]:
from core.data_manipulations import prepare_normalization_data, normalize_by_production

In [38]:
normalization_production_df = prepare_normalization_data(production_table)
# Drop the 'Usable ore' reference point of main_id YT-MAIN-44857446: a row is
# kept when it belongs to another facility or has another reference point.
normalization_production_df = normalization_production_df.loc[
    normalization_production_df['main_id'].ne('YT-MAIN-44857446')
    | normalization_production_df['reference_point'].ne('Usable ore')
]

In [39]:
# Save the production reference used for normalization, without the row index.
# NOTE(review): written to the current working directory, unlike the other
# exports that go under data/MetalliCan/ — confirm this is intended.
normalization_production_df.to_csv(r'normalization_production_df.csv', index=False)

## Work on the production table

## Technosphere flows

In [40]:
# Normalize the energy flows (value_MJ) by production (value_tonnes). In the
# output shown below, value_normalized equals value_MJ / value_tonnes_match,
# i.e. MJ per tonne, and normalization_key records which id was matched.
energy_df_sd_norm = normalize_by_production(energy_df_sd, normalization_production_df, value_col='value_MJ', prod_col='value_tonnes')

In [41]:
# Preview the energy flows normalized by production.
energy_df_sd_norm

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,...,value_MJ,unit_source,assumption_note,unit_standard,needs_factor,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,...,1.847565e+04,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,0.111299,main_id
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,...,7.267611e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,437.807896,main_id
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,...,2.870424e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,1729.171369,main_id
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,...,1.356845e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,81.737651,main_id
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,...,4.207104e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,253.440008,main_id
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,1.401000e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,48.678844,main_id
178,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2.124000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,0.738000,main_id
179,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,6.344000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,2.204273,main_id
180,TECH-7607a50e-2023-6,2023,Energy,Natural gas,GJ,221612.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2.216120e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,77.000827,main_id


In [42]:
# Normalize the material flows (mass_t) by production (value_tonnes), using the
# same project helper as for the energy flows.
material_df_sd_norm = normalize_by_production(material_df_sd, normalization_production_df, value_col='mass_t', prod_col='value_tonnes')

In [43]:
# Save the normalized energy and material tables (index=False: the row index is not written).
energy_df_sd_norm.to_excel(r'data/MetalliCan/cleaned_data/energy_df_sd_norm.xlsx', index=False)
material_df_sd_norm.to_excel(r'data/MetalliCan/cleaned_data/material_df_sd_norm.xlsx', index=False)

## Biosphere flows

In [44]:
# Normalize the environmental flows by production (value_tonnes). The 'value'
# column mixes units (kg, tonnes, m3, ...), so value_normalized is expressed in
# each row's own unit per tonne.
biosphere_df_norm = normalize_by_production(env_table, normalization_production_df, value_col='value', prod_col='value_tonnes')

In [45]:
# Preview the environmental flows normalized by production.
biosphere_df_norm

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key
0,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,0.63600,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...,441588.0,,441588.0,1.440257e-06,main_id
1,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,0.11600,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...,441588.0,,441588.0,2.626883e-07,main_id
2,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,76.20800,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...,441588.0,,441588.0,1.725772e-04,main_id
3,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,0.00105,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...,441588.0,,441588.0,2.377782e-09,main_id
4,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,0.18000,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...,441588.0,,441588.0,4.076198e-07,main_id
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5119,ENV-9de9bb0d-2023-8,2023,Air,NA - M16,Emission,,tonnes,0.99000,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables,191148.0,,191148.0,5.179233e-06,main_id
5120,ENV-9de9bb0d-2023-9,2023,Water,7732-18-5,Withdrawal,,m3,168115.00000,Water withdrawal,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables,191148.0,,191148.0,8.795017e-01,main_id
5121,ENV-9de9bb0d-2023-10,2023,Water,7732-18-5,Withdrawal,,m3,168115.00000,Water withdrawal|Freshwater withdrawn,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables,191148.0,,191148.0,8.795017e-01,main_id
5122,ENV-9de9bb0d-2023-11,2023,Water,7732-18-5,Consumption,,m3,82144.00000,Total water consumption,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables,191148.0,,191148.0,4.297403e-01,main_id


In [46]:
# Unit label for value_normalized: the row's own unit followed by '/t' (per tonne).
biosphere_df_norm['unit_normalized'] = biosphere_df_norm['unit'] + '/t'

In [48]:
# Save the normalized environmental flows (index=False: the row index is not written).
biosphere_df_norm.to_excel(r'data/MetalliCan/cleaned_data/biosphere_df_norm.xlsx', index=False)

# Clean land occupation table

In [53]:
# Preview the land occupation table as loaded (one row per polygon).
land_table

Unnamed: 0,land_occupation_id,area_km2,geometry,distance_km,main_id,tailing_id,source_id
0,45954,2.622800,POLYGON Z ((-62.920418000502096 44.98665598524...,0.000000,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
1,45955,0.441869,POLYGON Z ((-62.92612599959345 44.991678995543...,1.186584,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
2,7631,0.209038,POLYGON Z ((-53.8260233402639 47.4043112260747...,0.811781,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
3,7641,0.113569,POLYGON Z ((-53.7468445297622 47.4086824659858...,4.647047,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
4,7633,0.213592,POLYGON Z ((-53.81396413042625 47.413655936064...,0.000000,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
...,...,...,...,...,...,...,...
789,QC-LI-219a2e44,2.926809,MULTIPOLYGON Z (((-79.08481557072447 48.229685...,,QC-MAIN-30c1828c,,https://www.nature.com/articles/s41597-025-052...
790,BC-LI-9c3aa818,0.122644,POLYGON Z ((-130.49619675383414 56.64850845409...,,BC-MAIN-b1fe389a,,https://www.nature.com/articles/s41597-025-052...
791,ON-LI-1a9a061c,4.202254,MULTIPOLYGON Z (((-81.36899483562593 48.678821...,,ON-MAIN-f8313ebd,,https://www.nature.com/articles/s41597-025-052...
792,QC-LI-94bd8222,6.202897,POLYGON Z ((-79.22195784417586 49.574666146450...,,QC-MAIN-b86f7d07,,https://www.nature.com/articles/s41597-025-052...


In [54]:
# Step 1: main_id values whose polygons come from more than one source
source_counts = land_table.groupby('main_id')['source_id'].nunique()
multi_source_main_ids = source_counts.loc[source_counts.gt(1)].index

# Step 2: keep every row of single-source facilities; for multi-source
# facilities keep only the rows coming from the preferred source.
# NOTE(review): a multi-source facility with no row from the preferred source
# ends up with no rows at all — confirm this cannot happen in the data.
is_single_source = ~land_table['main_id'].isin(multi_source_main_ids)
is_preferred_source = land_table['source_id'].eq("https://www.nature.com/articles/s41597-025-05296-y")
land_table = land_table.loc[is_single_source | is_preferred_source]

In [55]:
# Preview the land occupation table after the source filter.
land_table

Unnamed: 0,land_occupation_id,area_km2,geometry,distance_km,main_id,tailing_id,source_id
0,45954,2.622800,POLYGON Z ((-62.920418000502096 44.98665598524...,0.000000,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
1,45955,0.441869,POLYGON Z ((-62.92612599959345 44.991678995543...,1.186584,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
2,7631,0.209038,POLYGON Z ((-53.8260233402639 47.4043112260747...,0.811781,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
3,7641,0.113569,POLYGON Z ((-53.7468445297622 47.4086824659858...,4.647047,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
4,7633,0.213592,POLYGON Z ((-53.81396413042625 47.413655936064...,0.000000,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
...,...,...,...,...,...,...,...
789,QC-LI-219a2e44,2.926809,MULTIPOLYGON Z (((-79.08481557072447 48.229685...,,QC-MAIN-30c1828c,,https://www.nature.com/articles/s41597-025-052...
790,BC-LI-9c3aa818,0.122644,POLYGON Z ((-130.49619675383414 56.64850845409...,,BC-MAIN-b1fe389a,,https://www.nature.com/articles/s41597-025-052...
791,ON-LI-1a9a061c,4.202254,MULTIPOLYGON Z (((-81.36899483562593 48.678821...,,ON-MAIN-f8313ebd,,https://www.nature.com/articles/s41597-025-052...
792,QC-LI-94bd8222,6.202897,POLYGON Z ((-79.22195784417586 49.574666146450...,,QC-MAIN-b86f7d07,,https://www.nature.com/articles/s41597-025-052...


In [56]:
# Total area_km2 per main_id. From here on land_table holds one row per main_id
# with only the main_id and area_km2 columns.
land_table = land_table.groupby('main_id')['area_km2'].sum().reset_index()

In [57]:
# Preview the summed area per main_id.
land_table

Unnamed: 0,main_id,area_km2
0,BC-MAIN-23155c25,1.499690
1,BC-MAIN-3ef4f421,1.396089
2,BC-MAIN-3f490561,7.967835
3,BC-MAIN-4724f4ba,0.416737
4,BC-MAIN-599152a0,13.233210
...,...,...
111,SK-MAIN-9dd2b7f8,4.345047
112,SK-MAIN-bb89158f,10.235647
113,SK-MAIN-d3c471e8,1.973892
114,YT-MAIN-44857446,5.293594


In [58]:
# Attach facility descriptors from main_table. land_table holds one row per
# main_id at this point, so each row must match at most one main_table record.
# validate='many_to_one' makes pandas raise instead of silently duplicating
# areas if main_table ever contains a repeated main_id.
land_table = land_table.merge(
    main_table[[
        'main_id',
        'facility_group_id',
        'facility_name',
        'facility_group_name',
        'province',
        'facility_type',
        'mining_processing_type',
        'commodities',
    ]],
    on='main_id',
    how='left',
    validate='many_to_one',
)

In [59]:
# Preview the per-facility areas with the main_table descriptors attached.
land_table

Unnamed: 0,main_id,area_km2,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities
0,BC-MAIN-23155c25,1.499690,,Myra Falls,,British Columbia,mining,Underground,"Zinc, copper, silver, gold, lead"
1,BC-MAIN-3ef4f421,1.396089,,Avanti Kitsault,,British Columbia,project,,"Molybdenum, silver, lead"
2,BC-MAIN-3f490561,7.967835,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver"
3,BC-MAIN-4724f4ba,0.416737,,Elk,,British Columbia,mining,Open-pit,Gold
4,BC-MAIN-599152a0,13.233210,,Copper Mountain,,British Columbia,mining,"Open-pit, concentrator","Copper, gold, silver"
...,...,...,...,...,...,...,...,...,...
111,SK-MAIN-9dd2b7f8,4.345047,,Horseshoe-Raven,,Saskatchewan,project,,Uranium
112,SK-MAIN-bb89158f,10.235647,GRP-21eee27d,Key Lake,Key Lake + McArthur River,Saskatchewan,mining,Concentrator,Uranium
113,SK-MAIN-d3c471e8,1.973892,GRP-21eee27d,McArthur River,Key Lake + McArthur River,Saskatchewan,mining,Underground,Uranium
114,YT-MAIN-44857446,5.293594,,Keno Hill Silver District,,Yukon,mining,"Underground, concentrator","Silver, zinc, lead"


In [60]:
# Keep only the facilities whose facility_type is 'mining' (the table also lists e.g. 'project').
land_table_mining = land_table[land_table['facility_type'] == 'mining']

In [61]:
# Save the land occupation of mining facilities (index=False: the row index is not written).
land_table_mining.to_excel(r'data/MetalliCan/land_table_mining.xlsx', index=False)

# Add commodity information

In [47]:
# Run each normalized table through the project helper together with main_table.
# In the output shown below, the energy table gains facility_name,
# facility_group_name, mining_processing_type and commodities.
energy_df_sd_norm, material_df_sd_norm, biosphere_df_norm = (
    merge_main_and_group(table, main_table)
    for table in (energy_df_sd_norm, material_df_sd_norm, biosphere_df_norm)
)

In [48]:
# Preview the normalized energy flows with the facility metadata attached.
energy_df_sd_norm

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,...,needs_factor,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key,facility_name,facility_group_name,mining_processing_type,commodities
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,0.111299,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,437.807896,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,1729.171369,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,81.737651,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,253.440008,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,48.678844,main_id,Young-Davidson,,"Underground, concentrator",Gold
178,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,0.738000,main_id,Young-Davidson,,"Underground, concentrator",Gold
179,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,2.204273,main_id,Young-Davidson,,"Underground, concentrator",Gold
180,TECH-7607a50e-2023-6,2023,Energy,Natural gas,GJ,221612.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,77.000827,main_id,Young-Davidson,,"Underground, concentrator",Gold


In [49]:
# Number of energy-flow rows per commodity combination.
energy_df_sd_norm['commodities'].value_counts()

commodities
Gold                                                   75
Gold, silver                                           56
Gold, copper, silver                                   14
Copper, gold, silver                                    9
Iron                                                    6
Copper, molybdenum, silver                              5
Silver, zinc, lead                                      5
Gold, zinc, copper, silver, cadmium                     5
Gold, zinc                                              4
Platinum group metals, gold, nickel, copper, cobalt     3
Name: count, dtype: int64

In [50]:
# Number of material-flow rows per commodity combination.
material_df_sd_norm['commodities'].value_counts()

commodities
Gold, silver            25
Gold                    18
Gold, copper, silver     7
Name: count, dtype: int64

In [51]:
# Number of environmental-flow rows per commodity combination.
biosphere_df_norm['commodities'].value_counts()

commodities
Gold                                                                                                                                                                                                1033
Steel                                                                                                                                                                                                793
Gold, silver                                                                                                                                                                                         508
Iron                                                                                                                                                                                                 287
Nickel, copper, platinum group metals, gold, silver, cobalt, selenium, tellurium                                                                                                        

In [52]:
# Save the three normalized tables, now carrying the facility metadata, under
# data/MetalliCan/. The file names match the earlier exports in
# data/MetalliCan/cleaned_data/, which were written before the metadata merge.
energy_df_sd_norm.to_excel(r'data/MetalliCan/energy_df_sd_norm.xlsx', index=False)
material_df_sd_norm.to_excel(r'data/MetalliCan/material_df_sd_norm.xlsx', index=False)
biosphere_df_norm.to_excel(r'data/MetalliCan/biosphere_df_norm.xlsx', index=False)