In [1]:
import pandas as pd

In [2]:
from utils.data_manipulations import merge_main_and_group, aggregate_biosphere_facility_groups

In [3]:
metallican_path = r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db'

# Import MetalliCan tables

In [4]:
main_table = pd.read_csv(metallican_path + r'\database\CSV\main_table.csv')
production_table = pd.read_csv(metallican_path + r'\database\CSV\production_table.csv')
tech_attributes_table = pd.read_csv(metallican_path + r'\database\CSV\tech_attributes_table.csv')
env_table = pd.read_csv(metallican_path + r'\database\CSV\environmental_flows_table.csv')
technosphere_table = pd.read_csv(metallican_path + r'\database\CSV\materials_energy_table.csv')
archetypes_table = pd.read_csv(metallican_path + r'\database\CSV\archetypes_table.csv')
land_table = pd.read_csv(metallican_path + r'\database\CSV\land_occupation_table.csv')
intensity_table = pd.read_csv(metallican_path + r'\database\CSV\intensity_table.csv')
waste_table = pd.read_table(metallican_path + r'\database\CSV\waste_table.csv', sep=',')
substances_table = pd.read_csv(metallican_path + r'\database\CSV\substances_table.csv')

In [5]:
intensity_table_nrj = intensity_table[intensity_table['type'] == 'Energy']
intensity_table_nrj = intensity_table_nrj[~intensity_table_nrj['intensity_id'].str.startswith('INT-CMP')]

In [6]:
# Let's remove ClimateTRACE data from production and env tables
production_table = production_table[~production_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]
env_table = env_table[~env_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]

In [7]:
production_table = merge_main_and_group(production_table, main_table, cols_to_add=['facility_name', 'facility_group_name'])
ta_table = merge_main_and_group(tech_attributes_table, main_table, cols_to_add=['facility_name', 'facility_group_name'])

In [8]:
production_table.to_csv(r'data\MetalliCan\pre_cleaned_data\production_table.csv', index=False, encoding='utf-8-sig')
ta_table.to_csv(r'data\MetalliCan\pre_cleaned_data\ta_table.csv', index=False, encoding='utf-8-sig')

In [9]:
# Let's remove ClimateTRACE data
env_table = env_table[~env_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]

In [10]:
#ids_energy = set(energy_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_material = set(material_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_biosphere = set(biosphere_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_land = set(land_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))

In [11]:
# Facilities with at least technosphere and biosphere flows
#best_ids = ids_energy & ids_material & ids_biosphere
#best_ids

## Harmonize units for the same substance_id

In [12]:
unit_mapping = {
'7732-18-5' : 'm3',
'NA - M10' : 'tonnes',
'NA - M09' : 'tonnes',
'NA - M16' : 'tonnes',
'630-08-0' : 'tonnes',
'NA - NOx' : 'tonnes',
'NA - 08' : 'kg',
'NA - 10' : 'tonnes',
'NA - 02' : 'kg',
'NA - M08' : 'tonnes',
'NA - GHG' : 'tCO2eq',
'NA - 12': 'kg',
'NA - SOx': 'tonnes'
}

In [13]:
conversion_factors = {
    ('kg', 'tonnes'): 1/1000,
    ('kg', 't'): 1/1000,
    ('tonnes', 'kg'): 1000,
    ('t', 'kg'): 1000,
    ('tonnes', 't'): 1,
    ('t', 'tonnes'): 1,
    ('ML', 'm3'): 1000,
    ('m3', 'ML'): 1/1000,
    ('L', 'm3'): 1/1000,
    ('m3', 'L'): 1000,
    ('Mm3', 'm3'): 1000000,
    ('km3', 'm3'): 1000,
    ('ktCO2eq', 'tCO2eq'): 1000,
    ('tCO2eq', 'ktCO2eq'): 1/1000
}


In [14]:
def update_units_and_values(row):
    substance_id = row['substance_id']
    current_unit = row['unit']
    value = row['value']

    if substance_id in unit_mapping:
        target_unit = unit_mapping[substance_id]

        if current_unit != target_unit:
            conversion_key = (current_unit, target_unit)
            if conversion_key in conversion_factors:
                value = value * conversion_factors[conversion_key]
            # Si la conversion n'est pas définie, la valeur reste inchangée

        current_unit = target_unit  # Mettre à jour l'unité

    return pd.Series({'unit': current_unit, 'value': value})


env_table['value'] = pd.to_numeric(env_table['value'], errors='coerce')
mask = env_table['substance_id'].isin(unit_mapping.keys())
env_table.loc[mask, ['unit', 'value']] = env_table.loc[mask].apply(update_units_and_values, axis=1)

In [15]:
env_table

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,facility_group_id,company_id,source_id
28,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,0.63600,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
29,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,0.11600,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
30,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,76.20800,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
31,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,0.00105,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
32,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,0.18000,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5359,ENV-9de9bb0d-2023-8,2023,Air,NA - M16,Emission,,tonnes,0.99000,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5360,ENV-9de9bb0d-2023-9,2023,Water,7732-18-5,Withdrawal,,m3,168115.00000,Water withdrawal,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5361,ENV-9de9bb0d-2023-10,2023,Water,7732-18-5,Withdrawal,,m3,168115.00000,Water withdrawal|Freshwater withdrawn,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5362,ENV-9de9bb0d-2023-11,2023,Water,7732-18-5,Consumption,,m3,82144.00000,Total water consumption,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables


## Aggregate data at facility-group level

In [16]:
biosphere_df = aggregate_biosphere_facility_groups(env_table, remove_individuals='True')

In [17]:
# To have the substance names
biosphere_df = biosphere_df.merge(substances_table[['substance_id', 'substance_name']], how='left',
                                        on='substance_id')

In [18]:
biosphere_df.to_excel(r'data/MetalliCan/pre_cleaned_data/biosphere_df.xlsx', index=False)

# Harmonization of technosphere units

In [19]:
# Separe energy and material
energy_df = technosphere_table[technosphere_table['flow_type'] == 'Energy']
material_df = technosphere_table[technosphere_table['flow_type'] == 'Material use']

## Energy flows

In [20]:
from utils.conversion_functions import standardize_energy_to_MJ

In [21]:
# Replace the unit 'kl' to 'm3' for Natural gas to avoid issues during conversion
energy_df.loc[(energy_df['subflow_type'] == 'Natural gas') & (energy_df['unit'] == 'kl'), 'unit'] = 'm3'

In [22]:
energy_df_sd = standardize_energy_to_MJ(energy_df)

In [23]:
energy_df_sd[energy_df_sd['value_MJ'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
109,TECH-ed23117f-2023-1,2023,Energy,Electricity consumption|Not specified,tCO2eq,7075.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No LHV for subflow=electricity consumption,,True
110,TECH-ed23117f-2023-2,2023,Energy,Diesel,tCO2eq,68884.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for diesel [tco2eq],,True
111,TECH-ed23117f-2023-3,2023,Energy,Explosives,tCO2eq,690.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for explosives [tco2eq],,True
112,TECH-ed23117f-2023-4,2023,Energy,Gasoline,tCO2eq,1557.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for gasoline [tco2eq],,True
113,TECH-ed23117f-2023-5,2023,Energy,Propane,tCO2eq,1362.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for propane [tco2eq],,True


In [24]:
# Check the rows where needs_factor is TRUE
energy_df_sd[energy_df_sd['value_MJ'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
109,TECH-ed23117f-2023-1,2023,Energy,Electricity consumption|Not specified,tCO2eq,7075.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No LHV for subflow=electricity consumption,,True
110,TECH-ed23117f-2023-2,2023,Energy,Diesel,tCO2eq,68884.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for diesel [tco2eq],,True
111,TECH-ed23117f-2023-3,2023,Energy,Explosives,tCO2eq,690.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for explosives [tco2eq],,True
112,TECH-ed23117f-2023-4,2023,Energy,Gasoline,tCO2eq,1557.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for gasoline [tco2eq],,True
113,TECH-ed23117f-2023-5,2023,Energy,Propane,tCO2eq,1362.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for propane [tco2eq],,True


In [25]:
energy_df_sd.to_excel(r'data/MetalliCan/pre_cleaned_data/energy_df.xlsx', index=False)

## Material flows

In [26]:
from utils.conversion_functions import standardize_materials_to_t

In [27]:
material_df_sd = standardize_materials_to_t(material_df)

In [28]:
# Check the rows where mass_t is NaN
material_df_sd[material_df_sd['mass_t'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,mass_t,mass_source,mass_note,needs_density


In [29]:
material_df_sd.to_excel(r'data/MetalliCan/pre_cleaned_data/material_df.xlsx', index=False)

# Harmonization of land flows

In [30]:
# Step 1: Find main_id that have more than one unique source_id
multi_source_main_ids = land_table.groupby('main_id')['source_id'].nunique()
multi_source_main_ids = multi_source_main_ids[multi_source_main_ids > 1].index

# Step 2: Filter the dataframe
land_table = land_table[
    ~land_table['main_id'].isin(multi_source_main_ids) |
    (land_table['source_id'] == "https://www.nature.com/articles/s41597-025-05296-y")
]

In [31]:
# For each main_id, give me the sum of area_km2 associated
land_table = land_table.groupby('main_id')['area_km2'].sum().reset_index()

In [32]:
land_table = land_table.merge(main_table[['main_id', 'facility_group_id', 'facility_name', 'facility_group_name', 'province', 'facility_type', 'mining_processing_type', 'commodities']], on='main_id', how='left')

In [33]:
land_table_mining = land_table[land_table['facility_type'] == 'mining']

In [34]:
# Add a column with area in m2
land_table_mining['area_m2'] = land_table_mining['area_km2'] * 1e6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  land_table_mining['area_m2'] = land_table_mining['area_km2'] * 1e6


In [35]:
land_table_mining

Unnamed: 0,main_id,area_km2,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities,area_m2
0,BC-MAIN-23155c25,1.499690,,Myra Falls,,British Columbia,mining,Underground,"Zinc, copper, silver, gold, lead",1.499690e+06
2,BC-MAIN-3f490561,7.967835,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver",7.967835e+06
3,BC-MAIN-4724f4ba,0.416737,,Elk,,British Columbia,mining,Open-pit,Gold,4.167369e+05
4,BC-MAIN-599152a0,13.233210,,Copper Mountain,,British Columbia,mining,"Open-pit, concentrator","Copper, gold, silver",1.323321e+07
5,BC-MAIN-6b4800fe,22.527999,,Gibraltar,,British Columbia,mining,"Open-pit, concentrator","Copper, molybdenum, silver",2.252800e+07
...,...,...,...,...,...,...,...,...,...,...
110,SK-MAIN-91cf5448,1.600590,,Cigar Lake,,Saskatchewan,mining,Underground,Uranium,1.600590e+06
112,SK-MAIN-bb89158f,10.235647,GRP-21eee27d,Key Lake,Key Lake + McArthur River,Saskatchewan,mining,Concentrator,Uranium,1.023565e+07
113,SK-MAIN-d3c471e8,1.973892,GRP-21eee27d,McArthur River,Key Lake + McArthur River,Saskatchewan,mining,Underground,Uranium,1.973892e+06
114,YT-MAIN-44857446,5.293594,,Keno Hill Silver District,,Yukon,mining,"Underground, concentrator","Silver, zinc, lead",5.293594e+06


In [36]:
land_table_mining.to_excel(r'data/MetalliCan/pre_cleaned_data/land_table_mining.xlsx', index=False)

# Normalization

## Prepare data for normalization

In [37]:
from utils.data_manipulations import build_activity_name, add_site_id

In [38]:
# Load prices and production data
price_df = pd.read_excel(r'data/Prices/Prices_data.xlsx', sheet_name='data')
production_df = pd.read_excel(r'data/MetalliCan/sites_for_lci.xlsx', sheet_name='prod_data')

In [39]:
# Keep only relevant columns
energy_df_sd = energy_df_sd[['main_id', 'facility_group_id', 'flow_type', 'subflow_type', 'value_MJ']]
material_df_sd = material_df_sd[['main_id', 'facility_group_id', 'flow_type', 'subflow_type', 'mass_t']]
biosphere_df = biosphere_df[['main_id', 'facility_group_id', 'substance_name', 'unit', 'value']]

In [40]:
# Add activitiy_name to production_df
production_df['activity_name'] = production_df.apply(lambda row: build_activity_name(row, production_df), axis=1)

In [41]:
production_df = add_site_id(production_df)
energy_df_sd = add_site_id(energy_df_sd)
material_df_sd = add_site_id(material_df_sd)
biosphere_df = add_site_id(biosphere_df)

In [42]:
energy_df_sd = energy_df_sd.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')
material_df_sd = material_df_sd.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')
biosphere_df = biosphere_df.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')

In [43]:
# Replace column name mass_t to mass for normalization function
material_df_sd = material_df_sd.rename(columns={'mass_t': 'mass'})

## Normalize flows

In [44]:
from core.lci_database_builder import normalize_flows

In [45]:
energy_norm_ore = normalize_flows(energy_df_sd, production_df, mode='ore', value_col='value_MJ')
energy_norm_mass = normalize_flows(energy_df_sd, production_df, mode='metal', allocation='mass', value_col='value_MJ')
energy_norm_econ = normalize_flows(energy_df_sd, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='value_MJ')

In [46]:
material_norm_ore = normalize_flows(material_df_sd, production_df, mode='ore', value_col='mass')
material_norm_mass = normalize_flows(material_df_sd, production_df, mode='metal', allocation='mass', value_col='mass')
material_norm_econ = normalize_flows(material_df_sd, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='mass')

In [47]:
biosphere_norm_ore = normalize_flows(biosphere_df, production_df, mode='ore', value_col='value')
biosphere_norm_mass = normalize_flows(biosphere_df, production_df, mode='metal', allocation='mass', value_col='value')
biosphere_norm_econ = normalize_flows(biosphere_df, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='value')

In [48]:
# Only where activity name is not null
#biosphere_norm_ore = biosphere_norm_ore[~biosphere_norm_ore['activity_name'].isna()]
#biosphere_norm_ore

# Exports normalized dataframes

In [49]:
energy_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/energy_df.csv', index=False)
material_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/material_df.csv', index=False)
biosphere_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/biosphere_df.csv', index=False)

In [50]:
energy_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/energy_df.csv', index=False)
material_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/material_df.csv', index=False)
biosphere_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/biosphere_df.csv', index=False)