In [1]:
import pandas as pd

In [2]:
from utils.data_manipulations import merge_main_and_group, aggregate_biosphere_facility_groups

In [3]:
metallican_path = r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db'

# Import MetalliCan tables

In [4]:
main_table = pd.read_csv(metallican_path + r'\database\CSV\main_table.csv')
production_table = pd.read_csv(metallican_path + r'\database\CSV\production_table.csv')
tech_attributes_table = pd.read_csv(metallican_path + r'\database\CSV\tech_attributes_table.csv')
env_table = pd.read_csv(metallican_path + r'\database\CSV\environmental_flows_table.csv')
technosphere_table = pd.read_csv(metallican_path + r'\database\CSV\materials_energy_table.csv')
archetypes_table = pd.read_csv(metallican_path + r'\database\CSV\archetypes_table.csv')
land_table = pd.read_csv(metallican_path + r'\database\CSV\land_occupation_table.csv')
intensity_table = pd.read_csv(metallican_path + r'\database\CSV\intensity_table.csv')
waste_table = pd.read_table(metallican_path + r'\database\CSV\waste_table.csv', sep=',')
substances_table = pd.read_csv(metallican_path + r'\database\CSV\substances_table.csv')
#npv_table = pd.read_csv(metallican_path + r'\database\CSV\npv_table.csv')

In [5]:
intensity_table_nrj = intensity_table[intensity_table['type'] == 'Energy']
intensity_table_nrj = intensity_table_nrj[~intensity_table_nrj['intensity_id'].str.startswith('INT-CMP')]

In [6]:
# Let's remove ClimateTRACE data from production and env tables
production_table = production_table[~production_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]
env_table = env_table[~env_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]

In [7]:
production_table = merge_main_and_group(production_table, main_table, cols_to_add=['facility_name', 'facility_group_name'])
ta_table = merge_main_and_group(tech_attributes_table, main_table, cols_to_add=['facility_name', 'facility_group_name'])

In [8]:
production_table.to_csv(r'data\MetalliCan\pre_cleaned_data\production_table.csv', index=False, encoding='utf-8-sig')
ta_table.to_csv(r'data\MetalliCan\pre_cleaned_data\ta_table.csv', index=False, encoding='utf-8-sig')

In [56]:
# Let's remove ClimateTRACE data
env_table = env_table[~env_table['source_id'].str.startswith('Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory')]

In [57]:
#ids_energy = set(energy_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_material = set(material_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_biosphere = set(biosphere_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
#ids_land = set(land_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))

In [58]:
# Facilities with at least technosphere and biosphere flows
#best_ids = ids_energy & ids_material & ids_biosphere
#best_ids

## Harmonize units for the same substance_id

In [59]:
unit_mapping = {
'7732-18-5' : 'm3',
'NA - M10' : 'tonnes',
'NA - M09' : 'tonnes',
'NA - M16' : 'tonnes',
'630-08-0' : 'tonnes',
'NA - NOx' : 'tonnes',
'NA - 08' : 'kg',
'NA - 10' : 'tonnes',
'NA - 02' : 'kg',
'NA - M08' : 'tonnes',
'NA - GHG' : 'tCO2eq',
'NA - 12': 'kg',
'NA - SOx': 'tonnes'
}

In [60]:
conversion_factors = {
    ('kg', 'tonnes'): 1/1000,
    ('kg', 't'): 1/1000,
    ('tonnes', 'kg'): 1000,
    ('t', 'kg'): 1000,
    ('tonnes', 't'): 1,
    ('t', 'tonnes'): 1,
    ('ML', 'm3'): 1000,
    ('m3', 'ML'): 1/1000,
    ('L', 'm3'): 1/1000,
    ('m3', 'L'): 1000,
    ('Mm3', 'm3'): 1000000,
    ('km3', 'm3'): 1000,
    ('ktCO2eq', 'tCO2eq'): 1000,
    ('tCO2eq', 'ktCO2eq'): 1/1000
}


In [61]:
def update_units_and_values(row):
    substance_id = row['substance_id']
    current_unit = row['unit']
    value = row['value']

    if substance_id in unit_mapping:
        target_unit = unit_mapping[substance_id]

        if current_unit != target_unit:
            conversion_key = (current_unit, target_unit)
            if conversion_key in conversion_factors:
                value = value * conversion_factors[conversion_key]
            # Si la conversion n'est pas définie, la valeur reste inchangée

        current_unit = target_unit  # Mettre à jour l'unité

    return pd.Series({'unit': current_unit, 'value': value})


env_table['value'] = pd.to_numeric(env_table['value'], errors='coerce')
mask = env_table['substance_id'].isin(unit_mapping.keys())
env_table.loc[mask, ['unit', 'value']] = env_table.loc[mask].apply(update_units_and_values, axis=1)

In [62]:
env_table

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,facility_group_id,company_id,source_id
28,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,6.360000e-01,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
29,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,1.160000e-01,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
30,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,7.620800e+01,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
31,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,1.050000e-03,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
32,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,1.800000e-01,,ON-MAIN-1f126a43,,CMP-6265c407,https://www.canada.ca/en/environment-climate-c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5338,ENV-9de9bb0d-2023-12,2023,Air,NA - M16,Emission,,tonnes,9.900000e-01,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables
5339,ENV-02884fb5-2023-11,2023,Water,7732-18-5,Consumption,,m3,-8.937000e+05,Consumed,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...
5340,ENV-02884fb5-2023-12,2023,Water,7732-18-5,Withdrawal,,m3,5.145100e+06,Withdrawal|Freshwater,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...
5341,ENV-7607a50e-2023-15,2023,Water,7732-18-5,Discharged,,m3,1.088500e+06,Total water discharge,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023


## Aggregate data at facility-group level

In [63]:
biosphere_df = aggregate_biosphere_facility_groups(env_table, remove_individuals='True')

In [64]:
# To have the substance names
biosphere_df = biosphere_df.merge(substances_table[['substance_id', 'substance_name']], how='left',
                                        on='substance_id')

In [65]:
biosphere_df.to_excel(r'data/MetalliCan/pre_cleaned_data/biosphere_df.xlsx', index=False)

# Harmonization of technosphere units

In [66]:
# Separe energy and material
energy_df = technosphere_table[technosphere_table['flow_type'] == 'Energy']
material_df = technosphere_table[technosphere_table['flow_type'] == 'Material use']

## Energy flows

In [67]:
from utils.conversion_functions import standardize_energy_to_MJ

In [68]:
# Replace the unit 'kl' to 'm3' for Natural gas to avoid issues during conversion
energy_df.loc[(energy_df['subflow_type'] == 'Natural gas') & (energy_df['unit'] == 'kl'), 'unit'] = 'm3'

In [69]:
energy_df_sd = standardize_energy_to_MJ(energy_df)

In [70]:
energy_df_sd[energy_df_sd['value_MJ'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
41,TECH-ed23117f-2023-1,2023,Energy,Diesel,tCO2eq,68884.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for diesel [tco2eq],,True
101,TECH-ed23117f-2023-2,2023,Energy,Electricity consumption|Not specified,tCO2eq,7075.0,62% come from the grid.,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No LHV for subflow=electricity consumption,,True
113,TECH-ed23117f-2023-3,2023,Energy,Explosives,tCO2eq,690.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for explosives [tco2eq],,True
128,TECH-ed23117f-2023-4,2023,Energy,Gasoline,tCO2eq,1557.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for gasoline [tco2eq],,True
203,TECH-ed23117f-2023-5,2023,Energy,Propane,tCO2eq,1362.0,Initially in tCO2eq.,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for propane [tco2eq],,True


In [71]:
# Check the rows where needs_factor is TRUE
energy_df_sd[energy_df_sd['value_MJ'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
41,TECH-ed23117f-2023-1,2023,Energy,Diesel,tCO2eq,68884.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for diesel [tco2eq],,True
101,TECH-ed23117f-2023-2,2023,Energy,Electricity consumption|Not specified,tCO2eq,7075.0,62% come from the grid.,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No LHV for subflow=electricity consumption,,True
113,TECH-ed23117f-2023-3,2023,Energy,Explosives,tCO2eq,690.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for explosives [tco2eq],,True
128,TECH-ed23117f-2023-4,2023,Energy,Gasoline,tCO2eq,1557.0,,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for gasoline [tco2eq],,True
203,TECH-ed23117f-2023-5,2023,Energy,Propane,tCO2eq,1362.0,Initially in tCO2eq.,BC-MAIN-ed23117f,,CMP-fa032e20,SRC_CenterraGoldInc_Data,,missing_factor,No conversion rule for propane [tco2eq],,True


In [72]:
energy_df_sd.to_excel(r'data/MetalliCan/pre_cleaned_data/energy_df.xlsx', index=False)

## Material flows

In [None]:
from utils.conversion_functions import standardize_materials_to_t

In [74]:
material_df_sd = standardize_materials_to_t(material_df)

In [75]:
# Check the rows where mass_t is NaN
material_df_sd[material_df_sd['mass_t'].isna()]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,mass_t,mass_source,mass_note,needs_density


In [76]:
material_df_sd.to_excel(r'data/MetalliCan/pre_cleaned_data/material_df.xlsx', index=False)

# Harmonization of land flows

In [77]:
# Step 1: Find main_id that have more than one unique source_id
multi_source_main_ids = land_table.groupby('main_id')['source_id'].nunique()
multi_source_main_ids = multi_source_main_ids[multi_source_main_ids > 1].index

# Step 2: Filter the dataframe
land_table = land_table[
    ~land_table['main_id'].isin(multi_source_main_ids) |
    (land_table['source_id'] == "https://www.nature.com/articles/s41597-025-05296-y")
]

In [78]:
# For each main_id, give the sum of area_km2 associated
land_table = land_table.groupby('main_id')['area_km2'].sum().reset_index()

In [79]:
land_table = land_table.merge(main_table[['main_id', 'facility_group_id', 'facility_name', 'facility_group_name', 'province', 'facility_type', 'mining_processing_type', 'commodities', 'operation_periods']], on='main_id', how='left')

In [80]:
land_table_mining = land_table[land_table['facility_type'] == 'mining']

In [81]:
# Add a column with area in m2 and
land_table_mining['area_m2'] = land_table_mining['area_km2'] * 1e6
land_table_mining.drop(columns=['area_km2'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  land_table_mining['area_m2'] = land_table_mining['area_km2'] * 1e6
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  land_table_mining.drop(columns=['area_km2'], inplace=True)


In [82]:
land_table_mining

Unnamed: 0,main_id,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities,operation_periods,area_m2
0,BC-MAIN-23155c25,,Myra Falls,,British Columbia,mining,Underground,"Zinc, copper, silver, gold, lead",1966–1985; 2002–2015; 2019–open,1.499690e+06
2,BC-MAIN-3f490561,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver",,7.967835e+06
3,BC-MAIN-4724f4ba,,Elk,,British Columbia,mining,Open-pit,Gold,,4.167369e+05
4,BC-MAIN-599152a0,,Copper Mountain,,British Columbia,mining,"Open-pit, concentrator","Copper, gold, silver",1884–1958; 2011–open,1.323321e+07
5,BC-MAIN-6b4800fe,,Gibraltar,,British Columbia,mining,"Open-pit, concentrator","Copper, molybdenum, silver",1972–1998; 2004–open,2.252800e+07
...,...,...,...,...,...,...,...,...,...,...
110,SK-MAIN-91cf5448,,Cigar Lake,,Saskatchewan,mining,Underground,Uranium,2015–open,1.600590e+06
112,SK-MAIN-bb89158f,GRP-21eee27d,Key Lake,Key Lake + McArthur River,Saskatchewan,mining,Concentrator,Uranium,,1.023565e+07
113,SK-MAIN-d3c471e8,GRP-21eee27d,McArthur River,Key Lake + McArthur River,Saskatchewan,mining,Underground,Uranium,,1.973892e+06
114,YT-MAIN-44857446,,Keno Hill Silver District,,Yukon,mining,"Underground, concentrator","Silver, zinc, lead",,5.293594e+06


In [83]:
land_table_mining.columns

Index(['main_id', 'facility_group_id', 'facility_name', 'facility_group_name',
       'province', 'facility_type', 'mining_processing_type', 'commodities',
       'operation_periods', 'area_m2'],
      dtype='object')

In [84]:
import pandas as pd

def aggregate_land_facility_groups(
    df: pd.DataFrame,
    by=("facility_group_id",),                    # -> 1 ligne par facility_group_id
    sum_cols=("area_m2",),                        # colonnes à sommer
    concat_cols=("operation_periods", "commodities", "mining_processing_type"),  # concat unique
    first_cols=("province", "facility_type", "company_id"),  # on prend la 1ère valeur non-nulle
    remove_individuals: bool=False
) -> pd.DataFrame:
    """
    Agrège les enregistrements par facility_group_id (1 ligne par groupe).
    - Somme les colonnes numériques définies dans `sum_cols`
    - Concatène de façon unique et nettoyée les colonnes de `concat_cols`
    - Recopie la première valeur non-nulle pour `first_cols`
    - Crée de nouvelles lignes agrégées (main_id=None, facility_name="")
    - Optionnellement supprime les lignes individuelles du groupe (remove_individuals=True)

    Paramètres
    ----------
    df : pd.DataFrame
    by : tuple[str]
        Clés d'agrégation. Par défaut uniquement ("facility_group_id",)
        -> garantit 1 ligne par facility group ID.
    sum_cols : tuple[str]
        Colonnes à sommer.
    concat_cols : tuple[str]
        Colonnes à concaténer (valeurs uniques, trim, dédoublonnées).
    first_cols : tuple[str]
        Colonnes pour lesquelles on prend la première valeur non-nulle.
    remove_individuals : bool
        Si True, supprime les enregistrements individuels des groupes agrégés.

    Retour
    ------
    pd.DataFrame
        DataFrame combinant les lignes d'origine (ou non) et les lignes agrégées.
    """

    df = df.copy()

    # --- Helpers -------------------------------------------------------------
    def first_non_null(series: pd.Series):
        s = series.dropna()
        if not s.empty:
            # Evite les chaînes vides considérées comme "non-informatives"
            for v in s:
                if isinstance(v, str) and v.strip() == "":
                    continue
                return v
            # si tout est vide/"" -> retourner NaN
            return pd.NA
        return pd.NA

    def concat_unique(series: pd.Series):
        """Concatène des valeurs uniques en nettoyant les séparateurs et espaces.
        Gère des entrées déjà séparées par des virgules.
        """
        if series.isna().all():
            return pd.NA
        tokens = []
        for v in series.dropna().astype(str):
            # scinder sur virgule si présent
            parts = [p.strip() for p in v.split(",") if p.strip() != ""]
            tokens.extend(parts)
        # unicité en préservant l'ordre d'apparition
        seen = set()
        uniq = []
        for t in tokens:
            key = t.lower()  # unicité case-insensitive
            if key not in seen:
                seen.add(key)
                uniq.append(t)
        return ", ".join(uniq) if uniq else pd.NA

    # S'assurer que la colonne facility_group_id existe et filtrer les groupes valides
    if "facility_group_id" not in df.columns:
        raise KeyError("La colonne 'facility_group_id' est absente du DataFrame.")

    df_groups = df[df["facility_group_id"].notna()].copy()
    if df_groups.empty:
        # rien à agréger : retourner df tel quel
        return df

    # Construire le dict d'aggregations
    agg_dict = {}

    for c in sum_cols:
        if c in df_groups.columns:
            agg_dict[c] = "sum"

    for c in concat_cols:
        if c in df_groups.columns:
            agg_dict[c] = concat_unique

    for c in first_cols:
        if c in df_groups.columns:
            agg_dict[c] = first_non_null

    # Groupby UNIQUEMENT par les clés "by" (par défaut: facility_group_id)
    grouped = (
        df_groups
        .groupby(list(by), dropna=False)
        .agg(agg_dict)
        .reset_index()
    )

    # Ajouter/forcer les champs des lignes agrégées
    grouped["main_id"] = None
    grouped["facility_name"] = ""  # vide pour les lignes agrégées
    grouped["comment"] = grouped.get("comment", pd.Series(index=grouped.index, dtype="object"))
    grouped["comment"] = grouped["comment"].fillna("Aggregated value from multiple facilities")

    # Réordonner/réaligner les colonnes sur le df original
    # (on garde toutes les colonnes d'origine ; s'il en manque, on les crée)
    for col in df.columns:
        if col not in grouped.columns:
            grouped[col] = pd.NA
    grouped = grouped[df.columns]

    # Optionnellement supprimer les indiv. des groupes agrégés
    if remove_individuals:
        to_remove = df_groups["facility_group_id"].unique()
        base = df[~df["facility_group_id"].isin(to_remove)].copy()
    else:
        base = df

    # Combiner
    combined = pd.concat([base, grouped], ignore_index=True)

    # (Optionnel) garantir au maximum 1 ligne agrégée par group_id
    # -> si jamais des duplications surviennent, on garde la première.
    mask_agg = (combined["main_id"].isna()) & (combined["facility_name"] == "")
    dupe_mask = mask_agg & combined.duplicated(subset=list(by) + ["main_id", "facility_name"], keep="first")
    if dupe_mask.any():
        combined = combined.loc[~dupe_mask].copy()

    return combined


In [85]:
land_table_mining = aggregate_land_facility_groups(land_table_mining)

In [86]:
land_table_mining

Unnamed: 0,main_id,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities,operation_periods,area_m2
0,BC-MAIN-23155c25,,Myra Falls,,British Columbia,mining,Underground,"Zinc, copper, silver, gold, lead",1966–1985; 2002–2015; 2019–open,1.499690e+06
1,BC-MAIN-3f490561,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver",,7.967835e+06
2,BC-MAIN-4724f4ba,,Elk,,British Columbia,mining,Open-pit,Gold,,4.167369e+05
3,BC-MAIN-599152a0,,Copper Mountain,,British Columbia,mining,"Open-pit, concentrator","Copper, gold, silver",1884–1958; 2011–open,1.323321e+07
4,BC-MAIN-6b4800fe,,Gibraltar,,British Columbia,mining,"Open-pit, concentrator","Copper, molybdenum, silver",1972–1998; 2004–open,2.252800e+07
...,...,...,...,...,...,...,...,...,...,...
83,,GRP-25483238,,,Quebec,mining,Open-pit,Ilmenite,1948–open,2.331099e+06
84,,GRP-2a663492,,,Newfoundland and Labrador,mining,Open-pit,Iron,,1.034080e+07
85,,GRP-a13779f8,,,Manitoba,mining,"Concentrator, Underground","Gold, zinc, copper, silver","2013–open, 1949–2000; 2021–open",3.104131e+06
86,,GRP-dc07540b,,,Ontario,mining,"Underground, Concentrator","Nickel, copper, platinum group metals, gold, s...",2014–open,2.941696e+07


In [87]:
land_table_mining.to_excel(r'data/MetalliCan/pre_cleaned_data/land_table_mining.xlsx', index=False)

In [88]:
# Which ones do not have operation periods?
land_table_mining[land_table_mining['operation_periods'].isna()]

Unnamed: 0,main_id,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities,operation_periods,area_m2
1,BC-MAIN-3f490561,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver",,7967835.0
2,BC-MAIN-4724f4ba,,Elk,,British Columbia,mining,Open-pit,Gold,,416736.9
7,BC-MAIN-bf503b6b,,Highland Valley,,British Columbia,mining,"Open-pit, concentrator","Copper, silver, molybdenum",,63924260.0
9,MB-MAIN-0898e255,GRP-a13779f8,Stall Lake,Snow Lake,Manitoba,mining,Concentrator,"Gold, zinc",,330003.3
10,MB-MAIN-915d9faa,,Tanco,,Manitoba,mining,"Underground, concentrator, plant","Cesium, lithium",,1058100.0
13,MB-MAIN-e0a6250e,,Thompson (T-1 and T-3),,Manitoba,mining,"Open-pit, underground, concentrator","Nickel, cobalt, copper, platinum group metals,...",,7481998.0
15,NL-MAIN-842ba1b4,,Nugget Pond,,Newfoundland and Labrador,mining,Concentrator,Copper,,163254.6
16,NL-MAIN-b64bae7a,,Scully,,Newfoundland and Labrador,mining,"Open-pit, concentrator",Iron,,10572190.0
17,NL-MAIN-c139de6d,,Ming,,Newfoundland and Labrador,mining,Underground,"Copper, gold, silver",,1510729.0
18,NL-MAIN-dd723db4,,Carol Lake,,Newfoundland and Labrador,mining,"Open-pit, concentrator",Iron,,26104650.0


# Normalization

## Prepare data for normalization

In [37]:
from utils.data_manipulations import build_activity_name, add_site_id

In [38]:
# Load prices and production data
price_df = pd.read_excel(r'data/Prices/Prices_data.xlsx', sheet_name='data')
production_df = pd.read_excel(r'data/MetalliCan/sites_for_lci.xlsx', sheet_name='prod_data')

In [39]:
# Keep only relevant columns
energy_df_sd = energy_df_sd[['main_id', 'facility_group_id', 'flow_type', 'subflow_type', 'value_MJ']]
material_df_sd = material_df_sd[['main_id', 'facility_group_id', 'flow_type', 'subflow_type', 'mass_t']]
biosphere_df = biosphere_df[['main_id', 'facility_group_id', 'substance_name', 'unit', 'value']]

In [40]:
# Add activitiy_name to production_df
production_df['activity_name'] = production_df.apply(lambda row: build_activity_name(row, production_df), axis=1)

In [41]:
energy_df_sd

Unnamed: 0,main_id,facility_group_id,flow_type,subflow_type,value_MJ
4,BC-MAIN-857b7b89,,Energy,Acetylene,1.847565e+04
5,BC-MAIN-8eb8be0d,,Energy,Acetylene,2.000000e+03
6,,GRP-147b3123,Energy,Ammonium nitrate,2.070000e+04
7,QC-MAIN-02884fb5,,Energy,ANFO,1.530900e+06
11,BC-MAIN-857b7b89,,Energy,Aviation fuel,7.267611e+07
...,...,...,...,...,...
210,,GRP-14bfbb82,Energy,Propane,4.017700e+07
211,,GRP-a13779f8,Energy,Propane,4.840000e+07
212,QC-MAIN-02884fb5,,Energy,Propane,3.300000e+04
217,ON-MAIN-fefeaee4,,Energy,Solar,1.483200e+06


In [42]:
production_df = add_site_id(production_df)
energy_df_sd = add_site_id(energy_df_sd)
material_df_sd = add_site_id(material_df_sd)
biosphere_df = add_site_id(biosphere_df)

In [43]:
energy_df_sd = energy_df_sd.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')
material_df_sd = material_df_sd.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')
biosphere_df = biosphere_df.merge(production_df[['site_id', 'activity_name']], on='site_id', how='left')

In [44]:
# Replace column name mass_t to mass for normalization function
material_df_sd = material_df_sd.rename(columns={'mass_t': 'mass'})

In [45]:
energy_df_sd

Unnamed: 0,main_id,facility_group_id,flow_type,subflow_type,value_MJ,site_id,activity_name
0,BC-MAIN-857b7b89,,Energy,Acetylene,1.847565e+04,BC-MAIN-857b7b89,"Au, Underground mining and beneficiation at Br..."
1,BC-MAIN-8eb8be0d,,Energy,Acetylene,2.000000e+03,BC-MAIN-8eb8be0d,"Au and Cu, Open-pit mining and beneficiation a..."
2,,GRP-147b3123,Energy,Ammonium nitrate,2.070000e+04,GRP-147b3123,"Au and Ag, Underground mining and beneficiatio..."
3,QC-MAIN-02884fb5,,Energy,ANFO,1.530900e+06,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio..."
4,BC-MAIN-857b7b89,,Energy,Aviation fuel,7.267611e+07,BC-MAIN-857b7b89,"Au, Underground mining and beneficiation at Br..."
...,...,...,...,...,...,...,...
168,,GRP-14bfbb82,Energy,Propane,4.017700e+07,GRP-14bfbb82,"Au, Underground mining and beneficiation at Se..."
169,,GRP-a13779f8,Energy,Propane,4.840000e+07,GRP-a13779f8,"Au and Ag and Cu and Zn, Underground mining an..."
170,QC-MAIN-02884fb5,,Energy,Propane,3.300000e+04,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio..."
171,ON-MAIN-fefeaee4,,Energy,Solar,1.483200e+06,ON-MAIN-fefeaee4,"Au, Underground mining and beneficiation at Mu..."


## Normalize flows

In [46]:
from core.lci_database_builder import normalize_flows

In [47]:
energy_norm_ore = normalize_flows(energy_df_sd, production_df, mode='ore', value_col='value_MJ')
energy_norm_mass = normalize_flows(energy_df_sd, production_df, mode='metal', allocation='mass', value_col='value_MJ')
energy_norm_econ = normalize_flows(energy_df_sd, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='value_MJ')

In [48]:
energy_norm_ore

Unnamed: 0,main_id,facility_group_id,flow_type,subflow_type,value_MJ,site_id,activity_name,ore_processed_t,value_normalized,functional_unit,allocation_factor,normalization_key
0,BC-MAIN-857b7b89,,Energy,Acetylene,1.847565e+04,BC-MAIN-857b7b89,"Au, Underground mining and beneficiation at Br...",166000.0,0.111299,Ore processed,1,ore
1,BC-MAIN-8eb8be0d,,Energy,Acetylene,2.000000e+03,BC-MAIN-8eb8be0d,"Au and Cu, Open-pit mining and beneficiation a...",1139000.0,0.001756,Ore processed,1,ore
2,,GRP-147b3123,Energy,Ammonium nitrate,2.070000e+04,GRP-147b3123,"Au and Ag, Underground mining and beneficiatio...",1574000.0,0.013151,Ore processed,1,ore
3,QC-MAIN-02884fb5,,Energy,ANFO,1.530900e+06,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio...",1034000.0,1.480561,Ore processed,1,ore
4,BC-MAIN-857b7b89,,Energy,Aviation fuel,7.267611e+07,BC-MAIN-857b7b89,"Au, Underground mining and beneficiation at Br...",166000.0,437.807896,Ore processed,1,ore
...,...,...,...,...,...,...,...,...,...,...,...,...
168,,GRP-14bfbb82,Energy,Propane,4.017700e+07,GRP-14bfbb82,"Au, Underground mining and beneficiation at Se...",122000.0,329.319672,Ore processed,1,ore
169,,GRP-a13779f8,Energy,Propane,4.840000e+07,GRP-a13779f8,"Au and Ag and Cu and Zn, Underground mining an...",1562479.0,30.976416,Ore processed,1,ore
170,QC-MAIN-02884fb5,,Energy,Propane,3.300000e+04,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio...",1034000.0,0.031915,Ore processed,1,ore
171,ON-MAIN-fefeaee4,,Energy,Solar,1.483200e+06,ON-MAIN-fefeaee4,"Au, Underground mining and beneficiation at Mu...",1028000.0,1.442802,Ore processed,1,ore


In [49]:
energy_norm_econ

Unnamed: 0,main_id,facility_group_id,flow_type,subflow_type,value_MJ,site_id,activity_name,metal,mass_t,allocation_factor,value_normalized,functional_unit,normalization_key
0,BC-MAIN-857b7b89,,Energy,Acetylene,1.847565e+04,BC-MAIN-857b7b89,"Au, Underground mining and beneficiation at Br...",Au,0.902002,1.000000,2.048295e+04,"Au, usable ore",metal_economic
1,BC-MAIN-8eb8be0d,,Energy,Acetylene,2.000000e+03,BC-MAIN-8eb8be0d,"Au and Cu, Open-pit mining and beneficiation a...",Au,0.155518,0.243397,3.130155e+03,"Au, usable ore",metal_economic
2,BC-MAIN-8eb8be0d,,Energy,Acetylene,2.000000e+03,BC-MAIN-8eb8be0d,"Au and Cu, Open-pit mining and beneficiation a...",Cu,3628.736000,0.756603,4.170064e-01,"Cu, usable ore",metal_economic
3,,GRP-147b3123,Energy,Ammonium nitrate,2.070000e+04,GRP-147b3123,"Au and Ag, Underground mining and beneficiatio...",Au,4.133655,0.998440,4.999863e+03,"Au, usable ore",metal_economic
4,,GRP-147b3123,Energy,Ammonium nitrate,2.070000e+04,GRP-147b3123,"Au and Ag, Underground mining and beneficiatio...",Ag,0.497656,0.001560,6.488424e+01,"Ag, usable ore",metal_economic
...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,,GRP-a13779f8,Energy,Propane,4.840000e+07,GRP-a13779f8,"Au and Ag and Cu and Zn, Underground mining an...",Zn,34642.000000,0.181297,2.532983e+02,"Zn, usable ore",metal_economic
302,QC-MAIN-02884fb5,,Energy,Propane,3.300000e+04,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio...",Au,2.892625,0.994898,1.135011e+04,"Au, usable ore",metal_economic
303,QC-MAIN-02884fb5,,Energy,Propane,3.300000e+04,QC-MAIN-02884fb5,"Au and Ag, Underground mining and beneficiatio...",Ag,1.143178,0.005102,1.472927e+02,"Ag, usable ore",metal_economic
304,ON-MAIN-fefeaee4,,Energy,Solar,1.483200e+06,ON-MAIN-fefeaee4,"Au, Underground mining and beneficiation at Mu...",Au,5.598630,1.000000,2.649220e+05,"Au, usable ore",metal_economic


In [50]:
material_norm_ore = normalize_flows(material_df_sd, production_df, mode='ore', value_col='mass')
material_norm_mass = normalize_flows(material_df_sd, production_df, mode='metal', allocation='mass', value_col='mass')
material_norm_econ = normalize_flows(material_df_sd, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='mass')

In [51]:
biosphere_norm_ore = normalize_flows(biosphere_df, production_df, mode='ore', value_col='value')
biosphere_norm_mass = normalize_flows(biosphere_df, production_df, mode='metal', allocation='mass', value_col='value')
biosphere_norm_econ = normalize_flows(biosphere_df, production_df, price_df=price_df, mode='metal', allocation='economic', value_col='value')

In [52]:
# Only where activity name is not null
#biosphere_norm_ore = biosphere_norm_ore[~biosphere_norm_ore['activity_name'].isna()]
#biosphere_norm_ore

# Exports normalized dataframes

In [53]:
energy_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/energy_df.csv', index=False)
material_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/material_df.csv', index=False)
biosphere_norm_ore.to_csv(r'data/MetalliCan/data_for_lci_initialization/ore_normalization/biosphere_df.csv', index=False)

In [54]:
energy_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/energy_df.csv', index=False)
material_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/material_df.csv', index=False)
biosphere_norm_econ.to_csv(r'data/MetalliCan/data_for_lci_initialization/economic_allocation/biosphere_df.csv', index=False)