In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from typing import List, Tuple

In [4]:
# Custom functions
from some_functions import merge_without_suffixes, normalize_by_production, get_info_for_ids, create_sankey_diagram, get_production_data

In [5]:
# Root folder of the local MetalliCan database checkout.
# Set the METALLICAN_DB_PATH environment variable to run the notebook on
# another machine; the original location is kept as the fallback so existing
# setups keep working unchanged.
metallican_path = os.environ.get(
    'METALLICAN_DB_PATH',
    r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db',
)

# Import MetalliCan tables

In [6]:
# All MetalliCan tables are comma-separated files stored in <root>\database\CSV.
_metallican_csv_dir = metallican_path + r'\database\CSV'


def _read_metallican_csv(file_name):
    """Read one comma-separated table from the MetalliCan CSV folder."""
    return pd.read_csv(_metallican_csv_dir + '\\' + file_name)


main_table = _read_metallican_csv('main_table.csv')
production_table = _read_metallican_csv('production_table.csv')
tech_attributes_table = _read_metallican_csv('tech_attributes_table.csv')
env_table = _read_metallican_csv('environmental_flows_table.csv')
technosphere_table = _read_metallican_csv('materials_energy_table.csv')
archetypes_table = _read_metallican_csv('archetypes_table.csv')
land_table = _read_metallican_csv('land_occupation_table.csv')
intensity_table = _read_metallican_csv('intensity_table.csv')
waste_table = _read_metallican_csv('waste_table.csv')

In [7]:
# Energy intensities only, without the entries whose id starts with 'INT-CMP'.
intensity_table_nrj = (
    intensity_table
    .loc[lambda table: table['type'] == 'Energy']
    .loc[lambda table: ~table['intensity_id'].str.startswith('INT-CMP')]
)

In [8]:
# Let's remove ClimateTRACE data
CLIMATE_TRACE_SOURCE_PREFIX = 'Jolleys, M. et al (2024). Mineral Extraction sector: Mining and Quarrying Emissions from Copper, Iron, Bauxite, Rock and Sand, Hypervine, UK, Climate TRACE Emissions Inventory'
# na=False: a row with a missing source_id is not a ClimateTRACE row, so it is
# kept. Without it the mask contains NaN and the `~` inversion fails.
is_climate_trace = production_table['source_id'].str.startswith(CLIMATE_TRACE_SOURCE_PREFIX, na=False)
production_table = production_table[~is_climate_trace]

# Harmonization of technosphere units

In [9]:
# Separate energy and material flows
energy_df = technosphere_table.loc[lambda table: table['flow_type'] == 'Energy']
material_df = technosphere_table.loc[lambda table: table['flow_type'] == 'Material use']

## Energy flows

In [10]:
## New version
# --- Direct energy units → MJ ---
# Keys are units after normalization (lower-case, no spaces).
UNIT_TO_MJ = dict(
    mj=1.0,
    gj=1_000.0,
    tj=1_000_000.0,
    j=1e-6,
    wh=0.0036,
    kwh=3.6,
    mwh=3_600.0,
    gwh=3_600_000.0,
)

# --- Volume unit multipliers (to liters) ---
# NOTE(review): units are lower-cased before lookup, so 'mL' (millilitre) and
# 'ML' (megalitre) both arrive here as 'ml' and are read as megalitres.
VOLUME_TO_L = {
    **dict.fromkeys(('l', 'liter', 'litre', 'liters', 'litres'), 1.0),
    **dict.fromkeys(('kl', 'kiloliter', 'kilolitre'), 1_000.0),
    **dict.fromkeys(('ml', 'megaliter', 'megalitre'), 1_000_000.0),
    **dict.fromkeys(('gallon', 'gallons'), 3.78541),
}

# Spellings accepted for cubic metres (multiplier to m3).
CUBIC_M_TO_M3 = dict.fromkeys(('m3', 'm^3', 'cubicmeter', 'cubicmeters'), 1.0)

# --- Default LHVs (edit with site/company data whenever you can) ---
# One entry per canonical subflow; each entry lists the factors available for
# it ('MJ/kg', 'MJ/L', 'MJ/m3', 'MJ/kWh') and, where given, a density in kg/L.
DEFAULT_LHV = {
    'diesel': {
        'MJ/kg': 43.0,
        'MJ/L': 38.6,
        'density_kg_per_L': 0.835,
    },
    'gasoline': {
        'MJ/kg': 44.0,
        'MJ/L': 34.2,
        'density_kg_per_L': 0.745,
    },
    'heavy_fuel_oil': {
        'MJ/kg': 40.5,
        'MJ/L': 39.69,
        'density_kg_per_L': 0.98,
    },
    'coal': {
        'MJ/kg': 25.0,
    },
    'natural_gas': {
        'MJ/m3': 38.0,
        'MJ/L': 22.5,
        'density_kg_per_L': 0.7,
    },
    'propane': {
        'MJ/kg': 46.4,
        'MJ/L': 25.3,
        'density_kg_per_L': 0.493,
    },
    'electricity': {
        'MJ/kWh': 3.6,
    },
    'explosives': {
        'MJ/kg': 4.0,
    },
    'coke': {
        'MJ/kg': 28.0,
    },
    'wood': {
        'MJ/kg': 16.0,
    },
}


# --- Subflow canonicalization (aliases + strip pipe suffixes) ---
SUBFLOW_ALIASES = {
    'petrol': 'gasoline',
    'heavy fuel oil': 'heavy_fuel_oil',
    'hfo': 'heavy_fuel_oil',
    'natural gas': 'natural_gas',
    'explosive': 'explosives',
    'lpg': 'propane',
    'surface/underground_emulsion_&_anfo': 'explosives',
    'grinding_media': 'explosives',
    'total_blasting_agents_used_e.g._anfo': 'explosives'
}

def _norm_unit(x):
    if pd.isna(x): return None
    #n = str(x).strip().lower().replace(' ', '')
    #print(f"Original unit: '{x}', Normalized: '{n}'")  # Debug line
    #return n
    return str(x).strip().lower().replace(' ', '')

def _canon_subflow(x):
    if pd.isna(x): return None
    s = str(x).strip().lower()
    if '|' in s:
        s = s.split('|', 1)[0].strip()
    s = SUBFLOW_ALIASES.get(s, s)
    s_us = s.replace(' ', '_')
    #print(f"Original: '{x}', Normalized: '{s_us}'")  # Debug line
    return s_us

def standardize_energy_to_MJ(
    df,
    subflow_col='subflow_type',
    unit_col='unit',
    value_col='value',
    lhv_table=None
):
    """
    Convert energy/fuel rows to MJ.

    Rows reported in a direct energy unit (a key of ``UNIT_TO_MJ``) are scaled
    directly. Every other row with a subflow and a numeric value is converted
    through the lower heating value (LHV) of its canonical subflow, using the
    factor that matches the reported unit (mass, litre-based volume or m3).

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The frame is copied, not modified.
    subflow_col, unit_col, value_col : str
        Columns holding the subflow label, the unit and the reported value.
    lhv_table : dict, optional
        ``{subflow: {'MJ/kg': ..., 'MJ/L': ..., 'MJ/m3': ...,
        'density_kg_per_L': ...}}``. ``DEFAULT_LHV`` is used when omitted.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``value_col`` coerced to numeric, plus:
        - value_MJ : converted value (NaN when no rule applied)
        - unit_source : 'direct_unit', 'lhv_factor', 'lhv+density' or
          'missing_factor'
        - assumption_note : short text describing the factor used
        - unit_standard : 'MJ' where a value was produced, else None
        - needs_factor : True where a numeric value could not be converted
    """
    # Multipliers from the accepted mass units to kg. A lookup table is used
    # instead of a prefix test so that 'metricton' is scaled as a tonne (it
    # does not start with 't' and was previously treated as kg).
    mass_to_kg = {
        'kg': 1.0, 'kilogram': 1.0, 'kilograms': 1.0,
        't': 1000.0, 'tonne': 1000.0, 'tonnes': 1000.0,
        'metricton': 1000.0, 'ton': 1000.0,
        'lb': 0.453592, 'lbs': 0.453592, 'pound': 0.453592, 'pounds': 0.453592,
    }

    lhv = (lhv_table or DEFAULT_LHV).copy()
    out = df.copy()

    # Normalize
    out['_unit_n'] = out[unit_col].map(_norm_unit)
    out['_subflow_n'] = out[subflow_col].map(_canon_subflow)
    out[value_col] = pd.to_numeric(out[value_col], errors='coerce')

    # Create the result columns up front (unless the input already has them)
    # so they exist with a suitable dtype even when no row is converted.
    for col, dtype in (('value_MJ', float), ('unit_source', object), ('assumption_note', object)):
        if col not in out.columns:
            out[col] = pd.Series(np.nan, index=out.index, dtype=dtype)

    # 1) Direct energy units
    direct_mask = out['_unit_n'].isin(UNIT_TO_MJ)
    out.loc[direct_mask, 'value_MJ'] = (
        out.loc[direct_mask, value_col] *
        out.loc[direct_mask, '_unit_n'].map(UNIT_TO_MJ)
    )
    out.loc[direct_mask, 'unit_source'] = 'direct_unit'
    out.loc[direct_mask, 'assumption_note'] = (
        out.loc[direct_mask, '_unit_n'].map(lambda u: f"{u}→MJ factor={UNIT_TO_MJ[u]}")
    )

    # 2) Fuels via LHV
    fuel_rows = ~direct_mask & out['_subflow_n'].notna() & out[value_col].notna()
    for idx in out.index[fuel_rows]:
        sub = out.at[idx, '_subflow_n']
        unit = out.at[idx, '_unit_n']
        val  = out.at[idx, value_col]
        lhv_data = lhv.get(sub)

        if not lhv_data:
            out.at[idx, 'unit_source'] = 'missing_factor'
            out.at[idx, 'assumption_note'] = f"No LHV for subflow={sub}"
            continue

        converted = False

        # A) Mass units (kg, t, lbs)
        if unit in mass_to_kg:
            factor_kg = lhv_data.get('MJ/kg')
            if factor_kg:
                out.at[idx, 'value_MJ'] = val * mass_to_kg[unit] * factor_kg
                out.at[idx, 'unit_source'] = 'lhv_factor'
                out.at[idx, 'assumption_note'] = f"{sub} MJ/kg={factor_kg} (from {unit})"
                converted = True

        # B) Volume units (L, kL, ML, Gallons)
        elif unit in VOLUME_TO_L:
            mult_L = VOLUME_TO_L[unit]
            factor_l = lhv_data.get('MJ/L')
            # The volumetric factor is preferred; density × MJ/kg is only the
            # fallback when no MJ/L value is listed for the subflow.
            if factor_l is None and lhv_data.get('density_kg_per_L') and lhv_data.get('MJ/kg'):
                dens = lhv_data.get('density_kg_per_L')
                factor_kg = lhv_data.get('MJ/kg')
                mass_kg = val * mult_L * dens
                out.at[idx, 'value_MJ'] = mass_kg * factor_kg
                out.at[idx, 'unit_source'] = 'lhv+density'
                out.at[idx, 'assumption_note'] = f"{sub} L→kg via {dens} kg/L; MJ/kg={factor_kg}"
                converted = True
            elif factor_l:
                out.at[idx, 'value_MJ'] = val * mult_L * factor_l
                out.at[idx, 'unit_source'] = 'lhv_factor'
                out.at[idx, 'assumption_note'] = f"{sub} MJ/L={factor_l}"
                converted = True

        # C) Volume units (m3)
        elif unit in CUBIC_M_TO_M3:
            factor_m3 = lhv_data.get('MJ/m3')
            if factor_m3:
                out.at[idx, 'value_MJ'] = val * CUBIC_M_TO_M3[unit] * factor_m3
                out.at[idx, 'unit_source'] = 'lhv_factor'
                out.at[idx, 'assumption_note'] = f"{sub} MJ/m3={factor_m3}"
                converted = True

        if not converted:
            out.at[idx, 'unit_source'] = 'missing_factor'
            out.at[idx, 'assumption_note'] = f"No conversion rule for subflow={sub}, unit={unit}"

    # Final flags
    out['unit_standard'] = np.where(out['value_MJ'].notna(), 'MJ', None)
    out['needs_factor'] = out['value_MJ'].isna() & out[value_col].notna()
    out = out.drop(columns=['_unit_n', '_subflow_n'], errors='ignore')
    return out

In [11]:
# Convert every energy row to MJ; the result keeps the original columns and
# adds value_MJ plus the columns documenting how each row was converted.
energy_df_sd = standardize_energy_to_MJ(energy_df)

In [12]:
# Inspect the standardized energy table.
energy_df_sd

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,value_MJ,unit_source,assumption_note,unit_standard,needs_factor
0,TECH-e7e6a960-2023-1,2023,Energy,Electricity consumption|Grid electricity,kWh,6.745244e+08,,QC-MAIN-e7e6a960,,CMP-6265c407,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...,2.428288e+09,direct_unit,kwh→MJ factor=3.6,MJ,False
1,TECH-e7e6a960-2023-2,2023,Energy,Diesel,GJ,2.557832e+06,,QC-MAIN-e7e6a960,,CMP-6265c407,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...,2.557832e+09,direct_unit,gj→MJ factor=1000.0,MJ,False
2,TECH-e7e6a960-2023-3,2023,Energy,Light Fuel & Gasoline,GJ,4.283300e+04,,QC-MAIN-e7e6a960,,CMP-6265c407,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...,4.283300e+07,direct_unit,gj→MJ factor=1000.0,MJ,False
3,TECH-e7e6a960-2023-4,2023,Energy,Natural gas,GJ,1.628050e+05,,QC-MAIN-e7e6a960,,CMP-6265c407,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...,1.628050e+08,direct_unit,gj→MJ factor=1000.0,MJ,False
4,TECH-e7e6a960-2023-5,2023,Energy,Propane,GJ,1.463609e+03,,QC-MAIN-e7e6a960,,CMP-6265c407,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...,1.463609e+06,direct_unit,gj→MJ factor=1000.0,MJ,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,TECH-9de9bb0d-2023-2,2023,Energy,Explosives,t,5.060000e+02,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd._ESG_data_2023,2.024000e+06,lhv_factor,explosives MJ/kg=4.0 (from t),MJ,False
228,TECH-9de9bb0d-2023-3,2023,Energy,Gasoline,kl,3.600000e+01,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd._ESG_data_2023,1.231200e+06,lhv_factor,gasoline MJ/L=34.2,MJ,False
229,TECH-9de9bb0d-2023-4,2023,Energy,Natural gas,kl,1.528000e+03,,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd._ESG_data_2023,3.438000e+07,lhv_factor,natural_gas MJ/L=22.5,MJ,False
230,TECH-9de9bb0d-2023-5,2023,Energy,Electricity consumption|Generated on-site,MWh,1.068450e+04,Given in MW in the Excel file but does not mak...,QC-MAIN-9de9bb0d,,CMP-3d2c4955,SRC_WesdomeGoldMinesLtd._ESG_data_2023,3.846420e+07,direct_unit,mwh→MJ factor=3600.0,MJ,False


In [13]:
# Subflow labels of the rows that have a value but no MJ conversion
energy_df_sd.loc[energy_df_sd['needs_factor'], 'subflow_type']

49      Total blasting agents used e.g. ANFO
53     Electricity consumption|Not specified
54                                    Diesel
55                                Explosives
56                                  Gasoline
57                                   Propane
73       Surface/underground emulsion & ANFO
78       Surface/underground emulsion & ANFO
150                           Grinding media
165                           Grinding media
180                           Grinding media
Name: subflow_type, dtype: object

## Material flows

In [14]:
# liters prefixes → L
# This assignment rebinds the VOLUME_TO_L table already defined for the energy
# flows, so it carries the same entries (gallons included). Otherwise,
# re-running the energy standardization after this cell would silently lose
# the gallon conversion.
# NOTE(review): 3.78541 L is the US gallon — confirm no source reports
# imperial gallons.
VOLUME_TO_L = {
    'l': 1.0, 'liter': 1.0, 'litre': 1.0, 'liters': 1.0, 'litres': 1.0,
    'kl': 1_000.0, 'kiloliter': 1_000.0, 'kilolitre': 1_000.0,
    'ml': 1_000_000.0, 'megaliter': 1_000_000.0, 'megalitre': 1_000_000.0,
    'gallon': 3.78541, 'gallons': 3.78541,
}

# Default densities (kg/L) – please override with site-specific values when you have them
# Keys are canonical subflow names (lower-case, single spaces).
DEFAULT_DENSITY = {
    # Oils & lubricants family
    'lubricants': 0.88,
    'hydraulic oil': 0.88,
    'transmission oil': 0.88,
    'motor oil': 0.88,
    'drill oil': 0.88,
    'compressor oil': 0.88,

    # Acids (typical commercial concentrations)
    'sulfuric acid (h2so4)': 1.84,    # ~98%
    'hydrochloric acid (hcl)': 1.19,  # ~37%
    'nitric acid (hno3)': 1.51,       # ~68–70%

    # If you have aqueous reagents (e.g., “sodium cyanide solution”) add their conc/density here.
}

# Canonicalize names (left part before '|', lowercased)
# Keys must be written WITHOUT spaces: the lookup removes every space from the
# lower-cased name before searching this table.
ALIASES = {
    'petrol': 'gasoline',
    'grindingmedia': 'grinding media',
    '3/4\'\'balls': 'grinding media',
    '2\'\'balls': 'grinding media',
    '2.5\'\'balls': 'grinding media',
    '5.5\'\'balls': 'grinding media',
    'polyfrothh57': 'polyfroth h57',
    'antiscalant': 'anti-scalant',
}

def _norm_text(x):
    if pd.isna(x): return None
    return str(x).strip()

def _canon_subflow(s):
    if s is None: return None
    # take leftmost token before a pipe and lowercase
    base = s.split('|', 1)[0].strip().lower()
    # strip extra spaces and collapse doubles
    base = ' '.join(base.split())
    return ALIASES.get(base.replace(' ', ''), base)

def standardize_materials_to_t(df, subflow_col='subflow_type', unit_col='unit', value_col='value',
                               density_table=None):
    """
    Express material rows as a mass in tonnes.

    Tonne and kilogram rows are converted directly; litre-based volumes are
    converted through a density (kg/L) looked up by canonical subflow name in
    ``density_table`` (``DEFAULT_DENSITY`` when omitted). The input frame is
    copied, and ``value_col`` is coerced to numeric in the copy.

    Columns added to the returned frame:
      - mass_t : numeric mass in tonnes
      - mass_source : 't','kg→t','L×density→t','missing_density','unknown_unit'
      - mass_note : short note on the assumption used
      - needs_density : True when a volume row had no density mapping
    """
    densities = {name.lower(): rho for name, rho in (density_table or DEFAULT_DENSITY).items()}
    result = df.copy()

    # Temporary normalized keys (dropped before returning)
    result['_unit_n'] = (
        result[unit_col]
        .astype(str)
        .str.strip()
        .str.lower()
        .str.replace(' ', '', regex=False)
    )
    result['_subflow_n'] = result[subflow_col].map(_canon_subflow)
    result[value_col] = pd.to_numeric(result[value_col], errors='coerce')

    # Rows already reported in tonnes
    in_tonnes = result['_unit_n'].isin({'t','tonne','tonnes','metricton','ton'})
    result.loc[in_tonnes, 'mass_t'] = result.loc[in_tonnes, value_col].astype(float)
    result.loc[in_tonnes, 'mass_source'] = 't'
    result.loc[in_tonnes, 'mass_note'] = 'reported in tonnes'

    # Rows reported in kilograms
    in_kg = result['_unit_n'].isin({'kg','kilogram','kilograms'})
    result.loc[in_kg, 'mass_t'] = result.loc[in_kg, value_col] / 1000.0
    result.loc[in_kg, 'mass_source'] = 'kg→t'
    result.loc[in_kg, 'mass_note'] = 'kg/1000'

    # Rows reported as a litre-based volume: volume → L → kg (density) → t
    in_volume = result['_unit_n'].isin(VOLUME_TO_L)
    if in_volume.any():
        volume_rows = result.loc[in_volume]
        to_litres = volume_rows['_unit_n'].map(VOLUME_TO_L)
        # Density of each row's canonical subflow; NaN when it is not mapped
        density = volume_rows['_subflow_n'].map(
            lambda name: densities.get(name if name else '', np.nan)
        )
        has_density = density.notna()

        result.loc[in_volume, 'mass_t'] = (volume_rows[value_col] * to_litres * density) / 1000.0
        result.loc[in_volume, 'mass_source'] = np.where(has_density, 'L×density→t', 'missing_density')
        note_with_density = (
            volume_rows['_unit_n'].map(str)
            + "→L × density kg/L; density="
            + density.map(lambda rho: f"{rho:g}")
        )
        result.loc[in_volume, 'mass_note'] = np.where(
            has_density,
            note_with_density,
            "volume reported; no density mapping for this subflow"
        )

    # Rows with a value but a unit that none of the rules above recognises
    handled = in_tonnes | in_kg | in_volume
    unhandled_with_value = ~handled & result[value_col].notna()
    result.loc[unhandled_with_value, 'mass_source'] = 'unknown_unit'
    result.loc[unhandled_with_value, 'mass_note'] = 'no rule for this unit'

    result['needs_density'] = (result['mass_source'] == 'missing_density')

    return result.drop(columns=['_unit_n','_subflow_n'])

In [15]:
# Convert every material row to tonnes; the result adds mass_t and the columns
# documenting how each row was converted.
material_df_sd = standardize_materials_to_t(material_df)

In [16]:
# Inspect the standardized material table.
material_df_sd

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,mass_t,mass_source,mass_note,needs_density
41,TECH-687b8c8d-2023-6,2023,Material use,Total sodium cyanide used,t,245.0,,ON-MAIN-687b8c8d,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,245.0,t,reported in tonnes,False
42,TECH-687b8c8d-2023-7,2023,Material use,Total blasting agents used e.g. ANFO,t,486.3,,ON-MAIN-687b8c8d,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,486.3,t,reported in tonnes,False
50,TECH-7607a50e-2023-8,2023,Material use,Total sodium cyanide used,t,1955.7,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,SRC_AlamosGoldInc_Data_2023,1955.7,t,reported in tonnes,False
90,TECH-02884fb5-2023-3,2023,Material use,Anti-scalant,t,35.4,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,35.4,t,reported in tonnes,False
91,TECH-02884fb5-2023-4,2023,Material use,Carbon,t,60.4,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,60.4,t,reported in tonnes,False
92,TECH-02884fb5-2023-5,2023,Material use,Caustic soda,t,86.6,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,86.6,t,reported in tonnes,False
93,TECH-02884fb5-2023-6,2023,Material use,Compressor oil,t,1.5,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,1.5,t,reported in tonnes,False
94,TECH-02884fb5-2023-7,2023,Material use,Cyanide,t,399.3,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,399.3,t,reported in tonnes,False
98,TECH-02884fb5-2023-11,2023,Material use,Flocculant,t,46.2,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,46.2,t,reported in tonnes,False
100,TECH-02884fb5-2023-13,2023,Material use,Grease,t,4.5,,QC-MAIN-02884fb5,,CMP-3d0a95b7,SRC_IAMGOLDCorporation_2023_iamgold-esg-perfor...,4.5,t,reported in tonnes,False


In [17]:
# Volume rows for which no density was found
material_df_sd.loc[material_df_sd['needs_density']]

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,source_id,mass_t,mass_source,mass_note,needs_density


# Normalization of technosphere and biosphere flows by production values

## Check production_data_available

In [18]:
# For every (main_id, facility_group_id) pair, list the commodities,
# reference points and material types present in the production table.
# dropna=False keeps the pairs in which one of the two ids is missing.
def _distinct_as_list(values):
    """Return the distinct values of a group as a list."""
    return values.unique().tolist()


production_data_available = (
    production_table
    .groupby(['main_id', 'facility_group_id'], dropna=False)
    .agg(
        commodities=('commodity', _distinct_as_list),
        reference_points=('reference_point', _distinct_as_list),
        material_types=('material_type', _distinct_as_list),
    )
    .reset_index()
)

In [19]:
# Inspect what production data exists per facility / facility group.
production_data_available

Unnamed: 0,main_id,facility_group_id,commodities,reference_points,material_types
0,AB-MAIN-d3a4aba9,,"[Cobalt, Nickel]",[Refined metal produced],[Refinery production]
1,BC-MAIN-3bb6b7cd,,[Zinc],[Refined metal produced],[Refinery production]
2,BC-MAIN-3f490561,,"[Copper, Gold, Ore]","[Usable ore, Crude ore]","[Concentrate, Contained metal in concentrate, ..."
3,BC-MAIN-4724f4ba,,[Gold],"[Total extraction, Crude ore, Usable ore]","[Ore mined, Ore delivered, Contained metal in ..."
4,BC-MAIN-599152a0,,"[Copper, Gold, Ore, Silver]","[Usable ore, Crude ore, Total extraction]","[Concentrate, Contained metal in concentrate, ..."
...,...,...,...,...,...
78,,GRP-14bfbb82,[Gold],"[Crude ore, Total extraction, Usable ore]","[Ore milled, Ore mined, Contained metal in con..."
79,,GRP-21eee27d,[Uranium],"[Usable ore, Crude ore]","[Contained metal in concentrate, Ore processed]"
80,,GRP-2a663492,[Iron],[Total extraction],[Ore mined]
81,,GRP-7a9ba115,[Steel],[Refined metal produced],"[Long/wire rod, bars, slabs]"


In [20]:
# Facility-level entries (main_id present), enriched with the facility
# description from main_table; one row is kept per main_id.
prod_f = (
    production_data_available
    .loc[production_data_available['main_id'].notna()]
    .merge(
        main_table[['main_id', 'facility_name', 'facility_type', 'mining_processing_type']],
        on='main_id',
        how='left',
    )
    .drop_duplicates(subset=['main_id'], keep='first')
)

In [21]:
# Group-level entries (no main_id but a facility_group_id), enriched with the
# group description from main_table; one row is kept per facility_group_id.
prod_fg = (
    production_data_available
    .loc[
        production_data_available['main_id'].isna()
        & production_data_available['facility_group_id'].notna()
    ]
    .merge(
        main_table[['facility_group_id', 'facility_group_name', 'facility_type', 'mining_processing_type']],
        on='facility_group_id',
        how='left',
    )
    .drop_duplicates(subset=['facility_group_id'], keep='first')
)


In [22]:
# Columns (and their order) of the combined overview table
cols_to_keep = ['main_id', 'facility_name', 'facility_group_id', 'facility_group_name', 'facility_type', 'mining_processing_type', 'commodities', 'reference_points', 'material_types']

# Give both frames the full column set: a column that one of them lacks
# (e.g. facility_group_name for facility-level rows) is added filled with None.
for frame in (prod_f, prod_fg):
    for col in cols_to_keep:
        if col not in frame.columns:
            frame[col] = None

# Same column order in both frames, then stack facility rows above group rows
prod_f, prod_fg = prod_f[cols_to_keep], prod_fg[cols_to_keep]
production_data_available = pd.concat([prod_f, prod_fg])

In [23]:
# Inspect the combined overview (facility rows followed by group rows).
production_data_available

Unnamed: 0,main_id,facility_name,facility_group_id,facility_group_name,facility_type,mining_processing_type,commodities,reference_points,material_types
0,AB-MAIN-d3a4aba9,The Cobalt Refinery Company Inc.,,,manufacturing,Refinery,"[Cobalt, Nickel]",[Refined metal produced],[Refinery production]
1,BC-MAIN-3bb6b7cd,Trail,,,manufacturing,"Smelter, refinery, plant",[Zinc],[Refined metal produced],[Refinery production]
2,BC-MAIN-3f490561,Mount Polley,,,mining,"Open-pit, concentrator","[Copper, Gold, Ore]","[Usable ore, Crude ore]","[Concentrate, Contained metal in concentrate, ..."
3,BC-MAIN-4724f4ba,Elk,,,mining,Open-pit,[Gold],"[Total extraction, Crude ore, Usable ore]","[Ore mined, Ore delivered, Contained metal in ..."
4,BC-MAIN-599152a0,Copper Mountain,,,mining,"Open-pit, concentrator","[Copper, Gold, Ore, Silver]","[Usable ore, Crude ore, Total extraction]","[Concentrate, Contained metal in concentrate, ..."
...,...,...,...,...,...,...,...,...,...
7,,,GRP-14bfbb82,Seabee Gold Operation,mining,Underground,[Gold],"[Crude ore, Total extraction, Usable ore]","[Ore milled, Ore mined, Contained metal in con..."
9,,,GRP-21eee27d,Key Lake + McArthur River,mining,Concentrator,[Uranium],"[Usable ore, Crude ore]","[Contained metal in concentrate, Ore processed]"
11,,,GRP-2a663492,DSO,mining,Open-pit,[Iron],[Total extraction],[Ore mined]
13,,,GRP-7a9ba115,Contrecoeur,manufacturing,Electric arc furnace,[Steel],[Refined metal produced],"[Long/wire rod, bars, slabs]"


In [24]:
# Save the overview to Excel (path relative to the working directory; the
# data/MetalliCan folder must already exist).
production_data_available.to_excel(r'data/MetalliCan/production_data_available.xlsx', index=False)

In [25]:
# def prepare_normalization_data(df):
#     '''
#     Fonction pour nettoyer le DataFrame en vue de la normalisation.
#     Règles :
#     1) Pour chaque (main_id, facility_group_id), garder uniquement les lignes avec data_type = 'Production'.
#     2) Pour chaque (main_id, facility_group_id), garder la ligne avec le reference_point le plus prioritaire.
#        Si plusieurs commodities, les agréger (somme des valeurs, concaténation des commodities).
#     3) Ordre de priorité : 'Crude ore' > 'Total extraction' > 'Intermediate metal produced' > 'Refined metal produced' > 'Usable ore'.
#     Return :
#     cleaned_df : DataFrame nettoyé avec les mêmes colonnes.
#     '''
#     # 1. Filtrer pour ne garder que les lignes avec data_type = 'Production'
#     df_filtered = df[df['data_type'] == 'Production'].copy()
#
#     # 2. Définition des priorités (plus le chiffre est bas, plus la priorité est haute)
#     priority_order = {
#         'Crude ore': 1,
#         'Total extraction': 2,
#         'Intermediate metal produced': 3,
#         'Refined metal produced': 4,
#         'Usable ore': 5
#     }
#
#     # 3. Préparation des clés de groupe
#     df_filtered['facility_group_id'] = df_filtered['facility_group_id'].fillna('NA_ID')
#     id_cols = ['main_id', 'facility_group_id', 'year', 'geography']
#
#     # 4. Créer la colonne de priorité
#     df_filtered['priority'] = df_filtered['reference_point'].map(priority_order).fillna(99)
#
#     # 5. Trier le DataFrame par priorité pour identifier le meilleur niveau
#     df_sorted = df_filtered.sort_values(by=id_cols + ['priority'], ascending=[True] * len(id_cols) + [True])
#
#     # 6. Identification du Meilleur Niveau de Priorité (pour tout le groupe)
#     best_priority_levels = df_sorted.groupby(id_cols)['priority'].min().reset_index().rename(
#         columns={'priority': 'best_priority_level'}
#     )
#
#     # 7. Filtrage pour l'agrégation
#     df_merged = df_sorted.merge(best_priority_levels, on=id_cols, how='left')
#     df_to_sum = df_merged[df_merged['priority'] == df_merged['best_priority_level']].copy()
#
#     # 8. Assurer que les colonnes sont numériques pour la somme
#     df_to_sum['value'] = pd.to_numeric(df_to_sum['value'], errors='coerce')
#     df_to_sum['value_tonnes'] = pd.to_numeric(df_to_sum['value_tonnes'], errors='coerce')
#
#     # 9. Agrégation (Somme des valeurs et concaténation des commodities)
#     aggregated_data = df_to_sum.groupby(id_cols).agg(
#         value_sum=('value', 'sum'),
#         value_tonnes_sum=('value_tonnes', 'sum'),
#         commodity_agg=('commodity', lambda x: ', '.join(sorted(x.unique())))
#     ).reset_index()
#
#     # 10. Extraction des Métadonnées et Fusion
#     metadata_cols = [col for col in df_sorted.columns if col not in ['value', 'value_tonnes', 'priority']]
#     cleaned_df = df_sorted.drop_duplicates(subset=id_cols, keep='first')[metadata_cols]
#
#     # 11. Fusionner les métadonnées avec les valeurs agrégées
#     cleaned_df = cleaned_df.merge(aggregated_data, on=id_cols, how='left')
#
#     # 12. Remplacer les colonnes de valeur et de commodity avec les valeurs agrégées
#     cleaned_df['value'] = cleaned_df['value_sum']
#     cleaned_df['value_tonnes'] = cleaned_df['value_tonnes_sum']
#     cleaned_df['commodity'] = cleaned_df['commodity_agg']
#
#     # 13. Nettoyage Final et Restauration
#     cleaned_df['facility_group_id'] = cleaned_df['facility_group_id'].replace('NA_ID', None)
#
#     # 14. Rétablir l'ordre original des colonnes du DF d'entrée
#     original_cols = df.columns.tolist()
#     cols_to_drop = ['best_priority_level', 'value_sum', 'value_tonnes_sum', 'commodity_agg', 'priority']
#     cleaned_df = cleaned_df.drop(columns=[col for col in cols_to_drop if col in cleaned_df.columns])
#
#     return cleaned_df[original_cols]


In [26]:
def prepare_normalization_data(df):
    '''
    Clean the production table so it can serve as the normalization basis.

    Rules:
    1) Only rows with data_type = 'Production' are kept.
    2) Rows are grouped by (main_id, facility_group_id, year, geography). In
       each group only the rows with the highest-priority reference_point are
       used. If several rows share it (e.g. several commodities), they are
       aggregated: `value` and `value_tonnes` are summed and the commodity
       names are concatenated (sorted, comma-separated).
    3) Priority order: 'Crude ore' > 'Total extraction' >
       'Intermediate metal produced' > 'Refined metal produced' > 'Usable ore'.
       Any other reference point ranks last.

    Notes:
    - A sum is left missing (NaN) when every contributing value is missing,
      instead of becoming 0, so an unknown tonnage cannot be used as a zero
      denominator later on.
    - The non-aggregated columns (unit, material_type, comment, ...) are taken
      from the first row of the winning reference point.
    - NOTE(review): `value` is summed without looking at `unit`; rows of one
      group reported in different units would give a meaningless `value`
      (`value_tonnes` is not affected) — confirm against the data.

    Parameters:
    df : production table; must contain data_type, main_id,
         facility_group_id, year, geography, reference_point, commodity,
         value and value_tonnes.

    Return:
    cleaned_df : cleaned DataFrame, one row per group, with the columns of
                 `df` in their original order. Missing ids are returned as
                 missing values.
    '''
    # 1. Keep only the rows with data_type = 'Production'
    df_filtered = df[df['data_type'] == 'Production'].copy()

    # 2. Priorities (the lower the number, the higher the priority)
    priority_order = {
        'Crude ore': 1,
        'Total extraction': 2,
        'Intermediate metal produced': 3,
        'Refined metal produced': 4,
        'Usable ore': 5
    }

    # 3. Group keys: missing ids get a placeholder so that groupby/merge do
    #    not drop the facility-only and group-only rows
    df_filtered['main_id'] = df_filtered['main_id'].fillna('NA_ID')
    df_filtered['facility_group_id'] = df_filtered['facility_group_id'].fillna('NA_ID')
    id_cols = ['main_id', 'facility_group_id', 'year', 'geography']

    # 4. Priority of each row (unknown reference points rank last)
    df_filtered['priority'] = df_filtered['reference_point'].map(priority_order).fillna(99)

    # 5. Sort so that, within a group, the best-priority rows come first
    df_sorted = df_filtered.sort_values(by=id_cols + ['priority'], ascending=[True] * len(id_cols) + [True])

    # 6. Best priority level available in each group
    best_priority_levels = df_sorted.groupby(id_cols)['priority'].min().reset_index().rename(
        columns={'priority': 'best_priority_level'}
    )

    # 7. Keep only the rows at the best priority level of their group
    df_merged = df_sorted.merge(best_priority_levels, on=id_cols, how='left')
    df_to_sum = df_merged[df_merged['priority'] == df_merged['best_priority_level']].copy()

    # 8. Make sure the columns to sum are numeric
    df_to_sum['value'] = pd.to_numeric(df_to_sum['value'], errors='coerce')
    df_to_sum['value_tonnes'] = pd.to_numeric(df_to_sum['value_tonnes'], errors='coerce')

    def _sum_or_missing(values):
        # min_count=1: the total of an all-missing group stays missing
        return values.sum(min_count=1)

    def _join_commodities(values):
        # Missing commodity names are skipped so sorting/joining cannot fail
        return ', '.join(sorted(values.dropna().astype(str).unique()))

    # 9. Aggregation (sum of the values, concatenation of the commodities)
    aggregated_data = df_to_sum.groupby(id_cols).agg(
        value_sum=('value', _sum_or_missing),
        value_tonnes_sum=('value_tonnes', _sum_or_missing),
        commodity_agg=('commodity', _join_commodities)
    ).reset_index()

    # 10. Metadata: first (best-priority) row of each group
    metadata_cols = [col for col in df_sorted.columns if col not in ['value', 'value_tonnes', 'priority']]
    cleaned_df = df_sorted.drop_duplicates(subset=id_cols, keep='first')[metadata_cols]

    # 11. Attach the aggregated values to the metadata
    cleaned_df = cleaned_df.merge(aggregated_data, on=id_cols, how='left')

    # 12. Use the aggregated values as value / value_tonnes / commodity
    cleaned_df['value'] = cleaned_df['value_sum']
    cleaned_df['value_tonnes'] = cleaned_df['value_tonnes_sum']
    cleaned_df['commodity'] = cleaned_df['commodity_agg']

    # 13. Turn the placeholders back into missing values. `mask` is used
    #     rather than `replace('NA_ID', None)`, whose meaning depends on the
    #     pandas version (older releases forward-fill instead).
    for id_col in ('main_id', 'facility_group_id'):
        cleaned_df[id_col] = cleaned_df[id_col].mask(cleaned_df[id_col] == 'NA_ID')

    # 14. Restore the column order of the input frame
    original_cols = [col for col in df.columns if col in cleaned_df.columns]
    cols_to_drop = ['best_priority_level', 'value_sum', 'value_tonnes_sum', 'commodity_agg', 'priority']
    cleaned_df = cleaned_df.drop(columns=[col for col in cols_to_drop if col in cleaned_df.columns])

    return cleaned_df[original_cols]

In [27]:
normalization_production_df = prepare_normalization_data(production_table)
# Let's remove usable ore for main_id = YT-MAIN-44857446
is_excluded_row = (
    (normalization_production_df['main_id'] == 'YT-MAIN-44857446')
    & (normalization_production_df['reference_point'] == 'Usable ore')
)
normalization_production_df = normalization_production_df[~is_excluded_row]

In [28]:
# Inspect the production values that will be used as normalization basis.
normalization_production_df

Unnamed: 0,prod_id,year,geography,commodity,reference_point,material_type,data_type,unit,value,value_tonnes,comment,main_id,facility_group_id,company_id,source_id
0,PROD-d3a4aba9-2022-1,2022,Canada,"Cobalt, Nickel",Refined metal produced,Refinery production,Production,t,3.563600e+04,3.563600e+04,Deducted as the difference and verified in the SR,AB-MAIN-d3a4aba9,,CMP-2ef1f553,SRC_SherrittInternationalCorporationandGeneral...
1,PROD-3bb6b7cd-2024-1,2024,Canada,Zinc,Refined metal produced,Refinery production,Production,kt,2.560000e+02,2.560000e+05,,BC-MAIN-3bb6b7cd,,CMP-06e20c6e,SRC_TeckResourcesLimited_2024_Annual_report
2,PROD-3f490561-2023-3,2023,Canada,Ore,Crude ore,Ore milled,Production,t,5.948239e+06,5.948239e+06,,BC-MAIN-3f490561,,CMP-0dfcd91e,SRC_ImperialMetalsCorporation_2024-Q4-MDA
3,PROD-4724f4ba-2024-2,2024,Canada,Gold,Crude ore,Ore delivered,Production,t,3.324500e+04,3.324500e+04,,BC-MAIN-4724f4ba,,CMP-d726df1c,SRC_GoldMountainMiningCorp_gold-mountain-repor...
4,PROD-599152a0-2023-4,2023,Canada,Ore,Crude ore,Ore milled,Production,t,6.862152e+06,6.862152e+06,"""As Copper Mountain was acquired on June 20, 2...",BC-MAIN-599152a0,,CMP-12afc634,SRC_HudbayMineralsInc_MDA25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,PROD-f9e41c2a-2023-2,2023,Canada,Gold,Crude ore,Ore milled,Production,t,8.384190e+05,8.384190e+05,,QC-MAIN-f9e41c2a,,CMP-64d60fd7,SRC_EldoradoGold_Data
74,PROD-26b9aaf3-2021-1,2021,Canada,Steel,Intermediate metal produced,EAF production,Production,kt,8.280000e+02,8.280000e+05,,SK-MAIN-26b9aaf3,,CMP-28720916,SRC_EVRAZNorthAmericaInc_Evraz_Regina_Steel_plant
75,PROD-60ba74c4-2022-1,2022,Canada,Uranium,Usable ore,Contained metal in concentrate,Production,mlbs,1.800000e+01,8.164656e+03,Process the high-grade uranium ore which is su...,SK-MAIN-60ba74c4,,CMP-43299e17,SRC_OranoCanadaInc_mining-and-milling
76,PROD-91cf5448-2023-2,2023,Canada,Uranium,Crude ore,Ore processed,Production,mlbs,9.640129e+01,4.372686e+04,"Calculated based on a 15,87% ore grade and 98,...",SK-MAIN-91cf5448,,CMP-cadb529a,SRC_CamecoCorporation_cigar-lake


In [29]:
# Save the normalization basis as CSV in the current working directory.
normalization_production_df.to_csv(r'normalization_production_df.csv', index=False)

## Work on the production table

In [30]:
def normalize_by_production(df, production_df, value_col='value', prod_col='value_tonnes', prod_agg='sum'):
    """
    Divide flow values by the production of the facility they belong to.

    Production is aggregated per ``main_id`` and per ``facility_group_id``;
    each flow row uses the ``main_id`` production when available and falls
    back to the ``facility_group_id`` production otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow rows; must contain ``main_id``, ``facility_group_id`` and
        ``value_col``. Not modified.
    production_df : pandas.DataFrame
        Production rows; must contain ``main_id``, ``facility_group_id`` and
        ``prod_col``. Not modified.
    value_col : str
        Column of ``df`` to normalize.
    prod_col : str
        Column of ``production_df`` holding the production amount.
    prod_agg : str or callable
        Aggregation applied to the production rows sharing a key. With
        ``'sum'`` a key whose production values are all missing stays missing
        instead of becoming 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with added columns ``value_tonnes_main``,
        ``value_tonnes_group``, ``value_tonnes_match`` (production used),
        ``value_normalized`` (NaN when the production is missing or zero) and
        ``normalization_key`` ('main_id', 'facility_group_id' or None).

    Notes
    -----
    NOTE(review): production is aggregated over every row of a key, i.e. over
    all years present in ``production_df``, while flow rows are not matched on
    year. Confirm that each facility has a single production year, otherwise
    the denominators are multi-year totals.
    """
    df = df.copy()
    # ensure numeric
    df[value_col] = pd.to_numeric(df[value_col], errors='coerce')
    prod = production_df.copy()
    prod[prod_col] = pd.to_numeric(prod[prod_col], errors='coerce')

    def _production_per(key, out_name):
        # One production figure per key; rows without the key are ignored
        grouped = prod.dropna(subset=[key]).groupby(key, as_index=False)[prod_col]
        if prod_agg == 'sum':
            # min_count=1: an all-missing production stays NaN rather than 0
            totals = grouped.sum(min_count=1)
        else:
            totals = grouped.agg(prod_agg)
        return totals.rename(columns={prod_col: out_name})

    # aggregate to unique per key
    main_prod = _production_per('main_id', 'value_tonnes_main')
    group_prod = _production_per('facility_group_id', 'value_tonnes_group')

    # safe 1:1 merges
    out = df.merge(main_prod, on='main_id', how='left').merge(group_prod, on='facility_group_id', how='left')

    # prefer main_id match, fallback to facility_group_id
    out['value_tonnes_match'] = out['value_tonnes_main'].combine_first(out['value_tonnes_group'])
    # A zero production cannot be a denominator: yield NaN rather than inf
    denominator = out['value_tonnes_match'].where(out['value_tonnes_match'] != 0)
    out['value_normalized'] = out[value_col] / denominator

    # diagnostics
    out['normalization_key'] = None
    out.loc[out['value_tonnes_main'].notna(), 'normalization_key'] = 'main_id'
    out.loc[out['value_tonnes_main'].isna() & out['value_tonnes_group'].notna(), 'normalization_key'] = 'facility_group_id'
    return out

## Technosphere flows

In [31]:
# Energy per unit of production: value_MJ divided by the matched value_tonnes.
energy_df_sd_norm = normalize_by_production(energy_df_sd, normalization_production_df, value_col='value_MJ', prod_col='value_tonnes')

In [32]:
# Material use per unit of production: mass_t divided by the matched value_tonnes.
material_df_sd_norm = normalize_by_production(material_df_sd, normalization_production_df, value_col='mass_t', prod_col='value_tonnes')

## Biosphere flows

In [33]:
# Environmental flows per unit of production; values stay in their reported
# unit (no unit harmonization is applied to env_table in this notebook).
biosphere_df_norm = normalize_by_production(env_table, normalization_production_df, value_col='value', prod_col='value_tonnes')

In [34]:
# Label of the normalized value: the reported unit followed by '/t'
# (rows without a unit get a missing label).
biosphere_df_norm['unit_normalized'] = biosphere_df_norm['unit'] + '/t'

# Export normalized technosphere and biosphere flows

In [35]:
# Write the three normalized tables to data/MetalliCan, one workbook each
for normalized_frame, workbook_name in (
    (energy_df_sd_norm, 'energy_df_sd_norm.xlsx'),
    (material_df_sd_norm, 'material_df_sd_norm.xlsx'),
    (biosphere_df_norm, 'biosphere_df_norm.xlsx'),
):
    normalized_frame.to_excel('data/MetalliCan/' + workbook_name, index=False)