In [50]:
import pandas as pd
import numpy as np

In [51]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [52]:
from typing import List, Tuple

In [53]:
# Custom functions
from some_functions import get_info_for_ids, create_sankey_diagram, get_production_data

In [54]:
metallican_path = r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db'

# MetalliCan data availability

In [54]:
df = pd.read_csv(r'C:\Users\mp_ma\OneDrive - polymtl\POST_DOC\CODE\metallican_db\results\data_coverage\data_coverage_per_table.csv')

In [55]:
import pandas as pd
import plotly.graph_objects as go
from collections import defaultdict

def generate_sankey_from_data(df):
    # Filtrer les données pour exclure les installations de type "advanced project"
    filtered_data = df[df['facility_type'] != 'advanced project'].copy()

    # Catégoriser les flux comme "Non" ou "Oui" pour chaque table
    filtered_data['Production'] = filtered_data['Production table'].apply(lambda x: 'Oui' if x > 0 else 'Non')
    filtered_data['Materials and energy'] = filtered_data['Materials and energy table'].apply(lambda x: 'Oui' if x > 0 else 'Non')
    filtered_data['Environmental flows'] = filtered_data['Environmental flows table'].apply(lambda x: 'Oui' if x > 0 else 'Non')
    filtered_data['Land occupation'] = filtered_data['Land occupation table'].apply(lambda x: 'Oui' if x > 0 else 'Non')

    # Liste des étapes dans l'ordre
    targets = ['facility_type', 'Production', 'Materials and energy', 'Environmental flows', 'Land occupation']

    # Créer un dictionnaire pour compter les transitions
    transition_counts = defaultdict(int)

    # Compter les transitions
    for _, row in filtered_data.iterrows():
        path = [str(row[target]) for target in targets]
        for i in range(len(path) - 1):
            transition = (f"{targets[i]}: {path[i]}", f"{targets[i+1]}: {path[i+1]}")
            transition_counts[transition] += 1

    # Extraire les labels uniques en tant que chaînes de caractères
    all_labels = sorted(set(item for transition in transition_counts.keys() for item in transition))
    label_to_index = {label: index for index, label in enumerate(all_labels)}

    # Préparer les données pour le Sankey
    source_indices = []
    target_indices = []
    values = []

    for (source, target), count in transition_counts.items():
        source_indices.append(label_to_index[source])
        target_indices.append(label_to_index[target])
        values.append(count)

    # Couleurs des nœuds : gris clair pour tous
    node_colors = ['lightgrey'] * len(all_labels)

    # Couleurs des liens : vert clair si la cible est "Oui", rouge clair si "Non"
    link_colors = []
    for (source, target) in transition_counts.keys():
        if target.endswith("Oui"):
            link_colors.append('rgba(144, 238, 144, 0.7)')  # Vert clair
        else:
            link_colors.append('rgba(255, 182, 193, 0.7)')  # Rouge clair

    # Créer le diagramme de Sankey
    fig = go.Figure(go.Sankey(
        arrangement="fixed",
        node=dict(
            pad=50,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels,
            color=node_colors,
        ),
        link=dict(
            source=source_indices,
            target=target_indices,
            value=values,
            color=link_colors
        )
    ))

    fig.update_layout(title_text="Diagramme de Sankey des flux de données", font_size=12)
    fig.show()

# Exemple d'utilisation
# data = pd.read_csv('data_coverage_per_table.csv')
# generate_sankey_from_data(data)

# Exemple d'utilisation
# data = pd.read_csv('data_coverage_per_table.csv')
generate_sankey_from_data(df)


# Define functions

In [6]:
def abbreviate_metals(s, metal_map):
    """Replace metal names with abbreviations in a comma-separated string. Case-insensitive."""
    if not isinstance(s, str):
        return s
    parts = [p.strip() for p in s.split(",")]
    abbr_parts = [metal_map.get(p.lower(), p) for p in parts]  # lowercase lookup
    return ", ".join(abbr_parts)

# Import MetalliCan tables

In [7]:
main_table = pd.read_csv(metallican_path + r'\database\CSV\main_table.csv')
tech_attributes_table = pd.read_csv(metallican_path + r'\database\CSV\tech_attributes_table.csv')
archetypes_table = pd.read_csv(metallican_path + r'\database\CSV\archetypes_table.csv')
land_table = pd.read_csv(metallican_path + r'\database\CSV\land_occupation_table.csv')

In [8]:
substances_table = pd.read_csv(metallican_path + r'\database\CSV\substances_table.csv')

In [9]:
# Normalized tables, after cleaning
biosphere_df = pd.read_excel(r'data\MetalliCan\biosphere_df_norm.xlsx')
material_df = pd.read_excel(r'data\MetalliCan\material_df_sd_norm.xlsx')
energy_df = pd.read_excel(r'data\MetalliCan\energy_df_sd_norm.xlsx')

In [10]:
energy_df

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,...,needs_factor,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key,facility_name,facility_group_name,mining_processing_type,commodities
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,0.111299,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,437.807896,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,1729.171369,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,81.737651,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,...,False,166000.0,,166000.0,253.440008,main_id,Brucejack,,"Underground, concentrator","Gold, silver"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,48.678844,main_id,Young-Davidson,,"Underground, concentrator",Gold
178,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,0.738000,main_id,Young-Davidson,,"Underground, concentrator",Gold
179,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,2.204273,main_id,Young-Davidson,,"Underground, concentrator",Gold
180,TECH-7607a50e-2023-6,2023,Energy,Natural gas,GJ,221612.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,False,2878047.0,,2878047.0,77.000827,main_id,Young-Davidson,,"Underground, concentrator",Gold


In [11]:
# Remove the rows where value_normalized is NaN for the 3 dfs
biosphere_df = biosphere_df.dropna(subset=['value_normalized'])
material_df = material_df.dropna(subset=['value_normalized'])
energy_df = energy_df.dropna(subset=['value_normalized'])

In [12]:
def convert_to_percent(row):
    if row['unit'] == 'g/t':
        row['value'] = row['value'] / 10000  # 1 g/t = 0.0001%
        row['unit'] = '%'
    # For '%' or any other unit, do nothing
    return row


tech_attributes_table = tech_attributes_table.apply(convert_to_percent, axis=1)
mapping = {
    "Head grade": "Grade",
    "Ore grade": "Grade",
    "Mill ore grade": "Grade",
    "Mill grade": "Grade",
    "Mill feed grade": "Grade",
    "Recovery rate": "Recovery rate",
    "Mill recovery rate": "Recovery rate",
    "Copper recovery rate": "Recovery rate",
    "Strip ratio": "Strip ratio",
    "Concentrate grade": "Concentrate grade",
}

tech_attributes_table['material_type'] = tech_attributes_table['material_type'].replace(mapping)

# Energy consumption exploration

## By energy type

In [None]:
from constants import nrj_subflow

In [80]:
# Add a subflow_type_agg column to the energy_std_norm DataFrame based on the dictionnary
energy_df_agg = energy_df.copy()
energy_df_agg['subflow_type_agg'] = energy_df_agg['subflow_type'].map(nrj_subflow).fillna(energy_df_agg['subflow_type'])

In [86]:
energy_df_agg = (energy_df_agg.groupby(['main_id', 'facility_name', 'facility_group_id', 'facility_group_name', 'company_id','year', 'mining_processing_type', 'commodities', 'flow_type', 'subflow_type_agg'], dropna=False, as_index=False)
                 .agg(value_normalized_sum=('value_normalized', 'sum'))
)

In [87]:
energy_df_agg

Unnamed: 0,main_id,facility_name,facility_group_id,facility_group_name,company_id,year,mining_processing_type,commodities,flow_type,subflow_type_agg,value_normalized_sum
0,BC-MAIN-599152a0,Copper Mountain,,,CMP-12afc634,2023,"Open-pit, concentrator","Copper, gold, silver",Energy,Diesel,11.798048
1,BC-MAIN-599152a0,Copper Mountain,,,CMP-12afc634,2023,"Open-pit, concentrator","Copper, gold, silver",Energy,Electricity,286.790500
2,BC-MAIN-599152a0,Copper Mountain,,,CMP-12afc634,2023,"Open-pit, concentrator","Copper, gold, silver",Energy,Gasoline,0.053627
3,BC-MAIN-599152a0,Copper Mountain,,,CMP-12afc634,2023,"Open-pit, concentrator","Copper, gold, silver",Energy,LPG-Propane,1.474756
4,BC-MAIN-6b4800fe,Gibraltar,,,CMP-e35f138c,2023,"Open-pit, concentrator","Copper, molybdenum, silver",Energy,Diesel,56.121767
...,...,...,...,...,...,...,...,...,...,...,...
143,,Timmins West,GRP-147b3123,Timmins Operation,CMP-48a36546,2023,Underground,Gold,Energy,Diesel,244.145997
144,,Timmins West,GRP-147b3123,Timmins Operation,CMP-48a36546,2023,Underground,Gold,Energy,Electricity,469.013215
145,,Timmins West,GRP-147b3123,Timmins Operation,CMP-48a36546,2023,Underground,Gold,Energy,Explosives,4.407116
146,,Timmins West,GRP-147b3123,Timmins Operation,CMP-48a36546,2023,Underground,Gold,Energy,Gasoline,1.871728


In [None]:
from visualisation_functions import plot_stacked_energy_by_site

In [105]:
plot_stacked_energy_by_site(energy_df_agg, export_path='data/Parametrization/nrj_by_type.html')

In [106]:
energy_df_agg.to_csv(r'energy_df_agg.csv', index=False)

In [107]:
df = energy_df_agg.copy()

In [108]:
df

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,...,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key,facility_name,facility_group_name,mining_processing_type,commodities,subflow_type_agg
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,...,166000.0,,166000.0,0.111299,main_id,Brucejack,,"Underground, concentrator","Gold, silver",LPG-Propane
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,...,166000.0,,166000.0,437.807896,main_id,Brucejack,,"Underground, concentrator","Gold, silver",Aviation fuel
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,...,166000.0,,166000.0,1729.171369,main_id,Brucejack,,"Underground, concentrator","Gold, silver",Diesel
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,...,166000.0,,166000.0,81.737651,main_id,Brucejack,,"Underground, concentrator","Gold, silver",Gasoline
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,...,166000.0,,166000.0,253.440008,main_id,Brucejack,,"Underground, concentrator","Gold, silver",LPG-Propane
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,TECH-7607a50e-2023-2,2023,Energy,Electricity consumption|Grid electricity,GJ,960748.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2878047.0,,2878047.0,333.819427,main_id,Young-Davidson,,"Underground, concentrator",Gold,Electricity
177,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2878047.0,,2878047.0,48.678844,main_id,Young-Davidson,,"Underground, concentrator",Gold,Diesel
178,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2878047.0,,2878047.0,0.738000,main_id,Young-Davidson,,"Underground, concentrator",Gold,Gasoline
179,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2878047.0,,2878047.0,2.204273,main_id,Young-Davidson,,"Underground, concentrator",Gold,Naphtha


In [118]:
import pandas as pd

def summarize_clusters_by_subflow(
    df,
    cluster_cols,       # List of columns to create clusters
    subflow_col,        # Column to group by within clusters (e.g., 'subflow_type')
    value_col,          # Column to compute statistics
    id_cols=None,       # Optional: list of columns to count unique IDs per cluster and subflow
    stats=['min', 'max', 'mean', 'std', 'count']
):
    """
    Summarize statistics for clusters and subflow types in a DataFrame.
    Includes unique ID counts per cluster and per subflow.
    """
    # Create cluster identifier
    df['cluster'] = df[cluster_cols].astype(str).agg(' | '.join, axis=1)

    # Group by cluster and subflow_col, then compute statistics
    result = df.groupby(['cluster', subflow_col])[value_col].agg(stats).reset_index()

    # Count unique IDs per subflow
    if id_cols:
        for col in id_cols:
            result[f'num_unique_{col}'] = df.groupby(['cluster', subflow_col])[col].nunique().values

    # Count unique IDs per cluster
    if id_cols:
        for col in id_cols:
            cluster_unique_counts = df.groupby('cluster')[col].nunique().reset_index()
            cluster_unique_counts = cluster_unique_counts.rename(columns={col: f'num_unique_{col}_in_cluster'})
            result = result.merge(cluster_unique_counts, on='cluster', how='left')

    # Add cluster_id for easier reference
    result['cluster_id'] = result.groupby('cluster').ngroup() + 1

    return result

# Example usage:
cluster_stats = summarize_clusters_by_subflow(
     df, cluster_cols=['mining_processing_type', 'commodities'],
     subflow_col='subflow_type_agg', value_col='value_normalized',
     id_cols=['main_id', 'facility_group_id']
)

In [119]:
cluster_stats

Unnamed: 0,cluster,subflow_type_agg,min,max,mean,std,count,num_unique_main_id,num_unique_facility_group_id,num_unique_main_id_in_cluster,num_unique_facility_group_id_in_cluster,cluster_id
0,Concentrator | Gold,Diesel,934.482175,1079.139344,1006.810760,102.288065,2,0,2,0,2,1
1,Concentrator | Gold,Electricity,13.360574,1583.811856,583.913818,868.810693,3,0,2,0,2,1
2,Concentrator | Gold,Explosives,11.793427,11.793427,11.793427,,1,0,1,0,2,1
3,Concentrator | Gold,Gasoline,13.313992,13.313992,13.313992,,1,0,1,0,2,1
4,Concentrator | Gold,LPG-Propane,0.061156,329.319672,164.690414,232.820930,2,0,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...
85,"Underground, concentrator | Gold, zinc, copper...",Natural gas,111.885138,111.885138,111.885138,,1,1,0,1,0,16
86,"Underground, concentrator | Silver, zinc, lead",Diesel,470.340613,470.340613,470.340613,,1,1,0,1,0,17
87,"Underground, concentrator | Silver, zinc, lead",Electricity,497.564131,497.564131,497.564131,,1,1,0,1,0,17
88,"Underground, concentrator | Silver, zinc, lead",Gasoline,68.294307,68.294307,68.294307,,1,1,0,1,0,17


In [120]:
cluster_stats.to_csv(r'data/Parametrization/cluster_nrj.csv', index=False)

## Total energy

In [13]:
energy_table = energy_df.groupby(['main_id', 'facility_group_id'], dropna=False)['value_normalized'].sum().reset_index()

In [14]:
energy_table.rename(columns={'value_normalized': 'energy_MJ'}, inplace=True)

In [20]:
# Extract all the rows from the production_data_available that have a non NaN main_id
energy_table_f = energy_table[energy_table['main_id'].notna()]
energy_table_f = energy_table_f.merge(main_table[['main_id', 'facility_name', 'facility_type', 'province', 'mining_processing_type', 'commodities']], on='main_id', how='left').drop_duplicates(subset=['main_id'], keep='first')

In [21]:
# Extract all the rows from the production_data_available that have a NaN main_id and non NaN facility_group_id
energy_table_fg = energy_table[energy_table['main_id'].isna() & energy_table['facility_group_id'].notna()]
energy_table_fg = energy_table_fg.merge(main_table[['facility_group_id', 'facility_group_name', 'facility_type', 'province', 'mining_processing_type', 'commodities']], on='facility_group_id', how='left').drop_duplicates(subset=['facility_group_id'], keep='first')


In [22]:
# Ensure all desired columns are present in both DataFrames before concatenation
cols_to_keep = ['main_id', 'facility_name', 'facility_group_id', 'facility_group_name', 'facility_type', 'province', 'mining_processing_type', 'commodities', 'energy_MJ']

# Add missing columns to merged_f
for col in cols_to_keep:
    if col not in energy_table_f.columns:
        energy_table_f[col] = None
# Add missing columns to merged_fg
for col in cols_to_keep:
    if col not in energy_table_fg.columns:
        energy_table_fg[col] = None

# Reorder columns
energy_table_f = energy_table_f[cols_to_keep]
energy_table_fg = energy_table_fg[cols_to_keep]

# Combine the two results
energy_table = pd.concat([energy_table_f, energy_table_fg])

In [23]:
energy_table['energy_GJ'] = energy_table['energy_MJ'] / 1000

In [24]:
from constants import metal_map
metal_map_lower = {k.lower(): v for k, v in metal_map.items()}
energy_table['commodities'] = energy_table['commodities'].apply(lambda x: abbreviate_metals(x, metal_map_lower))

In [25]:
from visualisation_functions import plot_2axes_by_commodity

In [26]:
fig_nrj_html = plot_2axes_by_commodity(energy_table, x_label=' ', y_label='MJ/t ore processed', export_path='data/Parametrization/nrj.html', export_format='html')

# Environmental flows exploration

In [15]:
# Let's keep rows for source_id = https://www.canada.ca/en/environment-climate-change/services/national-pollutant-release-inventory/tools-resources-data/exploredata.html or https://www.canada.ca/en/environment-climate-change/services/environmental-indicators/greenhouse-gas-emissions/large-facilities.html
biosphere_df = biosphere_df[biosphere_df['source_id'].isin([
    'https://www.canada.ca/en/environment-climate-change/services/national-pollutant-release-inventory/tools-resources-data/exploredata.html',
    'https://www.canada.ca/en/environment-climate-change/services/environmental-indicators/greenhouse-gas-emissions/large-facilities.html'
])]

In [16]:
# Merge to add the main_table information and the substance name
biosphere_df =biosphere_df.merge(main_table[['main_id', 'province', 'facility_type']], on='main_id', how='left')
biosphere_df = biosphere_df.merge(substances_table[['substance_id', 'substance_name']], on='substance_id', how='left')

In [17]:
biosphere_df

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,...,value_normalized,normalization_key,unit_normalized,facility_name,facility_group_name,mining_processing_type,commodities,province,facility_type,substance_name
0,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,0.63600,,ON-MAIN-1f126a43,...,1.440257e-06,main_id,tonnes/t,Macassa,,"Underground, concentrator","Gold, silver",Ontario,mining,Ammonia (total)
1,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,0.11600,,ON-MAIN-1f126a43,...,2.626883e-07,main_id,kg/t,Macassa,,"Underground, concentrator","Gold, silver",Ontario,mining,Arsenic (and its compounds)
2,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,76.20800,,ON-MAIN-1f126a43,...,1.725772e-04,main_id,tonnes/t,Macassa,,"Underground, concentrator","Gold, silver",Ontario,mining,Carbon monoxide
3,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,0.00105,,ON-MAIN-1f126a43,...,2.377782e-09,main_id,tonnes/t,Macassa,,"Underground, concentrator","Gold, silver",Ontario,mining,Chromium (and its compounds)
4,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,0.18000,,ON-MAIN-1f126a43,...,4.076198e-07,main_id,kg/t,Macassa,,"Underground, concentrator","Gold, silver",Ontario,mining,Cobalt (and its compounds)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2710,GHG-10791-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,104880.00000,,QC-MAIN-c1c7eb99,...,6.096276e-01,main_id,tCO2eq/t,Arvida,Saguenay–Lac-Saint-Jean Operations,Smelter,"Aluminum (pure or alloyed), alumina, aluminum ...",Quebec,manufacturing,GHG
2711,GHG-11730-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,406650.00000,,QC-MAIN-92628f16,...,2.921744e-01,main_id,tCO2eq/t,Vaudreuil,Saguenay–Lac-Saint-Jean Operations,Refinery,Alumina,Quebec,manufacturing,GHG
2712,GHG-11765-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,15600.00000,,ON-MAIN-4c35e094,...,5.032258e-03,main_id,tCO2eq/t,ArcelorMittal Dofasco,,"Basic oxygen furnace, electric arc furnace",Steel,Ontario,manufacturing,GHG
2713,GHG-11989-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,952950.00000,,QC-MAIN-1eee4ace,...,5.015526e-01,facility_group_id,tCO2eq/t,Contrecœur East,Contrecoeur,Electric arc furnace,Steel,Quebec,manufacturing,GHG


In [27]:
biosphere_df['commodities'] = biosphere_df['commodities'].apply(lambda x: abbreviate_metals(x, metal_map_lower))

In [28]:
biosphere_mining_df = biosphere_df[biosphere_df['facility_type'] == 'mining']
biosphere_man_df = biosphere_df[biosphere_df['facility_type'] == 'manufacturing']

In [29]:
biosphere_df

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,...,value_normalized,normalization_key,unit_normalized,facility_name,facility_group_name,mining_processing_type,commodities,province,facility_type,substance_name
0,npri-1568-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,0.63600,,ON-MAIN-1f126a43,...,1.440257e-06,main_id,tonnes/t,Macassa,,"Underground, concentrator","Au, Ag",Ontario,mining,Ammonia (total)
1,npri-1568-2023-2,2023,Air,NA - 02,Emission,Stack Emissions,kg,0.11600,,ON-MAIN-1f126a43,...,2.626883e-07,main_id,kg/t,Macassa,,"Underground, concentrator","Au, Ag",Ontario,mining,Arsenic (and its compounds)
2,npri-1568-2023-3,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,76.20800,,ON-MAIN-1f126a43,...,1.725772e-04,main_id,tonnes/t,Macassa,,"Underground, concentrator","Au, Ag",Ontario,mining,Carbon monoxide
3,npri-1568-2023-4,2023,Air,NA - 04,Emission,Stack Emissions,tonnes,0.00105,,ON-MAIN-1f126a43,...,2.377782e-09,main_id,tonnes/t,Macassa,,"Underground, concentrator","Au, Ag",Ontario,mining,Chromium (and its compounds)
4,npri-1568-2023-5,2023,Air,NA - 05,Emission,Stack Emissions,kg,0.18000,,ON-MAIN-1f126a43,...,4.076198e-07,main_id,kg/t,Macassa,,"Underground, concentrator","Au, Ag",Ontario,mining,Cobalt (and its compounds)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2710,GHG-10791-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,104880.00000,,QC-MAIN-c1c7eb99,...,6.096276e-01,main_id,tCO2eq/t,Arvida,Saguenay–Lac-Saint-Jean Operations,Smelter,"Aluminum (pure or alloyed), alumina, aluminum ...",Quebec,manufacturing,GHG
2711,GHG-11730-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,406650.00000,,QC-MAIN-92628f16,...,2.921744e-01,main_id,tCO2eq/t,Vaudreuil,Saguenay–Lac-Saint-Jean Operations,Refinery,Alumina,Quebec,manufacturing,GHG
2712,GHG-11765-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,15600.00000,,ON-MAIN-4c35e094,...,5.032258e-03,main_id,tCO2eq/t,ArcelorMittal Dofasco,,"Basic oxygen furnace, electric arc furnace",Steel,Ontario,manufacturing,GHG
2713,GHG-11989-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,952950.00000,,QC-MAIN-1eee4ace,...,5.015526e-01,facility_group_id,tCO2eq/t,Contrecœur East,Contrecoeur,Electric arc furnace,Steel,Quebec,manufacturing,GHG


In [30]:
from visualisation_functions import plot_biosphere

In [32]:
fig_min = plot_biosphere(
    biosphere_df=biosphere_mining_df,
    x_col='commodities',
    y_col='value_normalized',
    color_col='province',
    symbol_col='mining_processing_type',
    hover_name_cols=['facility_name', 'facility_group_name'],
    y_unit_col='unit_normalized',
    save_path="data/Parametrization/npri_min.html"
)


In [33]:
fig_man = plot_biosphere(
    biosphere_df=biosphere_man_df,
    x_col='commodities',
    y_col='value_normalized',
    color_col='province',
    symbol_col='mining_processing_type',
    hover_name_cols=['facility_name', 'facility_group_name'],
    y_unit_col='unit_normalized',
    save_path="data/Parametrization/npri_man.html"
)

# Land occupation exploration

In [34]:
land_table

Unnamed: 0,land_occupation_id,area_km2,geometry,distance_km,main_id,tailing_id,source_id
0,45954,2.622800,POLYGON Z ((-62.920418000502096 44.98665598524...,0.000000,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
1,45955,0.441869,POLYGON Z ((-62.92612599959345 44.991678995543...,1.186584,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
2,7631,0.209038,POLYGON Z ((-53.8260233402639 47.4043112260747...,0.811781,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
3,7641,0.113569,POLYGON Z ((-53.7468445297622 47.4086824659858...,4.647047,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
4,7633,0.213592,POLYGON Z ((-53.81396413042625 47.413655936064...,0.000000,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
...,...,...,...,...,...,...,...
789,QC-LI-219a2e44,2.926809,MULTIPOLYGON Z (((-79.08481557072447 48.229685...,,QC-MAIN-30c1828c,,https://www.nature.com/articles/s41597-025-052...
790,BC-LI-9c3aa818,0.122644,POLYGON Z ((-130.49619675383414 56.64850845409...,,BC-MAIN-b1fe389a,,https://www.nature.com/articles/s41597-025-052...
791,ON-LI-1a9a061c,4.202254,MULTIPOLYGON Z (((-81.36899483562593 48.678821...,,ON-MAIN-f8313ebd,,https://www.nature.com/articles/s41597-025-052...
792,QC-LI-94bd8222,6.202897,POLYGON Z ((-79.22195784417586 49.574666146450...,,QC-MAIN-b86f7d07,,https://www.nature.com/articles/s41597-025-052...


In [41]:
# Step 1: Find main_id that have more than one unique source_id
multi_source_main_ids = land_table.groupby('main_id')['source_id'].nunique()
multi_source_main_ids = multi_source_main_ids[multi_source_main_ids > 1].index

# Step 2: Filter the dataframe
land_table = land_table[
    ~land_table['main_id'].isin(multi_source_main_ids) |
    (land_table['source_id'] == "https://www.nature.com/articles/s41597-025-05296-y")
]

In [42]:
land_table

Unnamed: 0,land_occupation_id,area_km2,geometry,distance_km,main_id,tailing_id,source_id
0,45954,2.622800,POLYGON Z ((-62.920418000502096 44.98665598524...,0.000000,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
1,45955,0.441869,POLYGON Z ((-62.92612599959345 44.991678995543...,1.186584,NS-MAIN-c9c77a59,,https://doi.org/10.1038/s43247-023-00805-6
2,7631,0.209038,POLYGON Z ((-53.8260233402639 47.4043112260747...,0.811781,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
3,7641,0.113569,POLYGON Z ((-53.7468445297622 47.4086824659858...,4.647047,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
4,7633,0.213592,POLYGON Z ((-53.81396413042625 47.413655936064...,0.000000,NL-MAIN-d9036091,,https://doi.org/10.1038/s43247-023-00805-6
...,...,...,...,...,...,...,...
789,QC-LI-219a2e44,2.926809,MULTIPOLYGON Z (((-79.08481557072447 48.229685...,,QC-MAIN-30c1828c,,https://www.nature.com/articles/s41597-025-052...
790,BC-LI-9c3aa818,0.122644,POLYGON Z ((-130.49619675383414 56.64850845409...,,BC-MAIN-b1fe389a,,https://www.nature.com/articles/s41597-025-052...
791,ON-LI-1a9a061c,4.202254,MULTIPOLYGON Z (((-81.36899483562593 48.678821...,,ON-MAIN-f8313ebd,,https://www.nature.com/articles/s41597-025-052...
792,QC-LI-94bd8222,6.202897,POLYGON Z ((-79.22195784417586 49.574666146450...,,QC-MAIN-b86f7d07,,https://www.nature.com/articles/s41597-025-052...


In [44]:
# For each main_id, give me the sum of area_km2 associated
land_table = land_table.groupby('main_id')['area_km2'].sum().reset_index()

In [45]:
land_table = land_table.merge(main_table[['main_id', 'facility_group_id', 'facility_name', 'facility_group_name', 'province', 'facility_type', 'mining_processing_type', 'commodities']], on='main_id', how='left')

In [46]:
land_table

Unnamed: 0,main_id,area_km2,facility_group_id,facility_name,facility_group_name,province,facility_type,mining_processing_type,commodities
0,BC-MAIN-23155c25,1.499690,,Myra Falls,,British Columbia,mining,Underground,"Zinc, copper, silver, gold, lead"
1,BC-MAIN-3ef4f421,1.396089,,Avanti Kitsault,,British Columbia,project,,"Molybdenum, silver, lead"
2,BC-MAIN-3f490561,7.967835,,Mount Polley,,British Columbia,mining,"Open-pit, concentrator","Gold, copper, silver"
3,BC-MAIN-4724f4ba,0.416737,,Elk,,British Columbia,mining,Open-pit,Gold
4,BC-MAIN-599152a0,13.233210,,Copper Mountain,,British Columbia,mining,"Open-pit, concentrator","Copper, gold, silver"
...,...,...,...,...,...,...,...,...,...
111,SK-MAIN-9dd2b7f8,4.345047,,Horseshoe-Raven,,Saskatchewan,project,,Uranium
112,SK-MAIN-bb89158f,10.235647,GRP-21eee27d,Key Lake,Key Lake + McArthur River,Saskatchewan,mining,Concentrator,Uranium
113,SK-MAIN-d3c471e8,1.973892,GRP-21eee27d,McArthur River,Key Lake + McArthur River,Saskatchewan,mining,Underground,Uranium
114,YT-MAIN-44857446,5.293594,,Keno Hill Silver District,,Yukon,mining,"Underground, concentrator","Silver, zinc, lead"


In [47]:
land_table_mining = land_table[land_table['facility_type'] == 'mining']

In [48]:
land_table_mining['commodities'] = land_table_mining['commodities'].apply(lambda x: abbreviate_metals(x, metal_map_lower))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [49]:
land_mining_plot = plot_2axes_by_commodity(land_table_mining, y_col='area_km2',x_label=' ', y_label='km2', export_path='data/Parametrization/land.html', export_format='html')

In [34]:
ta_grade = tech_attributes_table[tech_attributes_table['material_type'] == 'Grade']
ta_strip = tech_attributes_table[tech_attributes_table['material_type'] == 'Strip ratio']

In [35]:
ta_grade_ids = set(tech_attributes_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
ta_strip_ids = set(tech_attributes_table[['main_id', 'facility_group_id']].apply(tuple, axis=1))
ta_common_ids = ta_grade_ids.intersection(ta_strip_ids)

## Prepare dfs

In [60]:
biosphere_param_df = get_info_for_ids(biosphere_df, ta_grade_ids)
biosphere_param_df

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,...,value_tonnes_match,value_normalized,normalization_key,unit_normalized,facility_name,province,facility_type,mining_processing_type,commodities,substance_name
18,npri-25188-2023-1,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,3.03,,QC-MAIN-e7e6a960,...,19594928.0,1.546319e-07,main_id,tonnes/t,Canadian Malartic,Quebec,mining,"Open-pit, concentrator","Au, Ag",Carbon monoxide
19,npri-25188-2023-2,2023,Air,11104-93-1,Emission,Stack Emissions,tonnes,9.03,,QC-MAIN-e7e6a960,...,19594928.0,4.608335e-07,main_id,tonnes/t,Canadian Malartic,Quebec,mining,"Open-pit, concentrator","Au, Ag",Nitrogen oxides (expressed as nitrogen dioxide)
20,npri-25188-2023-3,2023,Air,NA - M09,Emission,Stack Emissions,tonnes,1.73,,QC-MAIN-e7e6a960,...,19594928.0,8.828815e-08,main_id,tonnes/t,Canadian Malartic,Quebec,mining,"Open-pit, concentrator","Au, Ag",PM10 - Particulate Matter <= 10 Micrometers
21,npri-25188-2023-4,2023,Air,NA - M10,Emission,Stack Emissions,tonnes,0.61,,QC-MAIN-e7e6a960,...,19594928.0,3.113050e-08,main_id,tonnes/t,Canadian Malartic,Quebec,mining,"Open-pit, concentrator","Au, Ag",PM2.5 - Particulate Matter <= 2.5 Micrometers
22,npri-25188-2023-5,2023,Air,NA - M08,Emission,Stack Emissions,tonnes,7.86,,QC-MAIN-e7e6a960,...,19594928.0,4.011242e-07,main_id,tonnes/t,Canadian Malartic,Quebec,mining,"Open-pit, concentrator","Au, Ag",Total particulate matter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,GHG-11894-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,70400.00,,BC-MAIN-599152a0,...,6862152.0,1.025917e-02,main_id,tCO2eq/t,Copper Mountain,British Columbia,mining,"Open-pit, concentrator","Cu, Au, Ag",GHG
2696,GHG-11951-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,26800.00,,ON-MAIN-7f050560,...,841000.0,3.186683e-02,main_id,tCO2eq/t,Red Lake,Ontario,mining,"Underground, concentrator","Au, Ag",GHG
2697,GHG-12304-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,12400.00,,ON-MAIN-cb85213a,...,222627.0,5.569855e-02,main_id,tCO2eq/t,Eagle River,Ontario,mining,"Underground, concentrator",Au,GHG
2698,GHG-12553-2023-1,2023,Air,NA - GHG,Emission,Unspecified,tCO2eq,95700.00,,BC-MAIN-8eb8be0d,...,1139000.0,8.402107e-02,main_id,tCO2eq/t,Red Chris,British Columbia,mining,"Open-pit, concentrator","Au, Cu, Ag",GHG


In [54]:
# Extract all the rows from the production_data_available that have a non NaN main_id
biosphere_param_df_f = biosphere_param_df[biosphere_param_df['main_id'].notna()]
biosphere_param_df_f = biosphere_param_df_f.merge(main_table[['main_id', 'facility_name', 'facility_type', 'province', 'mining_processing_type', 'commodities']], on='main_id', how='left').drop_duplicates(subset=['main_id'], keep='first')

In [55]:
# Extract all the rows from the production_data_available that have a NaN main_id and non NaN facility_group_id
biosphere_param_df_fg = biosphere_param_df[biosphere_param_df['main_id'].isna() & biosphere_param_df['facility_group_id'].notna()]
biosphere_param_df_fg = biosphere_param_df_fg.merge(main_table[['facility_group_id', 'facility_group_name', 'facility_type', 'province', 'mining_processing_type', 'commodities']], on='facility_group_id', how='left').drop_duplicates(subset=['facility_group_id'], keep='first')


In [58]:
biosphere_param_df_f.columns

Index(['env_id', 'year', 'compartment_name', 'substance_id', 'flow_direction',
       'release_pathway', 'unit', 'value', 'comment', 'main_id',
       'facility_group_id', 'company_id', 'source_id', 'value_tonnes_main',
       'value_tonnes_group', 'value_tonnes_match', 'value_normalized',
       'normalization_key', 'unit_normalized', 'facility_name_x', 'province_x',
       'facility_type_x', 'mining_processing_type_x', 'commodities_x',
       'substance_name', 'facility_name_y', 'facility_type_y', 'province_y',
       'mining_processing_type_y', 'commodities_y'],
      dtype='object')

In [59]:
biosphere_param_df_fg.columns

Index(['env_id', 'year', 'compartment_name', 'substance_id', 'flow_direction',
       'release_pathway', 'unit', 'value', 'comment', 'main_id',
       'facility_group_id', 'company_id', 'source_id', 'value_tonnes_main',
       'value_tonnes_group', 'value_tonnes_match', 'value_normalized',
       'normalization_key', 'unit_normalized', 'facility_name', 'province_x',
       'facility_type_x', 'mining_processing_type_x', 'commodities_x',
       'substance_name', 'facility_group_name', 'facility_type_y',
       'province_y', 'mining_processing_type_y', 'commodities_y'],
      dtype='object')

In [16]:
# Ensure all desired columns are present in both DataFrames before concatenation
cols_to_keep = ['main_id', 'facility_name', 'facility_group_id', 'facility_group_name', 'facility_type', 'province', 'mining_processing_type', 'commodities', 'energy_MJ']

# Add missing columns to merged_f
for col in cols_to_keep:
    if col not in energy_table_f.columns:
        energy_table_f[col] = None
# Add missing columns to merged_fg
for col in cols_to_keep:
    if col not in energy_table_fg.columns:
        energy_table_fg[col] = None

# Reorder columns
energy_table_f = energy_table_f[cols_to_keep]
energy_table_fg = energy_table_fg[cols_to_keep]

# Combine the two results
energy_table = pd.concat([energy_table_f, energy_table_fg])

In [59]:
biosphere_param_df.to_excel(r'data/Parametrization/param_biosphere.xlsx', index=False)

In [58]:
energy_param_df = get_info_for_ids(energy_df, ta_grade_ids)
energy_param_df

Unnamed: 0,technosphere_id,year,flow_type,subflow_type,unit,value,comment,main_id,facility_group_id,company_id,...,value_MJ,unit_source,assumption_note,unit_standard,needs_factor,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key
0,TECH-857b7b89-2023-1,2023,Energy,Acetylene,GJ,18.475651,,BC-MAIN-857b7b89,,CMP-4a434d72,...,1.847565e+04,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,0.111299,main_id
1,TECH-857b7b89-2023-2,2023,Energy,Aviation fuel,GJ,72676.110790,,BC-MAIN-857b7b89,,CMP-4a434d72,...,7.267611e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,437.807896,main_id
2,TECH-857b7b89-2023-3,2023,Energy,Diesel,GJ,287042.447232,,BC-MAIN-857b7b89,,CMP-4a434d72,...,2.870424e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,1729.171369,main_id
3,TECH-857b7b89-2023-4,2023,Energy,Gasoline,GJ,13568.450000,,BC-MAIN-857b7b89,,CMP-4a434d72,...,1.356845e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,81.737651,main_id
4,TECH-857b7b89-2023-5,2023,Energy,Propane,GJ,42071.041300,,BC-MAIN-857b7b89,,CMP-4a434d72,...,4.207104e+07,direct_unit,gj→MJ factor=1000.0,MJ,False,166000.0,,166000.0,253.440008,main_id
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,TECH-7607a50e-2023-2,2023,Energy,Electricity consumption|Grid electricity,GJ,960748.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,9.607480e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,333.819427,main_id
177,TECH-7607a50e-2023-3,2023,Energy,Diesel,GJ,140100.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,1.401000e+08,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,48.678844,main_id
178,TECH-7607a50e-2023-4,2023,Energy,Gasoline,GJ,2124.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,2.124000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,0.738000,main_id
179,TECH-7607a50e-2023-5,2023,Energy,Naphta,GJ,6344.000000,,ON-MAIN-7607a50e,,CMP-3a4ccc7f,...,6.344000e+06,direct_unit,gj→MJ factor=1000.0,MJ,False,2878047.0,,2878047.0,2.204273,main_id


## NRJ

In [37]:
nrj_param = pd.read_excel(r'data/Parametrization/energy_parametrization.xlsx', sheet_name='RECAP')

In [38]:
# Make sure garde is numeric
nrj_param ['Grade'] = pd.to_numeric(nrj_param ['Grade'], errors='coerce')
nrj_param ['energy_MJ'] = pd.to_numeric(nrj_param ['energy_MJ'], errors='coerce')

In [39]:
# Apply
nrj_param ['commodities'] = nrj_param['commodities'].apply(lambda x: abbreviate_metals(x, metal_map_lower))

In [40]:
nrj_param

Unnamed: 0,main_id,facility_name,facility_group_id,facility_group_name,facility_type,province,mining_processing_type,commodities,energy_MJ,energy_GJ,Grade,Concentrate grade,Strip ratio,Recovery rate,Comment
0,BC-MAIN-599152a0,Copper Mountain,,,mining,British Columbia,"Open-pit, concentrator","Cu, Au, Ag",300.116931,0.300117,0.35,23.1,3.82,79.7,Assumed Cu is the main product
1,BC-MAIN-6b4800fe,Gibraltar,,,mining,British Columbia,"Open-pit, concentrator","Cu, Mo, Ag",120.184633,0.120185,0.25,,1.3,82.6,Only available for Cu
2,BC-MAIN-857b7b89,Brucejack,,,mining,British Columbia,"Underground, concentrator","Au, Ag",6463.585853,6.463586,0.000568,,,96.0,Ore
3,BC-MAIN-8eb8be0d,Red Chris,,,mining,British Columbia,"Open-pit, concentrator","Au, Cu, Ag",2518.230578,2.518231,,,,54.2,Assumed Au is the main product
4,BC-MAIN-aa76f6f2,New Afton,,,mining,British Columbia,"Underground, concentrator","Au, Cu, Ag",156.070253,0.15607,,,0.187133,90.0,Assumed Au is the main product
5,NL-MAIN-b64bae7a,Scully,,,mining,Newfoundland and Labrador,"Open-pit, concentrator",Fe,768.539326,0.768539,,,,,
6,NL-MAIN-dd723db4,Carol Lake,,,mining,Newfoundland and Labrador,"Open-pit, concentrator",Fe,834.563758,0.834564,,,,,
7,NU-MAIN-8b0264c9,Meliadine,,,mining,Nunavut,"Open-pit, underground, concentrator",Au,1274.879784,1.27488,0.000611,,3.148294,96.6,
8,ON-MAIN-0aadf28f,Rainy River,,,mining,Ontario,"Open-pit, underground, concentrator","Au, Ag",541.632505,0.541633,,,3.47259,91.0,Assumed Au is the main product (for grade)
9,ON-MAIN-1f126a43,Macassa,,,mining,Ontario,"Underground, concentrator","Au, Ag",2639.074911,2.639075,,,,,


In [41]:
def plot_energy_vs_grade(
    df,
    x_col='Grade',
    y_col='energy_MJ',
    color_col='commodities',
    symbol_col='mining_processing_type',
    hover_name_cols=['facility_name', 'facility_group_name'],
    x_label='Grade (%)',
    y_label='MJ/t ore processed',
    x_log=False,
    font_color="#333333",
    size_marker=10,
    save_path=None
):
    """
    Scatter plot of energy vs grade.
    Color = commodities
    Symbol = mining/processing type
    Fixed mapping for symbols to match the legend
    """
    df = df.copy()

    # Build hover_name
    if hover_name_cols and all(col in df.columns for col in hover_name_cols):
        df['hover_name'] = (
            df[hover_name_cols[0]].astype(str)
            + " (" + df[hover_name_cols[1]].astype(str) + ")"
        )
    else:
        df['hover_name'] = df[hover_name_cols[0]] if hover_name_cols else None

    # Prepare color and symbol sequences
    color_sequence = px.colors.qualitative.Plotly
    symbol_sequence = [
        "circle", "square", "diamond", "cross", "x",
        "triangle-up", "triangle-down", "triangle-left", "triangle-right",
        "star", "hexagon", "pentagon"
    ]

    # Create symbol map for unique values
    unique_symbols = df[symbol_col].dropna().unique()
    symbol_map = {sym: symbol_sequence[i % len(symbol_sequence)] for i, sym in enumerate(unique_symbols)}

    # Create color map for unique commodities
    unique_colors = df[color_col].dropna().unique()
    color_map = {col: color_sequence[i % len(color_sequence)] for i, col in enumerate(unique_colors)}

    # Apply the mappings
    df['marker_symbol'] = df[symbol_col].map(symbol_map)
    df['marker_color'] = df[color_col].map(color_map)

    # Initialize figure
    fig = go.Figure()

    # Scatter trace
    fig.add_trace(
        go.Scatter(
            x=df[x_col],
            y=df[y_col],
            mode='markers',
            marker=dict(
                size=size_marker,
                color=df['marker_color'],
                symbol=df['marker_symbol']
            ),
            text=df['hover_name'],
            hovertemplate='%{text}<br>%{x:.4f} ' + x_col + '<br>%{y:.2f} ' + y_col + '<extra></extra>',
            showlegend=False
        )
    )

    # Add dummy traces for color legend
    for col, col_color in color_map.items():
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(symbol="circle", size=size_marker, color=col_color),
                legendgroup="Color",
                showlegend=True,
                name=str(col)
            )
        )

    # Add dummy traces for symbol legend
    for sym, sym_marker in symbol_map.items():
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(symbol=sym_marker, size=size_marker, color="grey"),
                legendgroup="Symbol",
                showlegend=True,
                name=str(sym)
            )
        )

    # Layout
    fig.update_layout(
        xaxis_title=x_label if x_label else x_col.replace('_', ' ').title(),
        yaxis_title=y_label if y_label else y_col.replace('_', ' ').title(),
        font=dict(color=font_color, size=14),
        template="plotly_white",
        legend=dict(tracegroupgap=20, itemsizing='constant'),
        height=600
    )

    if x_log:
        fig.update_xaxes(type="log")

    if save_path:
        fig.write_html(save_path, include_plotlyjs='cdn')

    fig.show()
    return fig

In [43]:
fig_log = plot_energy_vs_grade(
    nrj_param,
    x_col='Grade',
    y_col='energy_MJ',
    color_col='commodities',
    symbol_col='mining_processing_type',
    x_log=True,
    font_color='black',
    save_path=r'data/Parametrization/param_nrj_log.html'
)

# Filtered grades < 0.01
df_filtered = nrj_param[nrj_param['Grade'] < 0.01]
fig_filtered = plot_energy_vs_grade(
    df_filtered,
    x_col='Grade',
    y_col='energy_MJ',
    color_col='commodities',
    symbol_col='mining_processing_type',
    font_color='black',
    save_path=r'data/Parametrization/param_nrj.html'
)

## Environmental flows

In [61]:
biosphere_param = pd.read_excel(r'data/Parametrization/biosphere_parametrization.xlsx')

In [62]:
biosphere_param

Unnamed: 0,env_id,year,compartment_name,substance_id,flow_direction,release_pathway,unit,value,comment,main_id,...,Recovery rate,Comment,company_id,source_id,value_tonnes_main,value_tonnes_group,value_tonnes_match,value_normalized,normalization_key,unit_normalized
0,npri-2132-2023-1,2023,Air,NA - 16,Emission,Stack Emissions,tonnes,127.312,,AB-MAIN-d3a4aba9,...,83.5,Assumed Ni is the main product (Co also included),,https://www.canada.ca/en/environment-climate-c...,35636.0,,35636.0,0.003573,main_id,tonnes/t
1,npri-2132-2023-2,2023,Air,630-08-0,Emission,Stack Emissions,tonnes,189.137,,AB-MAIN-d3a4aba9,...,83.5,,,https://www.canada.ca/en/environment-climate-c...,35636.0,,35636.0,0.005307,main_id,tonnes/t
2,npri-2132-2023-3,2023,Air,11104-93-1,Emission,Stack Emissions,tonnes,1099.925,,AB-MAIN-d3a4aba9,...,83.5,,,https://www.canada.ca/en/environment-climate-c...,35636.0,,35636.0,0.030866,main_id,tonnes/t
3,npri-2132-2023-4,2023,Air,NA - M09,Emission,Stack Emissions,tonnes,43.701,,AB-MAIN-d3a4aba9,...,83.5,,,https://www.canada.ca/en/environment-climate-c...,35636.0,,35636.0,0.001226,main_id,tonnes/t
4,npri-2132-2023-5,2023,Air,NA - M10,Emission,Stack Emissions,tonnes,8.343,,AB-MAIN-d3a4aba9,...,83.5,,,https://www.canada.ca/en/environment-climate-c...,35636.0,,35636.0,0.000234,main_id,tonnes/t
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1284,ENV-GRP-14bfbb82-2023-1,2023,Air,NA - GHG,Emission,,tCO2eq,14002.000,,,...,97.0,,CMP-4a73c5f8,SRC_SSRMiningInc_ESG_2023,,122000.0,122000.0,0.114770,facility_group_id,tCO2eq/t
1285,ENV-GRP-14bfbb82-2023-2,2023,Air,NA - GHG,Emission,,tCO2eq,30057.000,,,...,97.0,,CMP-4a73c5f8,SRC_SSRMiningInc_ESG_2023,,122000.0,122000.0,0.246369,facility_group_id,tCO2eq/t
1286,ENV-GRP-14bfbb82-2023-3,2023,Water,7732-18-5,Withdrawal,,m3,183118.000,,,...,97.0,,CMP-4a73c5f8,SRC_SSRMiningInc_ESG_2023,,122000.0,122000.0,1.500967,facility_group_id,m3/t
1287,ENV-GRP-14bfbb82-2023-4,2023,Water,7732-18-5,Discharged,,m3,25131.000,,,...,97.0,,CMP-4a73c5f8,SRC_SSRMiningInc_ESG_2023,,122000.0,122000.0,0.205992,facility_group_id,m3/t


In [50]:
# Make sure garde is numeric
biosphere_param['Grade'] = pd.to_numeric(biosphere_param['Grade'], errors='coerce')
biosphere_param['Strip ratio'] = pd.to_numeric(biosphere_param['Strip ratio'], errors='coerce')
biosphere_param['Recovery rate'] = pd.to_numeric(biosphere_param['Recovery rate'], errors='coerce')
biosphere_param['value_normalized'] = pd.to_numeric(biosphere_param['value_normalized'], errors='coerce')

In [39]:
# Apply
biosphere_param['commodities'] = biosphere_param['commodities'].apply(lambda x: abbreviate_metals(x, metal_map_lower))

In [51]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

def plot_energy_vs_grade_by_substance(
    df,
    x_col='Grade',
    y_col='value',
    substance_col='substance_id',
    color_col='commodities',
    symbol_col='mining_processing_type',
    hover_name_cols=['facility_name', 'facility_group_name'],
    x_label='Grade (%)',
    y_label='MJ/t ore processed',
    x_log=False,
    font_color="#333333",
    size_marker=10,
    save_path=None
):
    """
    Scatter plot of energy vs grade, with dropdown by substance_id.
    Color = commodities
    Symbol = mining/processing type
    Dropdown allows filtering by substance_id.
    """

    df = df.copy()

    # Build hover_name
    if hover_name_cols and all(col in df.columns for col in hover_name_cols):
        df['hover_name'] = (
            df[hover_name_cols[0]].astype(str)
            + " (" + df[hover_name_cols[1]].astype(str) + ")"
        )
    else:
        df['hover_name'] = df[hover_name_cols[0]] if hover_name_cols else None

    # --- Filter for unique substances ---
    substances = df[substance_col].dropna().unique()
    substances = sorted(substances, key=lambda x: str(x))

    # Prepare color and symbol sequences
    color_sequence = px.colors.qualitative.Plotly
    symbol_sequence = [
        "circle", "square", "diamond", "cross", "x",
        "triangle-up", "triangle-down", "triangle-left", "triangle-right",
        "star", "hexagon", "pentagon"
    ]

    # Create symbol map for unique values
    unique_symbols = df[symbol_col].dropna().unique()
    symbol_map = {sym: symbol_sequence[i % len(symbol_sequence)] for i, sym in enumerate(unique_symbols)}

    # Create color map for unique commodities
    unique_colors = df[color_col].dropna().unique()
    color_map = {col: color_sequence[i % len(color_sequence)] for i, col in enumerate(unique_colors)}

    # Initialize figure
    fig = go.Figure()

    # --- Add one trace per substance ---
    for i, substance in enumerate(substances):
        df_sub = df[df[substance_col] == substance]
        df_sub = df_sub.dropna(subset=[x_col, y_col])
        visible = True if i == 0 else False

        fig.add_trace(
            go.Scatter(
                x=df_sub[x_col],
                y=df_sub[y_col],
                mode='markers',
                marker=dict(
                    size=size_marker,
                    color=df_sub[color_col].map(color_map),
                    symbol=df_sub[symbol_col].map(symbol_map)
                ),
                text=df_sub['hover_name'],
                hovertemplate='%{text}<br>%{x:.4f} ' + x_col + '<br>%{y:.2f} ' + y_label + '<extra></extra>',
                name=str(substance),
                visible=visible,
                showlegend=False
            )
        )

    # --- Add dummy traces for color legend ---
    for col, col_color in color_map.items():
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(symbol="circle", size=size_marker, color=col_color),
                legendgroup="Color",
                showlegend=True,
                name=str(col)
            )
        )

    # --- Add dummy traces for symbol legend ---
    for sym, sym_marker in symbol_map.items():
        fig.add_trace(
            go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(symbol=sym_marker, size=size_marker, color="grey"),
                legendgroup="Symbol",
                showlegend=True,
                name=str(sym)
            )
        )

    # --- Dropdown buttons ---
    buttons = []
    for i, substance in enumerate(substances):
        visibility = [False]*len(substances) + [True]*(len(color_map)+len(symbol_map))
        visibility[i] = True
        buttons.append(
            dict(
                label=str(substance),
                method="update",
                args=[{"visible": visibility},
                      {"title": f"Energy vs Grade – {substance}"}]
            )
        )

    # Layout
    fig.update_layout(
        updatemenus=[dict(buttons=buttons, direction="down", showactive=True)],
        xaxis_title=x_label if x_label else x_col.replace('_', ' ').title(),
        yaxis_title=y_label if y_label else y_col.replace('_', ' ').title(),
        font=dict(color=font_color, size=14),
        template="plotly_white",
        legend=dict(tracegroupgap=20, itemsizing='constant'),
        height=600,
        title=f"Energy vs Grade – {substances[0]}"
    )

    if x_log:
        fig.update_xaxes(type="log")

    if save_path:
        fig.write_html(save_path, include_plotlyjs='cdn')

    fig.show()
    return fig


In [53]:
plot_energy_vs_grade_by_substance(
    biosphere_param,
    x_col='Grade',            # ou autre colonne quantitative
    y_col='value_normalized', # par exemple
    substance_col='substance_id',
    color_col='commodities',
    symbol_col='mining_processing_type',
    y_label='MJ/t ore processed'
)

KeyError: 'facility_name'

## Land occupation