In [1]:
import pandas as pd

# Import EI and MetalliCan classification

## Extract biosphere and technosphere flows from EI

In [36]:
# Brightway imports
import bw2analyzer as ba
import bw2calc as bc
import bw2data as bd
import bw2io as bi
import brightway2 as bw

In [37]:
BW_PROJECT = 'metallican' # insert your project name here
bd.projects.set_current(BW_PROJECT)
bd.databases

Databases dictionary with 6 object(s):
	MetalliCan
	Regioinvent
	biosphere3
	biosphere3_spatialized_flows
	cutoff310
	cutoff310 regionalized

In [38]:
# For biosphere 
db = bw.Database('biosphere3')
records = []
for ds in db:
    records.append({
            "Activity": ds.get("name"),
            "Categories": ds.get("categories"),
            "Type": ds.get("type"),
            "Unit": ds.get("unit"),
            "Flow key": ds.key,
        })
df_bio_ei = pd.DataFrame(records)

In [39]:
df_bio_ei

Unnamed: 0,Activity,Categories,Type,Unit,Flow key
0,Propanol,"(water,)",emission,kilogram,"(biosphere3, 4d567c70-f938-4a8f-b2ad-10c0b8f13..."
1,Hydrogen peroxide,"(water, ocean)",emission,kilogram,"(biosphere3, 77aa43c5-76c6-4465-b0b4-f00d4d14c..."
2,Protactinium-234,"(water, ground-)",emission,kilo Becquerel,"(biosphere3, 769d4ef0-455c-42a8-9eea-92ede2690..."
3,Propyl amine,"(air, non-urban air or from high stacks)",emission,kilogram,"(biosphere3, 3d5a4fbf-e44b-4b0d-885d-534fc6968..."
4,Fluoranthene,"(air,)",emission,kilogram,"(biosphere3, d57ab983-120a-448b-a681-b876e1add..."
...,...,...,...,...,...
4357,Nitrate,"(water, ground-)",emission,kilogram,"(biosphere3, b9291c72-4b1d-4275-8068-4c707dc3c..."
4358,Neodymium,"(natural resource, in ground)",natural resource,kilogram,"(biosphere3, db0c855c-e9ef-58d9-97cc-960e646fc..."
4359,Mercury,"(natural resource, in ground)",natural resource,kilogram,"(biosphere3, 54b9cbd0-65df-4fd3-8a19-dd3b8eccc..."
4360,Imazamox,"(water, ground-)",emission,kilogram,"(biosphere3, d2060aa6-f2e8-5d8e-973a-54dd7bab6..."


In [40]:
df_bio_ei.to_csv(r'data/Mappings/biosphere3.csv', index=False)

In [9]:
# For spatialized biosphere 
db = bw.Database('biosphere3_spatialized_flows')
records = []
for ds in db:
    records.append({
            "Activity": ds.get("name"),
            "Categories": ds.get("categories"),
            "Type": ds.get("type"),
            "Unit": ds.get("unit"),
            "Flow key": ds.key,
        })
df_bio_ei_reg = pd.DataFrame(records)

In [10]:
df_bio_ei_reg

Unnamed: 0,Activity,Categories,Type,Unit,Flow key
0,"Sulfuric acid, US-IL","(air, low population density, long-term)",emission,kilogram,"(biosphere3_spatialized_flows, 8c7f7e69-9bec-4..."
1,"Transformation, to traffic area, rail/road emb...","(natural resource, land)",natural resource,square meter,"(biosphere3_spatialized_flows, 6694c89f-2ee1-4..."
2,"Transformation, from inland waterbody, unspeci...","(natural resource, land)",natural resource,square meter,"(biosphere3_spatialized_flows, 786c4682-66a7-4..."
3,"Occupation, pasture, man made, CA-NU","(natural resource, land)",natural resource,square meter-year,"(biosphere3_spatialized_flows, 7d278611-732a-4..."
4,"Water, unspecified natural origin, GU","(natural resource, fossil well)",natural resource,cubic meter,"(biosphere3_spatialized_flows, a536e0d6-2aba-4..."
...,...,...,...,...,...
110554,"Particulate Matter, < 2.5 um, AE","(air, low population density, long-term)",emission,kilogram,"(biosphere3_spatialized_flows, b5c50bae-dd96-4..."
110555,"Transformation, to river, natural (non-use), SE","(natural resource, land)",natural resource,square meter,"(biosphere3_spatialized_flows, 2d8cf930-6378-4..."
110556,"Transformation, to annual crop, non-irrigated,...","(natural resource, land)",natural resource,square meter,"(biosphere3_spatialized_flows, af5bed6d-e7fc-4..."
110557,"Transformation, from annual crop, KM","(natural resource, land)",natural resource,square meter,"(biosphere3_spatialized_flows, fa855584-22a3-4..."


In [11]:
df_bio_ei_reg.to_csv(r'data/Mappings/biosphere3_spatialized_flows.csv', index=False)

In [12]:
# For technosphere
db = bw.Database('ecoinvent-3.10.1-cutoff regionalized')
provider_keys = set()
# collect every provider that appears in any technosphere exchange
for ds in db:
    for exc in ds.technosphere():
        provider_keys.add(exc.input.key)
records = []
for key in provider_keys:
    act = bw.get_activity(key)
    records.append({
            "Reference product": act.get("reference product"),
            "Flow name": act.get("name"),
            "Location": act.get("location"),
            "Unit": act.get("unit"),
            "Flow key": key,
        })
df_tech_ei_reg = pd.DataFrame(records)

In [13]:
df_tech_ei_reg

Unnamed: 0,Reference product,Flow name,Location,Unit,Flow key
0,"heat, district or industrial, other than natur...","heat and power co-generation, wood chips, 6667...",RU,megajoule,"(ecoinvent-3.10.1-cutoff regionalized, 8d1ee21..."
1,"manure, solid, cattle","market for manure, solid, cattle",GLO,kilogram,"(ecoinvent-3.10.1-cutoff regionalized, 536a02e..."
2,"electricity, high voltage","electricity production, hard coal",AR,kilowatt hour,"(ecoinvent-3.10.1-cutoff regionalized, 3b8e707..."
3,goods wagon,market for goods wagon,GLO,unit,"(ecoinvent-3.10.1-cutoff regionalized, c91b004..."
4,hard coal,consumption market for hard coal,AR,kilogram,"(Regioinvent, aeaa45e522914312b4e25ad436f32112)"
...,...,...,...,...,...
25441,"photovoltaic slanted-roof installation, 3kWp, ...",consumption market for photovoltaic slanted-ro...,JP,unit,"(Regioinvent, 8906ba4feb2b495eb3d76340f757ef4c)"
25442,"sewage sludge, 70% water, WWT-SLF, waste plast...","market for sewage sludge, 70% water, WWT-SLF, ...",RoW,kilogram,"(ecoinvent-3.10.1-cutoff regionalized, 0ef2c69..."
25443,monoammonium phosphate,monoammonium phosphate production,RNA,kilogram,"(ecoinvent-3.10.1-cutoff regionalized, 1e81ccf..."
25444,"concrete, 30MPa","concrete production, 30MPa, for drilled piles,...",CH,cubic meter,"(ecoinvent-3.10.1-cutoff regionalized, ca6bd76..."


In [14]:
df_tech_ei_reg.to_csv(r'data/Mappings/ecoinvent-3.10.1-cutoff_regionalized.csv', index=False)

In [15]:
# For technosphere
db = bw.Database('Regioinvent')
provider_keys = set()
# collect every provider that appears in any technosphere exchange
for ds in db:
    for exc in ds.technosphere():
        provider_keys.add(exc.input.key)
records = []
for key in provider_keys:
    act = bw.get_activity(key)
    records.append({
            "Reference product": act.get("reference product"),
            "Flow name": act.get("name"),
            "Location": act.get("location"),
            "Unit": act.get("unit"),
            "Flow key": key,
        })
df_regioinvent = pd.DataFrame(records)

In [16]:
df_regioinvent.to_csv(r'data/Mappings/Regioinvent.csv', index=False)

## Extract information from MetalliCan

### Import energy table from MetalliCan

In [2]:
energy_table = pd.read_csv(r'C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\database\CSV\energy_table.csv')

In [3]:
energy_table

Unnamed: 0,row_id,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,energy_type,unit,value,source,source_id
0,NRJ-e7e6a960-2023-1,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Fuel consumption,GJ,4266454,2023-Sustainability-Performance_data,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...
1,NRJ-e7e6a960-2023-2,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Diesel,GJ,3947008,2023-Sustainability-Performance_data,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...
2,NRJ-e7e6a960-2023-3,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Light Fuel & Gasoline,GJ,71904,2023-Sustainability-Performance_data,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...
3,NRJ-e7e6a960-2023-4,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Propane,GJ,182468,2023-Sustainability-Performance_data,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...
4,NRJ-e7e6a960-2023-5,2023,Site-specific,Agnico Eagle Mines Limited,Detour Lake,QC-MAIN-e7e6a960,-,,mining,"Open-pit, concentrator",Gold,Explosives,GJ,65074,2023-Sustainability-Performance_data,SRC_AgnicoEagleMinesLimited_2023-Sustainabilit...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,NRJ-c5fefb01-2023-7,2023,Site-specific,Wesdome Gold Mines Ltd.,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Energy intensity,GJ/t milled,1.07,ESG_data_2023.xlsx,SRC_WesdomeGoldMinesLtd._ESG_data_2023
265,NRJ-c5fefb01-2023-8,2023,Site-specific,Wesdome Gold Mines Ltd.,Kiena,ON-MAIN-c5fefb01,-,,mining,"Open-pit, concentrator",Gold,Energy intensity,GJ/oz,5.78,ESG_data_2023.xlsx,SRC_WesdomeGoldMinesLtd._ESG_data_2023
266,NRJ-CMP-d5f1c66b-2023-18,2023,Company-level,Glencore,-,-,-,CMP-d5f1c66b,mining and manufacturing,x,x,Energy intensity of metals mining,GJ/t Cu-eq,40.3,2024+ESG+data+book.xlsx,SRC_Glencore_2024+ESG+data+book
267,NRJ-CMP-d5f1c66b-2023-19,2023,Company-level,Glencore,-,-,-,CMP-d5f1c66b,mining and manufacturing,x,x,Energy intensity of metals smelting,GJ/t Cu-eq,30.3,2024+ESG+data+book.xlsx,SRC_Glencore_2024+ESG+data+book


In [4]:
# Create a df with unique 'energy_type' value
energy_types = energy_table[['energy_type']].drop_duplicates().reset_index(drop=True)

In [5]:
energy_types

Unnamed: 0,energy_type
0,Fuel consumption
1,Diesel
2,Light Fuel & Gasoline
3,Propane
4,Explosives
...,...
58,Energy consumed from renewable energy
59,Coke & Petroleum coke
60,Energy intensity of metals mining
61,Energy intensity of metals smelting


In [6]:
energy_types.to_csv(r'data/Mappings/energy_flows_MetalliCan.csv', index=False)

### Import material use table from MetalliCan

In [8]:
material_use_table = pd.read_excel(r'C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\data\Tables\extracted_data.xlsx', sheet_name='Material_use')

In [9]:
material_use_table

Unnamed: 0,year,reporting_level,reported_company_nrcan,name,main_id,facility_group_id,company_id,facility_type,mining_processing_type,commodity,compartment,flow_type,subflow_type,substance_name,unit,value,comment,source
0,2023,Site-specific,Alamos Gold Inc,Young-Davidson,ON-MAIN-687b8c8d,-,,mining,"Underground, concentrator",Gold,-,material use,Total sodium cyanide used,-,t,1955.7,,Data_2023.xlsx
1,2023,Site-specific,Alamos Gold Inc,Young-Davidson,ON-MAIN-687b8c8d,-,,mining,"Underground, concentrator",Gold,-,material use,Total blasting agents used e.g. ANFO,-,t,2599.5,,Data_2023.xlsx
2,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Explosives,-,t,1483.4,,2023_iamgold-esg-performance-data-final_protected
3,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Lime,-,t,5069.3,,2023_iamgold-esg-performance-data-final_protected
4,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Cyanide,-,t,399.3,,2023_iamgold-esg-performance-data-final_protected
5,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Caustic soda,-,t,86.6,,2023_iamgold-esg-performance-data-final_protected
6,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Sulfur dioxide,-,t,539.0,,2023_iamgold-esg-performance-data-final_protected
7,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Hydraulic oil,-,t,200.2,,2023_iamgold-esg-performance-data-final_protected
8,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Flocculant,-,t,46.2,,2023_iamgold-esg-performance-data-final_protected
9,2023,Site-specific,IAMGOLD Corporation,Westwood-Doyon,QC-MAIN-02884fb5,-,,mining,"Underground, concentrator","Gold, silver",-,material use,Transmission oil,-,t,8.1,,2023_iamgold-esg-performance-data-final_protected


In [10]:
# Create a df with unique 'subflow_type' value
material_use_types = material_use_table[['subflow_type']].drop_duplicates().reset_index(drop=True)

In [11]:
material_use_types

Unnamed: 0,subflow_type
0,Total sodium cyanide used
1,Total blasting agents used e.g. ANFO
2,Explosives
3,Lime
4,Cyanide
5,Caustic soda
6,Sulfur dioxide
7,Hydraulic oil
8,Flocculant
9,Transmission oil


In [12]:
material_use_types.to_csv(r'data/Mappings/material_use_flows_MetalliCan.csv', index=False)

### Import environmental table from MetalliCan

In [12]:
env_table = pd.read_csv(r'C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\database\CSV\environment_table.csv')

In [13]:
env_table

Unnamed: 0,env_id,year,compartment_id,substance_id,unit,value,comment,main_id,facility_group_id,company_id,source_id
0,1754055,2023,comp82641eec,NA - GHG,tCO2eq,5610.0000240055615,,ON-MAIN-6e9b2449,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
1,1754056,2023,comp82641eec,NA - GHG,tCO2eq,6403.000027398861,,ON-MAIN-63b394c3,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
2,1754057,2023,comp82641eec,NA - GHG,tCO2eq,229916.00098382577,,BC-MAIN-599152a0,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
3,1754058,2023,comp82641eec,NA - GHG,tCO2eq,2639.000011292456,,ON-MAIN-52224e1e,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
4,1754060,2023,comp82641eec,NA - GHG,tCO2eq,4225.000018079055,,ON-MAIN-48fe2205,,,"Jolleys, M. et al (2024). Mineral Extraction s..."
...,...,...,...,...,...,...,...,...,...,...,...
4447,ENV-6b4800fe-2023-7,2023,comp82641eec,NA - M16,?,3.5,,BC-MAIN-6b4800fe,-,,SRC_TasekoMinesLimited_ESG_2023
4448,ENV-bf503b6b-2023-8,2023,comp82641eec,NA - M16,t,45.9,,BC-MAIN-bf503b6b,-,,SRC_TeckResourcesLimited_Teck-Sustainability-D...
4449,ENV-3bb6b7cd-2023-9,2023,comp82641eec,NA - M16,t,12.52,,BC-MAIN-3bb6b7cd,-,,SRC_TeckResourcesLimited_Teck-Sustainability-D...
4450,ENV-9de9bb0d-2023-14,2023,comp82641eec,NA - M16,t,0.2,,QC-MAIN-9de9bb0d,-,,SRC_WesdomeGoldMinesLtd_2023-ESG-Data-Tables


In [14]:
# Extract unique combination of compartment_id and substance_id
env_substances = env_table[['compartment_id', 'substance_id']].drop_duplicates().reset_index(drop=True)
env_substances

Unnamed: 0,compartment_id,substance_id
0,comp82641eec,NA - GHG
1,compa50ae012,NA - 16
2,compa50ae012,NA - 02
3,compa50ae012,630-08-0
4,compa50ae012,NA - 04
...,...,...
357,comp82641eec,NA - 12
358,comp82641eec,2025884
359,comp82641eec,NA - SOx
360,comp82641eec,7440-61-1


In [16]:
substances_table = pd.read_csv(r'C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\database\CSV/substances_table_new.csv')
compartments_table = pd.read_csv(r'C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\database\CSV/compartments_table_new.csv')

In [18]:
# Merge env_substances and substances + compartments_table to get the substance and compartment names
env_substances = env_substances.merge(substances_table[['substance_id', 'substance_name']],
                                       on='substance_id',
                                       how='left')

In [19]:
env_substances

Unnamed: 0,compartment_id,substance_id,substance_name
0,comp82641eec,NA - GHG,GHG
1,compa50ae012,NA - 16,Ammonia (total)
2,compa50ae012,NA - 02,Arsenic (and its compounds)
3,compa50ae012,NA - 02,Arsenic
4,compa50ae012,630-08-0,Carbon monoxide
...,...,...,...
419,comp82641eec,2025884,SO2
420,comp82641eec,NA - SOx,SOx
421,comp82641eec,7440-61-1,Uranium
422,comp82641eec,NA - M16,Volatile Organic Compounds (Total)


In [20]:
compartments_table

Unnamed: 0,compartment_name,release_pathway,flow_direction,compartment_id
0,Air,Stack Emissions,Emission,compa50ae012
1,Air,Storage / Handling,Emission,comp1dcc546c
2,Air,Fugitive Emissions,Emission,comp7d3e3488
3,Air,Other,Emission,compb30db82f
4,Water,Direct Discharge,Emission,comp567d1198
5,Water,Waterbody,Emission,compb4e31bc7
6,Land,Spills,Emission,comp4fceff4c
7,Land,Other,Emission,comp3af7aa38
8,Air,Spills,Emission,comp3ca42ad5
9,Water,Spills,Emission,compea433608


In [21]:
env_substances = env_substances.merge(compartments_table[['compartment_id', 'compartment_name', 'release_pathway', 'flow_direction']],
                                       on='compartment_id',
                                       how='left')

In [22]:
env_substances

Unnamed: 0,compartment_id,substance_id,substance_name,compartment_name,release_pathway,flow_direction
0,comp82641eec,NA - GHG,GHG,Air,Unspecified,Emission
1,compa50ae012,NA - 16,Ammonia (total),Air,Stack Emissions,Emission
2,compa50ae012,NA - 02,Arsenic (and its compounds),Air,Stack Emissions,Emission
3,compa50ae012,NA - 02,Arsenic,Air,Stack Emissions,Emission
4,compa50ae012,630-08-0,Carbon monoxide,Air,Stack Emissions,Emission
...,...,...,...,...,...,...
419,comp82641eec,2025884,SO2,Air,Unspecified,Emission
420,comp82641eec,NA - SOx,SOx,Air,Unspecified,Emission
421,comp82641eec,7440-61-1,Uranium,Air,Unspecified,Emission
422,comp82641eec,NA - M16,Volatile Organic Compounds (Total),Air,Unspecified,Emission


In [23]:
# Classify env_substances alphabetically by substance_name and compartment_name
env_substances.sort_values(by=['compartment_name', 'substance_name', 'release_pathway', 'flow_direction'], inplace=True)

In [24]:
env_substances

Unnamed: 0,compartment_id,substance_id,substance_name,compartment_name,release_pathway,flow_direction
310,compb30db82f,95-63-6,"1,2,4-Trimethylbenzene",Air,Other,Emission
292,comp7d3e3488,57-97-6,"7,12-Dimethylbenz[a]anthracene",Air,Fugitive Emissions,Emission
213,compa50ae012,194-59-2,"7H-Dibenzo[c,g]carbazole",Air,Stack Emissions,Emission
255,comp7d3e3488,83-32-9,Acenaphthene,Air,Fugitive Emissions,Emission
51,compa50ae012,83-32-9,Acenaphthene,Air,Stack Emissions,Emission
...,...,...,...,...,...,...
388,comp93f08204,7732-18-5,Water,Water,Unspecified,Consumption
389,compc80509fb,7732-18-5,Water,Water,Unspecified,Discharged
387,comp412a1227,7732-18-5,Water,Water,Unspecified,Withdrawal
150,comp567d1198,NA - 14,Zinc (and its compounds),Water,Direct Discharge,Emission


In [25]:
env_substances.to_csv(r'data/Mappings/env_substances_MetalliCan.csv', index=False)

In [7]:
# # Create mapping NPRI-EI through IW+ mapping
# # Import existing concordances 
# mapping_npri_iw = pd.read_excel(r'data/Mappings/openIO_IW_EI_concordance.xlsx', sheet_name='NPRI_to_IW21')
# mapping_ei_iw = pd.read_excel(r'data/Mappings/openIO_IW_EI_concordance.xlsx', sheet_name='EI_to_IW+')
# 
# # Standardize column names for merging
# mapping_npri_iw.rename(columns={'IMPACT World+ flows': 'iw_name'}, inplace=True)
# mapping_ei_iw.rename(columns={'iw name': 'iw_name'}, inplace=True)
# 
# # Standardize column names for merging
# mapping_npri_iw.rename(columns={'IMPACT World+ flows': 'iw_name'}, inplace=True)
# mapping_ei_iw.rename(columns={'iw name': 'iw_name'}, inplace=True)
# 
# # Add the 'EI' column to df_npri_iw based on matching iw_name with df_ei_iw
# mapping_npri_iw['EI'] = mapping_npri_iw['iw_name'].map(
#     lambda iw: mapping_ei_iw[mapping_ei_iw['iw_name'] == iw]['ecoinvent name'].iloc[0]
#     if iw in mapping_ei_iw['iw_name'].values else 'No match found'
# )


In [13]:
substances_table_df = substances_table.merge(mapping_npri_iw[['substance_name', 'EI']],
              on='substance_name',
              how='left')

In [14]:
substances_table_df

Unnamed: 0,substance_id,substance_name,EI
0,SUB07971a0766,"1,2,4-Trimethylbenzene",No match found
1,SUBbdfdffeb95,1-Nitropyrene,No match found
2,SUB0f01e7c1d0,2-Butoxyethanol,No match found
3,SUB324959500a,"7,12-Dimethylbenz[a]anthracene",No match found
4,SUBfee6b26acd,"7H-Dibenzo[c,g]carbazole",No match found
...,...,...,...
169,SUB787e5ac1a2,Vanadium (and its compounds),Vanadium
170,SUBe849a2ff09,Xylene (all isomers),Xylene
171,SUBae95939c3f,Zinc (and its compounds),Zinc
172,SUB090a1237c1,i-Butyl alcohol,2-Methyl-1-propanol


# Create biosphere mappings

In [26]:
df_bio_ei_reg = pd.read_csv(r'data/Mappings/biosphere3_spatialized_flows.csv')

In [27]:
df_bio_ei_reg

Unnamed: 0,Activity,Categories,Type,Unit,Flow key
0,"Sulfuric acid, US-IL","('air', 'low population density, long-term')",emission,kilogram,"('biosphere3_spatialized_flows', '8c7f7e69-9be..."
1,"Transformation, to traffic area, rail/road emb...","('natural resource', 'land')",natural resource,square meter,"('biosphere3_spatialized_flows', '6694c89f-2ee..."
2,"Transformation, from inland waterbody, unspeci...","('natural resource', 'land')",natural resource,square meter,"('biosphere3_spatialized_flows', '786c4682-66a..."
3,"Occupation, pasture, man made, CA-NU","('natural resource', 'land')",natural resource,square meter-year,"('biosphere3_spatialized_flows', '7d278611-732..."
4,"Water, unspecified natural origin, GU","('natural resource', 'fossil well')",natural resource,cubic meter,"('biosphere3_spatialized_flows', 'a536e0d6-2ab..."
...,...,...,...,...,...
110554,"Particulate Matter, < 2.5 um, AE","('air', 'low population density, long-term')",emission,kilogram,"('biosphere3_spatialized_flows', 'b5c50bae-dd9..."
110555,"Transformation, to river, natural (non-use), SE","('natural resource', 'land')",natural resource,square meter,"('biosphere3_spatialized_flows', '2d8cf930-637..."
110556,"Transformation, to annual crop, non-irrigated,...","('natural resource', 'land')",natural resource,square meter,"('biosphere3_spatialized_flows', 'af5bed6d-e7f..."
110557,"Transformation, from annual crop, KM","('natural resource', 'land')",natural resource,square meter,"('biosphere3_spatialized_flows', 'fa855584-22a..."


In [85]:
from rapidfuzz import process, fuzz
import pandas as pd
import ast

def extract_main_compartment(categories):
    """
    Safely extracts the first level compartment like 'air', 'water', etc.
    """
    try:
        if isinstance(categories, str):
            parsed = ast.literal_eval(categories)
            if isinstance(parsed, (list, tuple)) and len(parsed) > 0:
                return parsed[0].strip().lower()
    except Exception:
        pass
    return None

def match_substances_to_ecoinvent(substance_df, ecoinvent_df, country_code='CA', score_cutoff=60, spatialized=True):
    """
    Matches substance names and compartments to ecoinvent biosphere flows using fuzzy string matching.
    """

    # Optional filtering for spatialized biosphere flows
    if spatialized:
        ecoinvent_filtered = ecoinvent_df[
            ecoinvent_df['Activity'].str.contains(f'{country_code}-', na=False) |
            ecoinvent_df['Categories'].astype(str).str.contains(f'{country_code}-', na=False)
        ].copy()
    else:
        ecoinvent_filtered = ecoinvent_df.copy()

    # Ensure compartment parsing
    ecoinvent_filtered['main_compartment'] = ecoinvent_filtered['Categories'].apply(extract_main_compartment)

    results = []
    for _, row in substance_df.iterrows():
        substance = str(row['substance_name']).strip()
        compartment = str(row['compartment_name']).strip().lower()

        # Filter candidates with the same compartment
        candidates = ecoinvent_filtered[ecoinvent_filtered['main_compartment'] == compartment].copy()

        # Normalize activity column
        candidates['normalized_activity'] = candidates['Activity'].astype(str).str.strip().str.lower()
        search_list = list(candidates['normalized_activity'])

        # Match using rapidfuzz
        if search_list:
            match_result = process.extractOne(
                substance.lower(), search_list, scorer=fuzz.token_sort_ratio, score_cutoff=score_cutoff
            )

            if match_result:
                match_str, score, _ = match_result
                best_match = candidates[candidates['normalized_activity'] == match_str].iloc[0]

                results.append({
                    'substance_name': substance,
                    'compartment_name': row['compartment_name'],
                    'release_pathway': row.get('release_pathway'),
                    'flow_direction': row.get('flow_direction'),
                    'DB_to_map': 'biosphere3_spatialized_flows' if spatialized else 'biosphere3',
                    'Activity': best_match['Activity'],
                    'Categories': best_match['Categories'],
                    'Type': best_match['Type'],
                    'Unit': best_match['Unit'],
                    'Match_score': score
                })
                continue

        # No match fallback
        results.append({
            'substance_name': substance,
            'compartment_name': row['compartment_name'],
            'release_pathway': row.get('release_pathway'),
            'flow_direction': row.get('flow_direction'),
            'DB_to_map': None,
            'Activity': None,
            'Categories': None,
            'Type': None,
            'Unit': None,
            'Match_score': None
        })

    return pd.DataFrame(results)


In [86]:
mapping_spatialized = match_substances_to_ecoinvent(env_substances, df_bio_ei_reg, spatialized=True)

In [87]:
mapping_spatialized

Unnamed: 0,substance_name,compartment_name,release_pathway,flow_direction,DB_to_map,Activity,Categories,Type,Unit,Match_score
0,"1,2,4-Trimethylbenzene",Air,Other,Emission,,,,,,
1,"7,12-Dimethylbenz[a]anthracene",Air,Fugitive Emissions,Emission,,,,,,
2,"7H-Dibenzo[c,g]carbazole",Air,Stack Emissions,Emission,,,,,,
3,Acenaphthene,Air,Fugitive Emissions,Emission,,,,,,
4,Acenaphthene,Air,Stack Emissions,Emission,,,,,,
...,...,...,...,...,...,...,...,...,...,...
419,Water,Water,Unspecified,Consumption,,,,,,
420,Water,Water,Unspecified,Discharged,,,,,,
421,Water,Water,Unspecified,Withdrawal,,,,,,
422,Zinc (and its compounds),Water,Direct Discharge,Emission,,,,,,


In [82]:
df_bio_ei['main_compartment'] = df_bio_ei['Categories'].apply(extract_main_compartment)

In [88]:
mapping_global = match_substances_to_ecoinvent(env_substances, df_bio_ei, spatialized=False)

In [89]:
mapping_global

Unnamed: 0,substance_name,compartment_name,release_pathway,flow_direction,DB_to_map,Activity,Categories,Type,Unit,Match_score
0,"1,2,4-Trimethylbenzene",Air,Other,Emission,,,,,,
1,"7,12-Dimethylbenz[a]anthracene",Air,Fugitive Emissions,Emission,,,,,,
2,"7H-Dibenzo[c,g]carbazole",Air,Stack Emissions,Emission,,,,,,
3,Acenaphthene,Air,Fugitive Emissions,Emission,,,,,,
4,Acenaphthene,Air,Stack Emissions,Emission,,,,,,
...,...,...,...,...,...,...,...,...,...,...
419,Water,Water,Unspecified,Consumption,,,,,,
420,Water,Water,Unspecified,Discharged,,,,,,
421,Water,Water,Unspecified,Withdrawal,,,,,,
422,Zinc (and its compounds),Water,Direct Discharge,Emission,,,,,,


In [66]:
mapping_reg_df = match_substances_to_ecoinvent(env_substances, df_bio_ei)

# Data coverage statistics

In [17]:
data_coverage = pd.read_csv('data/MetalliCan/data_coverage_per_table.csv')

In [20]:
data_coverage

Unnamed: 0,main_id,facility_type,facility_name,commodities,Archetype table,By-product ratios table,Climate category table,Conflict table,Energy table,Environmental intensity table,...,Land occupation table,Peatland table,Population table,Prioritize conservation areas table,Production table,Protected & Indigenous land table,Reserves table,Tailings table,Water risk table,Weather table
0,QC-MAIN-5801b453,advanced project,Abcourt-Barvue,"Zinc, silver",0,0,9,0,0,0,...,2,1,4,1,0,30,0,0,32,128
1,QC-MAIN-d29e0839,mining,Sleeping Giant,"Gold, silver",1,0,9,0,0,0,...,6,1,4,1,0,44,2,0,32,128
2,QC-MAIN-f1ff4920,advanced project,Akasaba West,"Gold, copper",0,0,9,0,0,0,...,10,1,4,1,0,46,0,0,32,128
3,NU-MAIN-4ea8ac9d,mining,Amaruq,Gold,1,0,9,0,8,2,...,0,1,4,1,5,1,4,0,32,128
4,QC-MAIN-e7e6a960,mining,Canadian Malartic,"Gold, silver",1,0,9,0,8,2,...,6,1,4,1,1,41,5,2,32,128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,ON-MAIN-c5fefb01,mining,Mishi,Gold,1,0,9,0,8,6,...,4,1,4,1,5,11,5,0,32,128
266,BC-MAIN-feb13c30,advanced project,Record Ridge,Magnesium,0,0,9,0,0,0,...,0,1,4,1,0,24,0,0,32,128
267,YT-MAIN-5e0d8b48,advanced project,Casino,"Copper, gold, molybdenum, silver",0,0,9,0,0,0,...,0,1,4,1,0,2,0,0,32,128
268,ON-MAIN-12c68d49,advanced project,Eagle's Nest,"Nickel, copper, gold, platinum, palladium",0,0,9,0,0,0,...,0,1,4,1,0,1,0,0,32,128


In [24]:
col_of_interest = ['Energy table', 'Environmental intensity table', 'Production table', 'Environmental flows table', 'Tailings table']
stat_data_coverage = data_coverage[col_of_interest].describe()

In [25]:
stat_data_coverage

Unnamed: 0,Energy table,Environmental intensity table,Production table,Environmental flows table,Tailings table
count,270.0,270.0,270.0,270.0,270.0
mean,0.655556,0.218519,1.177778,52.762963,0.318519
std,2.035892,0.990864,2.724982,102.823067,1.268151
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,48.75,0.0
max,9.0,8.0,21.0,758.0,15.0


In [None]:
data_coverage