In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

# Imports and cleaning

## Imports

In [2]:
# Import files 
main_gdf = gpd.read_file(r'data/Tables/GeoJSON/main_gdf.geojson')
tailings_gdf = gpd.read_file(r'data/Tables/GeoJSON/tailings_gdf.geojson')
min_can_gdf = gpd.read_file(r'data/Tables/GeoJSON/min_can_gdf.geojson')
conflict_gdf = gpd.read_file(r'data/Tables/GeoJSON/conflict_gdf.geojson')
ghg_gdf = gpd.read_file(r'data/Tables/GeoJSON/ghg_gdf.geojson')
pollutant_gdf = gpd.read_file(r'data/Tables/GeoJSON/pollutant_gdf.geojson')
climate_category_gdf = gpd.read_file(r'data/Tables/GeoJSON/climate_category_gdf.geojson')
weather_gdf = gpd.read_file(r'data/Tables/GeoJSON/weather_gdf.geojson')
peatland_gdf = gpd.read_file(r'data/Tables/GeoJSON/peatland_gdf.geojson')
population_gdf = gpd.read_file(r'data/Tables/GeoJSON/population_gdf.geojson')
water_risk_gdf = gpd.read_file(r'data/Tables/GeoJSON/water_risk_gdf.geojson')
land_cover_gdf = gpd.read_file(r'data/Tables/GeoJSON/land_cover_gdf.geojson')
npv_gdf = gpd.read_file(r'data/Tables/GeoJSON/npv_gdf.geojson')
indigenous_can_gdf = gpd.read_file(r'data/Tables/GeoJSON/indigenous_can_gdf.geojson')
wpda_gdf_polygons = gpd.read_file(r'data/Tables/GeoJSON/wpda_gdf_polygons.geojson')
wpda_gdf_points = gpd.read_file(r'data/Tables/GeoJSON/wpda_gdf_points.geojson')
tang_canada_gdf = gpd.read_file(r'data/Tables/GeoJSON/tang_canada_gdf.geojson')
maus_canada_gdf = gpd.read_file(r'data/Tables/GeoJSON/maus_canada_gdf.geojson')

In [18]:
peatland_gdf['peatland_presence']

0                  No data
1      peat in soil mosaic
2           peat dominated
3                  No data
4                  No data
              ...         
265                No data
266                No data
267                No data
268         peat dominated
269                No data
Name: peatland_presence, Length: 270, dtype: object

In [16]:
weather_gdf['scenario'].unique()

array(['ssp126', 'ssp245', 'ssp370', 'ssp585'], dtype=object)

In [15]:
weather_gdf['year'].unique()

array([2025, 2030, 2035, 2040, 2045, 2050, 2055, 2060, 2065, 2070, 2075,
       2080, 2085, 2090, 2095, 2100], dtype=int64)

In [8]:
pollutant_gdf['watershed'].unique()

array(['St. Lawrence Drainage Area', 'Arctic Drainage Area',
       'Southwestern Hudson Bay Drainage Area',
       'Western and Northern Hudson Bay Drainage Area',
       'Great Slave Lake Drainage Area',
       'Northern Québec and Labrador Drainage Area',
       'Pacific Drainage Area', 'Nelson River Drainage Area',
       'Maritime Provinces Drainage Area', 'Yukon River Drainage Area'],
      dtype=object)

In [17]:
ownership_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Ownership')
production_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Production')
energy_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Energy')
environment_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Env')
archetype_df = pd.read_excel(r'data/Tables/extracted_data.xlsx', sheet_name='Archetype')

In [19]:
production_df['material_type'].unique()

array(['Ore processed', 'Intermediate metal', 'Concentrate', 'Ore mined',
       'Ore grade', 'Mill grade', 'Mill recovery rate',
       'Intermediate metal?', '?', 'Concentrate & pellets', 'Flat',
       'Long/wire rod, bars, slabs', 'Mining production', nan,
       'Concentrate produced', 'Intermediate metal ', 'Waste rock mined',
       'Waste mined', 'Carats recovered', 'Tonnes mined', 'Head grade',
       'Recovery rate', 'Ore mined and hauled', 'Concentrate grade',
       'Carats recovered|Canada', 'Carats recovered|World', 'Ore milled',
       'Material mined', 'EAF production', 'Nominal crude steel capacity',
       'Refined metal', 'Refinery production', 'Ore grade?',
       'Head grade?', 'Contained metal in concentrate',
       'Total contained metal in concentrate and doré produced',
       'Strip ratio', 'Grade', 'Ore mined?', 'Mill ore grade',
       'Ore mined|OP ore', 'Ore mined|OP waste', 'Ore mined|Underground',
       'Smelter production', 'Smelter grade alumina', 'D

## Cleaning 

In [86]:
main_gdf.drop(columns=["longitude", "latitude"], inplace=True)

In [87]:
wpda_gdf_polygons.rename(columns={
    "WDPA_PID": "wpda_id",
    "NAME": "wpda_name",
    "DESIG": "type",
    "OWN_TYPE": "ownership",
    "MANG_AUTH": "operator",
    "STATUS_YR": "status_year"
},     
    inplace=True)

In [88]:
indigenous_can_gdf.rename(columns={
    "Name": "indigenous_land_name",
    "Category": "land_category",
    "Data_Src": "data_source",
    "Data_Date": "status_date"
},     
    inplace=True)

# 1) Matching 

## Point to point matching

In [23]:
from matching_functions import match_facilities, one_to_many_relationships

### MinCan

Some mines in MinCan dataset have very slightly different name (e.g. LaRonde - Zone 5 and LaRonde), or several companies for the same mines.

In [7]:
lt_main_mincan = match_facilities(main_gdf, min_can_gdf,
                     id_col1='main_id', id_col2='mincan_id',
                     name_col1='facility_name', name_col2='namemine'
                     )

In [8]:
matches_main_mincan = one_to_many_relationships(
    lt_main_mincan,
    id_main_col='main_id',
    id_sat_col='mincan_id',
    distance_threshold_m=2000,
    similarity_threshold=80,
    similarity_metric="token_set"  
)

In [9]:
min_can_gdf = min_can_gdf.merge(
    matches_main_mincan,
    on="mincan_id", 
    how="left"
)

In [11]:
# For SQL
min_can_gdf = min_can_gdf[min_can_gdf["main_id"].notna()].copy()

In [14]:
col_to_keep = ["mincan_id", "main_id", "mine_status", "operation_periods"]
min_can_gdf = min_can_gdf[col_to_keep]
min_can_gdf

Unnamed: 0,mincan_id,main_id,mine_status,operation_periods
4,QC-MINCAN-d1e8b6b1,QC-MAIN-30c1828c,Inactive,1927–1976
53,NL-MINCAN-01316355,NL-MAIN-842ba1b4,Inactive,1997–2010
55,QC-MINCAN-494c7715,QC-MAIN-9de9bb0d,Active,1936–1996; 2005–2013; 2021–open
117,BC-MINCAN-8cc41b53,BC-MAIN-6b4800fe,Active,1972–1998; 2004–open
136,QC-MINCAN-8321e85e,QC-MAIN-e51eda66,Active,1988–open
...,...,...,...,...
871,NL-MINCAN-a34e9244,NL-MAIN-c139de6d,Inactive,1971–2019
872,BC-MINCAN-3e33555f,BC-MAIN-b1fe389a,Inactive,1971–2008
897,SK-MINCAN-b20ab2db,SK-MAIN-bb89158f,Inactive,1983–2018
906,ON-MINCAN-7b29d7b3,ON-MAIN-bfbcd72e,Active,1987–1994; 2012–open


### Conflict

In [15]:
lt_main_conflict = match_facilities(main_gdf, conflict_gdf,
                     id_col1='main_id', id_col2='conflict_id',
                     name_col1='facility_name', name_col2='facility_name_ejatlas'
                     )

In [16]:
matches_main_conflict = one_to_many_relationships(
    lt_main_conflict,
    id_main_col='main_id',
    id_sat_col='conflict_id',
    distance_threshold_m=20000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [17]:
conflict_gdf = conflict_gdf.merge(
    matches_main_conflict,
    on="conflict_id",  
    how="left"
)

In [18]:
# For SQL
conflict_gdf = conflict_gdf[conflict_gdf["main_id"].notna()].copy()

In [21]:
conflict_gdf

Unnamed: 0,conflict_id,ej_atlas_id,case_name,facility_name_ejatlas,province,longitude,latitude,geolocation_accuracy,conflict_description,conflict_details,population_affected,conflict_intensity,start_date,end_date,project_status,reference,geometry,main_id
9,NL-CONFLICT-fd990d3e,4135,"Vale Mining in Voisey's Bay, Labrador, Canada",Voisey's Bay,Newfoundland Labrador,-62.103056,56.334722,MEDIUM (Regional level),<p>Voiseyâ€™s Bay nickel mine in Labrador was ...,"Voiseyâ€™s Bay is a â€œ6,000 tonnes-per-day fa...",Rural,"MEDIUM (street protests, visible mobilization)",11/1994,,In operation,"(Mining Data Online, n.d.)\nhttps://miningdata...",POINT (-62.10306 56.33472),NL-MAIN-2d8801d6
10,ON-CONFLICT-928fd562,4145,Port Colborne Class Action Lawsuit Against Val...,Port Colborne,Ontario,-79.25,42.883333,MEDIUM (Regional level),"<p>In Pearson (later Smith) v. Inco, the large...",Raw materials from Valeâ€™s Sudbury mining ope...,Urban,LOW (some local organising),06/2000,04/2012,In operation,,POINT (-79.25000 42.88333),ON-MAIN-2e13aaa0
15,QC-CONFLICT-93cab570,5599,Nouveau Mondeâ€™s Matawinie graphite mine in Q...,Municipality of Saint-Michel-des-Saints in Ma...,Quebec,-73.958848,46.633467,MEDIUM (Regional level),<p>Five kilometers southwest of the municipali...,The project includes 319 mining concessions (c...,Rural,"MEDIUM (street protests, visible mobilization)",01/01/2017,,Under construction,,POINT (-73.95885 46.63347),QC-MAIN-d6748d40


In [22]:
col_to_keep = ["ej_atlas_id",
  "main_id",
  "case_name",
  "start_date",
  "end_date",
  "conflict_description",
  "conflict_details",
  "population_affected",
  "conflict_intensity",
  "project_status"
               ]
conflict_gdf = conflict_gdf[col_to_keep]
conflict_gdf

Unnamed: 0,ej_atlas_id,main_id,case_name,start_date,end_date,conflict_description,conflict_details,population_affected,conflict_intensity,project_status
9,4135,NL-MAIN-2d8801d6,"Vale Mining in Voisey's Bay, Labrador, Canada",11/1994,,<p>Voiseyâ€™s Bay nickel mine in Labrador was ...,"Voiseyâ€™s Bay is a â€œ6,000 tonnes-per-day fa...",Rural,"MEDIUM (street protests, visible mobilization)",In operation
10,4145,ON-MAIN-2e13aaa0,Port Colborne Class Action Lawsuit Against Val...,06/2000,04/2012,"<p>In Pearson (later Smith) v. Inco, the large...",Raw materials from Valeâ€™s Sudbury mining ope...,Urban,LOW (some local organising),In operation
15,5599,QC-MAIN-d6748d40,Nouveau Mondeâ€™s Matawinie graphite mine in Q...,01/01/2017,,<p>Five kilometers southwest of the municipali...,The project includes 319 mining concessions (c...,Rural,"MEDIUM (street protests, visible mobilization)",Under construction


### Tailings

In [24]:
lt_main_tailings = match_facilities(main_gdf, tailings_gdf,
                     id_col1='main_id', id_col2='tailing_id',
                     name_col1='facility_name', name_col2='related_mine'
                     )

In [25]:
matches_main_tailings = one_to_many_relationships(
    lt_main_tailings,
    id_main_col='main_id',
    id_sat_col='tailing_id',
    distance_threshold_m=20000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [26]:
tailings_gdf = tailings_gdf.merge(
    matches_main_tailings,
    on="tailing_id",  
    how="left"
)

In [27]:
# For SQL
tailings_gdf = tailings_gdf[tailings_gdf["main_id"].notna()].copy()

In [28]:
col_to_keep = ["row_id",
  "main_id",
  "year",
  "tailing_id",
  "tsf_name",
  "status",
  "construction_year",
  "raise_type",
  "current_maximum_height",
  "current_tailings_storage",
  "planned_storage_5_years",
  "hazard_categorization",
  "classification_system",
  "geometry"
               ]
tailings_gdf = tailings_gdf[col_to_keep]
tailings_gdf

Unnamed: 0,row_id,main_id,year,tailing_id,tsf_name,status,construction_year,raise_type,current_maximum_height,current_tailings_storage,planned_storage_5_years,hazard_categorization,classification_system,geometry
0,TAIL-12.0-2019-1,NU-MAIN-730aefe3,2019,12.0,North Cell,Active,2009.0,"Downstream, Upstream",31.0,14400000,14400000,Medium,Canadian Dam Association,POINT (-96.04333 65.02156)
1,TAIL-13.0-2019-1,NU-MAIN-730aefe3,2019,13.0,South Cell,Active,2012.0,Downstream,49.0,10420000,10800000,Medium,Canadian Dam Association,POINT (-96.03339 65.01378)
2,TAIL-14.0-2019-1,NU-MAIN-730aefe3,2019,14.0,In pit,Active,2009.0,,,850000,12500000,Low,,POINT (-96.03254 65.00190)
3,TAIL-15.0-2019-1,NU-MAIN-8b0264c9,2019,15.0,Meliadine TSF,Active,2019.0,,5.0,89000,4354000,Medium,Canadian Dam Association,POINT (-92.14034 63.02083)
4,TAIL-16.0-2019-1,QC-MAIN-c0660aec,2019,16.0,South TSF,Active,2007.0,Not Raised,5.0,1664000,2500000,Medium,Canadian Dam Association,POINT (-77.84198 48.05434)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,TAIL-1126.0-2019-1,ON-MAIN-f080c409,2019,1126.0,R1 Tailings,Active,1985.0,Compacted\nOuter Shell,14.0,2900000,4000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
216,TAIL-1127.0-2019-1,ON-MAIN-f080c409,2019,1127.0,R2 Tailings,Active,1985.0,Compacted\nOuter Shell,13.0,18000000,18000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
217,TAIL-1128.0-2019-1,ON-MAIN-f080c409,2019,1128.0,R3 Tailings,Active,1985.0,Compacted\nOuter Shell,27.0,39000000,42000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)
218,TAIL-1129.0-2019-1,ON-MAIN-f080c409,2019,1129.0,R4 Tailings,Active,1990.0,Compacted\nOuter Shell,22.0,59000000,63000000,Extreme,Canadian Dam Association,POINT (-81.14058 46.47142)


### GHG

In [11]:
lt_main_ghg = match_facilities(main_gdf, ghg_gdf,
                     id_col1='main_id', id_col2='ghg_id',
                     name_col1='facility_name', name_col2='facility_name_ghg'
                     )

In [12]:
matches_main_ghg = one_to_many_relationships(
    lt_main_ghg,
    id_main_col='main_id',
    id_sat_col='ghg_id',
    distance_threshold_m=10000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [13]:
ghg_gdf = ghg_gdf.merge(
    matches_main_ghg,
    on="ghg_id",  
    how="left"
)

In [14]:
# For SQL
ghg_gdf = ghg_gdf[ghg_gdf["main_id"].notna()].copy()

In [15]:
col_to_keep = ["row_id",
  "ghg_id",             
  "main_id",
  "year",
  "sector",
  "value",
  "unit"
               ]
ghg_gdf = ghg_gdf[col_to_keep]
ghg_gdf

Unnamed: 0,row_id,ghg_id,main_id,year,sector,value,unit
1,GHG-10101-2022-1,10101,NT-MAIN-6c1d6ee6,2022,Diamond Mining,164.88,kilotonnes of carbon dioxide equivalents (kt C...
4,GHG-10188-2022-1,10188,QC-MAIN-084bd95c,2022,Iron Ore Mining,293.10,kilotonnes of carbon dioxide equivalents (kt C...
7,GHG-10240-2022-1,10240,QC-MAIN-a97821c0,2022,Nickel-Copper Ore Mining,129.26,kilotonnes of carbon dioxide equivalents (kt C...
8,GHG-10299-2022-1,10299,QC-MAIN-e25eed27,2022,Iron Ore Mining,909.89,kilotonnes of carbon dioxide equivalents (kt C...
9,GHG-10350-2022-1,10350,NL-MAIN-b64bae7a,2022,Iron Ore Mining,110.93,kilotonnes of carbon dioxide equivalents (kt C...
...,...,...,...,...,...,...,...
158,GHG-11042-2022-1,11042,ON-MAIN-b466361f,2022,"Aluminum Rolling, Drawing, Extruding and Alloying",14.92,kilotonnes of carbon dioxide equivalents (kt C...
169,GHG-11723-2022-1,11723,ON-MAIN-2e13aaa0,2022,Non-Ferrous Metal (except Aluminum) Smelting a...,14.17,kilotonnes of carbon dioxide equivalents (kt C...
170,GHG-11730-2022-1,11730,QC-MAIN-92628f16,2022,Primary Production of Alumina and Aluminum,380.82,kilotonnes of carbon dioxide equivalents (kt C...
178,GHG-12298-2022-1,12298,ON-MAIN-63b394c3,2022,Non-Ferrous Metal (except Aluminum) Smelting a...,330.67,kilotonnes of carbon dioxide equivalents (kt C...


### Pollution

In [20]:
pollutant_gdf

Unnamed: 0,row_id,year,npri_id,facility_name_npri,company_name_npri,facility_type,longitude,latitude,terrestrial_ecozone,watershed,substance_name_npri,substance_name_ecoinvent,substance_unit,emission_type,emission_subtype,value,source_df,geometry
0,npri-1568-2023-1,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Ammonia (total),Ammonia,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636,npri_metal_mining_df,POINT (-80.08734 48.13032)
1,npri-1568-2023-2,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Arsenic (and its compounds),Arsenic,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116,npri_metal_mining_df,POINT (-80.08734 48.13032)
2,npri-1568-2023-3,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Carbon monoxide,"Carbon monoxide, fossil",tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208,npri_metal_mining_df,POINT (-80.08734 48.13032)
3,npri-1568-2023-4,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Chromium (and its compounds),Chromium,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105,npri_metal_mining_df,POINT (-80.08734 48.13032)
4,npri-1568-2023-5,2023,1568,Macassa Mine,Agnico Eagle Mines,mining,-80.08734,48.13032,Boreal Shield,St. Lawrence Drainage Area,Cobalt (and its compounds),Cobalt,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18,npri_metal_mining_df,POINT (-80.08734 48.13032)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24125,npri-5697-2023-6,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",25.1008,npri_metal_manufacturing_df,POINT (-79.49600 43.80800)
24126,npri-5698-2023-6,2023,5698,Bowes Manufacturing,,manufacturing,-79.49480,43.80360,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",23.418,npri_metal_manufacturing_df,POINT (-79.49480 43.80360)
24127,npri-34697-2023-2,2023,34697,541 Bowes Warehouse,,manufacturing,-79.49707,43.81726,Mixedwood Plain,St. Lawrence Drainage Area,Speciated VOC - Hydrotreated heavy naphtha,,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",16.543746,npri_metal_manufacturing_df,POINT (-79.49707 43.81726)
24128,npri-5697-2023-7,2023,5697,Rayette Manufacturing,,manufacturing,-79.49600,43.80800,Mixedwood Plain,St. Lawrence Drainage Area,Volatile Organic Compounds (Total),,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",12.5823,npri_metal_manufacturing_df,POINT (-79.49600 43.80800)


In [21]:
lt_main_pollution = match_facilities(main_gdf, pollutant_gdf,
                     id_col1='main_id', id_col2='npri_id',
                     name_col1='facility_name', name_col2='facility_name_npri'
                     )

In [22]:
matches_main_pollution = one_to_many_relationships(
    lt_main_pollution,
    id_main_col='main_id',
    id_sat_col='npri_id',
    distance_threshold_m=10000,
    similarity_threshold=80,
    similarity_metric="token_set"  # "partial" or "token_set"
)

In [24]:
pollutant_gdf = pollutant_gdf.merge(
    matches_main_pollution,
    on="npri_id",  
    how="left"
)

In [25]:
# For SQL
pollutant_gdf = pollutant_gdf[pollutant_gdf["main_id"].notna()].copy()

In [26]:
col_to_keep = ["row_id",
  "npri_id",
  "main_id",
  "year",
  "terrestrial_ecozone",
  "watershed",
  "substance_name_npri",
  "substance_name_ecoinvent",
  "substance_unit",
  "emission_type",
  "emission_subtype",
  "value"
 ]
pollutant_gdf = pollutant_gdf[col_to_keep]
pollutant_gdf

Unnamed: 0,row_id,npri_id,main_id,year,terrestrial_ecozone,watershed,substance_name_npri,substance_name_ecoinvent,substance_unit,emission_type,emission_subtype,value
0,npri-1568-2023-1,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Ammonia (total),Ammonia,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.636
1,npri-1568-2023-2,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Arsenic (and its compounds),Arsenic,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.116
2,npri-1568-2023-3,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Carbon monoxide,"Carbon monoxide, fossil",tonnes,Air Emissions / Émissions à l'air,Stack Emissions,76.208
3,npri-1568-2023-4,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Chromium (and its compounds),Chromium,tonnes,Air Emissions / Émissions à l'air,Stack Emissions,0.00105
4,npri-1568-2023-5,1568,ON-MAIN-1f126a43,2023,Boreal Shield,St. Lawrence Drainage Area,Cobalt (and its compounds),Cobalt,kg,Air Emissions / Émissions à l'air,Stack Emissions,0.18
...,...,...,...,...,...,...,...,...,...,...,...,...
24116,npri-28761-2023-117,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Sulphur dioxide,Sulfur dioxide,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",4.405
24117,npri-28761-2023-118,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Sulphuric acid,Sulfuric acid,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",0.2475
24118,npri-28761-2023-119,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Total particulate matter,No match found,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",29.277299999999997
24119,npri-28761-2023-120,28761,NL-MAIN-d9036091,2023,Boreal Shield,St. Lawrence Drainage Area,Total reduced sulphur (expressed as hydrogen s...,No match found,tonnes,Grand Total,"Total Releases, Disposals and Transfers for Re...",0.485


## Point to polygon matching

### Protected lands datasets 

In [126]:
from matching_functions import associate_facilities_near_polygons

ImportError: cannot import name 'associate_facilities_near_polygons' from 'matching_functions' (C:\Users\mp_ma\OneDrive - polymtlus\Desktop\POST_DOC\Project\canada_metal_sustainability_db\matching_functions.py)

In [127]:
lt_wpda = associate_facilities_near_polygons(
    main_gdf,
    wpda_gdf_polygons,
    facility_id_col="main_id",
    polygon_id_col="wpda_id",
    buffer_km=50,
    crs="EPSG:3978"
)

In [128]:
lt_wpda

Unnamed: 0,main_id,wpda_id,distance_km,relation_type
0,QC-MAIN-5801b453,555567198,39.435,within_buffer
1,QC-MAIN-5801b453,555593497,24.661,within_buffer
2,QC-MAIN-5801b453,555568527,45.434,within_buffer
3,QC-MAIN-5801b453,555643133,37.808,within_buffer
4,QC-MAIN-5801b453,555643114,15.683,within_buffer
...,...,...,...,...
11201,BC-MAIN-891d3cb5,555516318,46.587,within_buffer
11202,BC-MAIN-891d3cb5,167500,46.769,within_buffer
11203,BC-MAIN-891d3cb5,18409,41.464,within_buffer
11204,BC-MAIN-891d3cb5,555651745,1.421,within_buffer


In [129]:
wpda_gdf_polygons = wpda_gdf_polygons.merge(lt_wpda, on="wpda_id", how="left")

In [133]:
wpda_gdf_polygons

Unnamed: 0,wpda_id,wpda_name,type,ownership,operator,status_year,geometry,main_id,distance_km,relation_type
0,611_A,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,"POLYGON ((-112.87253 59.99877, -111.53006 59.7...",,,
1,611_B,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,"POLYGON ((-112.87253 59.99877, -115.58450 59.9...",NT-MAIN-b99d4cb2,27.397,within_buffer
2,612,Kluane National Park Reserve Of Canada,National Park,State,Parks Canada Agency,1972,"POLYGON ((-141.00187 60.30634, -137.43466 60.7...",,,
3,613_B,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,"MULTIPOLYGON (((-64.62292 67.13706, -63.92605 ...",,,
4,613_A,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,"MULTIPOLYGON (((-68.23194 67.76712, -67.56846 ...",,,
...,...,...,...,...,...,...,...,...,...,...
20582,555785736,Rideau Canal National Historic Site,National Historic Site,State,Parks Canada Agency,2023,"MULTIPOLYGON (((-76.24341 44.90190, -76.19385 ...",ON-MAIN-dca9cce7,0.427,within_buffer
20583,555785736,Rideau Canal National Historic Site,National Historic Site,State,Parks Canada Agency,2023,"MULTIPOLYGON (((-76.24341 44.90190, -76.19385 ...",ON-MAIN-4a6187a4,2.245,within_buffer
20584,555785737,Capital Recreation Area,OECM,State,National Capital Commission (NCC),2023,"POLYGON ((-75.60920 45.33843, -75.61502 45.332...",ON-MAIN-8056a739,7.798,within_buffer
20585,555785737,Capital Recreation Area,OECM,State,National Capital Commission (NCC),2023,"POLYGON ((-75.60920 45.33843, -75.61502 45.332...",ON-MAIN-470b504b,5.739,within_buffer


In [130]:
lt_indigenous = associate_facilities_near_polygons(
    main_gdf,
    indigenous_can_gdf,
    facility_id_col="main_id",
    polygon_id_col="indigenous_land_id",
    buffer_km=50,
    crs="EPSG:3978"
)

In [131]:
lt_indigenous

Unnamed: 0,main_id,indigenous_land_id,distance_km,relation_type
0,QC-MAIN-5801b453,ZZ-IND-a8bafff3,32.971,within_buffer
1,QC-MAIN-5801b453,ZZ-IND-a8bafff3,32.971,within_buffer
2,QC-MAIN-5801b453,ZZ-IND-a8bafff3,32.971,within_buffer
3,QC-MAIN-5801b453,ZZ-IND-a8bafff3,32.971,within_buffer
4,QC-MAIN-5801b453,ZZ-IND-a8bafff3,32.971,within_buffer
...,...,...,...,...
3139,SK-MAIN-bae48393,ZZ-IND-35a209df,49.269,within_buffer
3140,SK-MAIN-bae48393,ZZ-IND-35a209df,49.269,within_buffer
3141,SK-MAIN-bae48393,ZZ-IND-35a209df,49.269,within_buffer
3142,BC-MAIN-891d3cb5,ZZ-IND-00b88493,42.205,within_buffer


In [132]:
indigenous_can_gdf = indigenous_can_gdf.merge(lt_indigenous, on="indigenous_land_id", how="left")

In [124]:
indigenous_can_gdf

Unnamed: 0,indigenous_land_id,indigenous_land_name,land_category,data_source,status_date,longitude,latitude,geometry,main_id,distance_km,relation_type
0,ZZ-IND-65515118,Nicola Lake 1,Indian Reserve,"Natural Resources Canada, Earth Sciences Secto...",2012-09-05,-1.340968e+07,6.482531e+06,MULTIPOLYGON Z (((-13412001.078 6484020.720 0....,BC-MAIN-4724f4ba,37.643,within_buffer
1,ZZ-IND-65515118,Nicola Lake 1,Indian Reserve,"Natural Resources Canada, Earth Sciences Secto...",2012-09-05,-1.340968e+07,6.482531e+06,MULTIPOLYGON Z (((-13412001.078 6484020.720 0....,BC-MAIN-aa76f6f2,47.059,within_buffer
2,ZZ-IND-65515118,Nicola Lake 1,Indian Reserve,"Natural Resources Canada, Earth Sciences Secto...",2012-09-05,-1.340968e+07,6.482531e+06,MULTIPOLYGON Z (((-13412001.078 6484020.720 0....,BC-MAIN-bf503b6b,49.482,within_buffer
3,ZZ-IND-822586b0,Piikani Reserve,Indian Reserve,"Natural Resources Canada, Earth Sciences Secto...",2012-09-05,-1.265128e+07,6.375071e+06,MULTIPOLYGON Z (((-12638644.679 6375535.364 0....,,,
4,ZZ-IND-cf5cc3a8,Suwannee Lake Indian Reserve 09924,Indian Reserve,"Natural Resources Canada, Earth Sciences Secto...",2013-05-30,-1.114723e+07,7.586468e+06,"POLYGON Z ((-11149110.724 7581557.313 0.000, -...",,,
...,...,...,...,...,...,...,...,...,...,...,...
3678,ZZ-IND-ace3fb37,Maa-nulth,First Nations Treaty Lands,Ministry of Aboriginal Relations and Reconcili...,2016-06-09,-1.393595e+07,6.271393e+06,MULTIPOLYGON Z (((-13935109.163 6271300.023 0....,,,
3679,ZZ-IND-07b0f9d4,Maa-nulth,First Nations Treaty Lands,Ministry of Aboriginal Relations and Reconcili...,2016-06-09,-1.391666e+07,6.302849e+06,MULTIPOLYGON Z (((-13915849.512 6301673.475 0....,,,
3680,ZZ-IND-2cec7703,Maa-nulth,First Nations Treaty Lands,Ministry of Aboriginal Relations and Reconcili...,2016-06-09,-1.390820e+07,6.289657e+06,MULTIPOLYGON Z (((-13900991.756 6286092.967 0....,,,
3681,ZZ-IND-a914af3f,Maa-nulth,First Nations Treaty Lands,Ministry of Aboriginal Relations and Reconcili...,2016-06-09,-1.398181e+07,6.269168e+06,MULTIPOLYGON Z (((-13981167.298 6267857.450 0....,,,


In [123]:
wpda_gdf_polygons

Unnamed: 0,wpda_id,wpda_name,type,ownership,operator,status_year,geometry
0,611_A,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,"POLYGON ((-112.87253 59.99877, -111.53006 59.7..."
1,611_B,Wood Buffalo National Park Of Canada,National Park,State,Parks Canada Agency,1922,"POLYGON ((-112.87253 59.99877, -115.58450 59.9..."
2,612,Kluane National Park Reserve Of Canada,National Park,State,Parks Canada Agency,1972,"POLYGON ((-141.00187 60.30634, -137.43466 60.7..."
3,613_B,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,"MULTIPOLYGON (((-64.62292 67.13706, -63.92605 ..."
4,613_A,Auyuittuq National Park Of Canada,National Park,State,Parks Canada Agency,1972,"MULTIPOLYGON (((-68.23194 67.76712, -67.56846 ..."
...,...,...,...,...,...,...,...
13463,555785733,Grosse Île and the Irish Memorial National His...,Lieu historique national,State,Parks Canada Agency,2023,"MULTIPOLYGON (((-70.67843 47.01639, -70.67803 ..."
13464,555785734,Navy Island National Historic Site,National Historic Site,State,Parks Canada Agency,2023,"POLYGON ((-79.01037 43.04900, -79.00871 43.060..."
13465,555785735,Fort George National Historic Site (Paradise G...,National Historic Site,State,Parks Canada Agency,2023,"POLYGON ((-79.06782 43.24368, -79.05901 43.248..."
13466,555785736,Rideau Canal National Historic Site,National Historic Site,State,Parks Canada Agency,2023,"MULTIPOLYGON (((-76.24341 44.90190, -76.19385 ..."


### Unify datasets

In [114]:
protected_mapping = {
    "wpda_id": "protected_area_id",
    "wpda_name": "land_name",
    "type": "land_type",
    "status_year": "date"
}

indigenous_mapping = {
    "indigenous_land_id": "protected_area_id",
    "indigenous_land_name": "land_name",
    "land_category": "land_type",
    "status_date": "date"
}


In [125]:
protected_clean = wpda_gdf_polygons.rename(columns=protected_mapping)[["protected_area_id", "land_name", "land_type", "date", "geometry", "main_id", "distance_km"]].copy()
indigenous_clean = indigenous_can_gdf.rename(columns=indigenous_mapping)[["protected_area_id", "land_name", "land_type", "date", "geometry", "main_id", "distance_km"]].copy()

KeyError: "['main_id', 'distance_km'] not in index"

In [120]:
target_crs = "EPSG:4326"
protected_clean = protected_clean.to_crs(target_crs)
indigenous_clean = indigenous_clean.to_crs(target_crs)

In [121]:
combined_land_gdf = pd.concat([protected_clean, indigenous_clean], ignore_index=True)
combined_land_gdf = gpd.GeoDataFrame(combined_land_gdf, geometry="geometry", crs=wpda_gdf_polygons.crs)

In [122]:
combined_land_gdf

Unnamed: 0,protected_area_id,land_name,land_type,date,geometry
0,611_A,Wood Buffalo National Park Of Canada,National Park,1922,"POLYGON ((-112.87253 59.99877, -111.53006 59.7..."
1,611_B,Wood Buffalo National Park Of Canada,National Park,1922,"POLYGON ((-112.87253 59.99877, -115.58450 59.9..."
2,612,Kluane National Park Reserve Of Canada,National Park,1972,"POLYGON ((-141.00187 60.30634, -137.43466 60.7..."
3,613_B,Auyuittuq National Park Of Canada,National Park,1972,"MULTIPOLYGON (((-64.62292 67.13706, -63.92605 ..."
4,613_A,Auyuittuq National Park Of Canada,National Park,1972,"MULTIPOLYGON (((-68.23194 67.76712, -67.56846 ..."
...,...,...,...,...,...
17146,ZZ-IND-ace3fb37,Maa-nulth,First Nations Treaty Lands,2016-06-09,"MULTIPOLYGON Z (((-125.18122 48.97901 0.00003,..."
17147,ZZ-IND-07b0f9d4,Maa-nulth,First Nations Treaty Lands,2016-06-09,"MULTIPOLYGON Z (((-125.00820 49.15777 0.00003,..."
17148,ZZ-IND-2cec7703,Maa-nulth,First Nations Treaty Lands,2016-06-09,"MULTIPOLYGON Z (((-124.87473 49.06615 0.00003,..."
17149,ZZ-IND-a914af3f,Maa-nulth,First Nations Treaty Lands,2016-06-09,"MULTIPOLYGON Z (((-125.59496 48.95871 0.00003,..."


### Land occupation footprint datasets

In [None]:
from matching_functions import assign_polygons_to_points

In [69]:
assigned_df = assign_polygons(
    main_gdf,
    tailings_gdf,
    tang_canada_gdf,
    max_dist_km=10
)


In [70]:
assigned_df

Unnamed: 0,main_id,tailing_id,tang_id,distance_km,relation_type
0,NS-MAIN-c9c77a59,,45954,0.000000,one-to-many
1,NS-MAIN-c9c77a59,,45955,1.186584,one-to-many
2,NL-MAIN-d9036091,,7631,0.811781,one-to-many
3,NL-MAIN-d9036091,,7641,4.647047,one-to-many
4,NL-MAIN-d9036091,,7633,0.000000,many-to-one
...,...,...,...,...,...
789,NT-MAIN-6c1d6ee6,,26078,2.857250,one-to-many
790,NT-MAIN-6c1d6ee6,,43740,0.833606,one-to-many
791,NT-MAIN-6c1d6ee6,,3479,0.000000,one-to-many
792,NT-MAIN-6c1d6ee6,,50385,2.150861,one-to-many


In [71]:
assigned_df['relation_type'].value_counts()

relation_type
one-to-many    707
many-to-one     56
one-to-one      31
Name: count, dtype: int64

In [72]:
polygon_gdf = tang_canada_gdf.merge(assigned_df, on="tang_id", how="left")

In [78]:
polygon_gdf

Unnamed: 0,tang_id,area_km2,geometry,main_id,tailing_id,distance_km,relation_type
0,6383,0.026130,"POLYGON Z ((1795479.979 -83564.531 0.000, 1795...",,,,
1,6388,0.671399,"POLYGON Z ((1796123.448 -80625.526 0.000, 1796...",,,,
2,6390,0.050909,"POLYGON Z ((1796791.634 -80270.697 0.000, 1796...",,,,
3,45954,2.622800,"POLYGON Z ((2466485.124 185586.547 0.000, 2466...",NS-MAIN-c9c77a59,,0.000000,one-to-many
4,45955,0.441869,"POLYGON Z ((2465809.682 185862.507 0.000, 2465...",NS-MAIN-c9c77a59,,1.186584,one-to-many
...,...,...,...,...,...,...,...
1915,3399,0.541024,"POLYGON Z ((-717210.659 1915423.857 0.000, -71...",,,,
1916,3401,1.053173,"POLYGON Z ((-715648.337 1916385.463 0.000, -71...",,,,
1917,26364,0.079894,"POLYGON Z ((-717332.782 1916999.286 0.000, -71...",,,,
1918,43739,1.519488,"POLYGON Z ((-713032.430 1919630.086 0.000, -71...",,,,


# 2) Maps 

In [None]:
from visualization_functions import plot_polygons_assginment

In [75]:
plot_polygons_assginment(
    main_gdf,
    tailings_gdf,
    polygon_gdf,  # Make sure polygon_gdf already has 'main_id' and 'tailing_id'
    output_html="facility_tailing_assignment_map.html"
)


✅ Map saved to facility_tailing_assignment_map.html


# 3) Exports 

## Simple format

In [31]:
from matching_functions import export_sqlite_db

In [32]:
export_sqlite_db(
    "data/Tables/metallican.sqlite", 
    tables_dict={
    "main": main_gdf,
    "tailings": tailings_gdf,
    "mincan": min_can_gdf,
    "conflict": conflict_gdf,   
    "ghg": ghg_gdf,
    "pollution": pollutant_gdf,
    "climate_categories": climate_category_gdf,
    "peatland": peatland_gdf,
    "weather": weather_gdf,
    "population": population_gdf,
    "natural_potential_vegetation": npv_gdf,
    "land_cover": land_cover_gdf,
    "water_risk": water_risk_gdf,
    "land_occupation": tang_canada_gdf,
    "indigenous_land": indigenous_can_gdf,
    "protected_land": wpda_gdf_polygons,
    "lt_protected_land": matching_main_wpda,
    "lt_indigenous_land": matching_main_indigenous,
    "lt_clusters": lt_land_occupation
},
    keep_geometry_tables=['main', 'tailings'], 
    csv_dir='data/Tables/csv'
)

✅ Exported 'main' → SQLite + CSV
✅ Exported 'tailings' → SQLite + CSV


  df_export["geometry"] = df_export.geometry.to_wkt()
  df_export["geometry"] = df_export.geometry.to_wkt()


✅ Exported 'mincan' → SQLite + CSV
✅ Exported 'conflict' → SQLite + CSV
✅ Exported 'ghg' → SQLite + CSV
✅ Exported 'pollution' → SQLite + CSV
✅ Exported 'climate_categories' → SQLite + CSV
✅ Exported 'peatland' → SQLite + CSV
✅ Exported 'weather' → SQLite + CSV
✅ Exported 'population' → SQLite + CSV
✅ Exported 'natural_potential_vegetation' → SQLite + CSV
✅ Exported 'land_cover' → SQLite + CSV
✅ Exported 'water_risk' → SQLite + CSV
✅ Exported 'land_occupation' → SQLite + CSV
✅ Exported 'indigenous_land' → SQLite + CSV
✅ Exported 'protected_land' → SQLite + CSV
✅ Exported 'lt_protected_land' → SQLite + CSV
✅ Exported 'lt_indigenous_land' → SQLite + CSV
✅ Exported 'lt_clusters' → SQLite + CSV
✅ All exports completed to SQLite and CSVs in: data/Tables/csv


## More advanced setting

In [None]:
from matching_functions import create_and_populate_database

In [37]:
tables_dict = {
    "main": main_gdf,
    "tailings": tailings_gdf,
    "mincan": min_can_gdf,
    "conflict": conflict_gdf,
    "ghg": ghg_gdf,
    "pollution": pollutant_gdf,
    "climate_categories": climate_category_gdf,
    "weather": weather_gdf,
    "peatland": peatland_gdf,
    "population": population_gdf,
    "water_risk": water_risk_gdf,
    "land_cover": land_cover_gdf,
    "natural_potential_vegetation": npv_gdf,
    "indigenous_land": indigenous_can_gdf,
    "protected_land": wpda_gdf_polygons,
    "land_occupation": tang_canada_gdf
}

# Run the function
create_and_populate_database(
    db_path="data/Tables/metallican.sqlite",
    schema_path="schema.sql",
    tables_dict=tables_dict,
    keep_geometry_tables=["main", "tailings"]
)


ℹ️ No old database found at 'data/Tables/metallican.sqlite'
✅ Empty database structure created at 'data/Tables/metallican.sqlite'
🔄 Converting geometries...


  df["geometry"] = df["geometry"].to_wkt()


✅ Geometries handled (kept only where needed)
✅ New connection opened
✅ Inserted 305 rows into 'main'
✅ Inserted 227 rows into 'tailings'
✅ Inserted 939 rows into 'mincan'
✅ Inserted 20 rows into 'conflict'
✅ Inserted 184 rows into 'ghg'
✅ Inserted 24130 rows into 'pollution'
✅ Inserted 2745 rows into 'climate_categories'
✅ Inserted 39040 rows into 'weather'
✅ Inserted 305 rows into 'peatland'
✅ Inserted 1220 rows into 'population'
✅ Inserted 9760 rows into 'water_risk'
✅ Inserted 305 rows into 'land_cover'
✅ Inserted 305 rows into 'natural_potential_vegetation'
✅ Inserted 3258 rows into 'indigenous_land'
✅ Inserted 13468 rows into 'protected_land'
✅ Inserted 1885 rows into 'land_occupation'
✅ No foreign key problems found!
✅ Connection closed properly
