In [1]:
import pandas as pd
import geopandas as gpd

# NRCan datasets 

In [None]:
# Load the three layers of the NRCan 900A (73rd edition) shapefile set as GeoDataFrames.
# Forward slashes are used as separators so the paths resolve on Windows, Linux
# and macOS alike (backslash separators only work on Windows), matching the
# other data paths used in this notebook.
nrcan_shape_dir = (
    'data/NRCan/Principal Mineral Areas, Producing Mines, and Oil and Gas Fields (900A)'
    '/900A_73rd_2023_shape/900A_73rd_shape'
)

nrcan_mining_df = gpd.read_file(f'{nrcan_shape_dir}/900A_73rd_ProducingMines.shp')

nrcan_manufacturing_df = gpd.read_file(f'{nrcan_shape_dir}/900A_73rd_MetalWorks.shp')

nrcan_oilgas_df = gpd.read_file(f'{nrcan_shape_dir}/900A_73rd_OilAndGas.shp')

In [None]:
from db_creation_data_cleaning import plot_commodity_distribution

In [None]:
# Plot the commodity distribution for the producing-mines and metal-works layers,
# using the description column "COM_DESC_E" and the group column "COM_GROU_E".
# The last argument is the label passed to the project helper for each layer.
plot_commodity_distribution(nrcan_mining_df, "COM_DESC_E", "COM_GROU_E", "Producing Mines")
plot_commodity_distribution(nrcan_manufacturing_df, "COM_DESC_E", "COM_GROU_E", "Metal Works")

In [None]:
# Export the three NRCan layers to Excel workbooks inside the NRCan data folder.
# Forward slashes keep the paths portable across operating systems
# (backslash separators only work on Windows).
nrcan_export_dir = 'data/NRCan/Principal Mineral Areas, Producing Mines, and Oil and Gas Fields (900A)'

nrcan_mining_df.to_excel(f'{nrcan_export_dir}/nrcan_mining_df.xlsx')
nrcan_manufacturing_df.to_excel(f'{nrcan_export_dir}/nrcan_manufacturing_df.xlsx')
nrcan_oilgas_df.to_excel(f'{nrcan_export_dir}/nrcan_oilgas_df.xlsx')

# MinCan dataset

In [2]:
# Load the MinCan workbook (past and present productive mines of Canada);
# only the 'Data' sheet is read.
min_can_df = pd.read_excel(
    r'data/Sources/MinCan _Past and Present Productive Mines of Canada, 1950-2022_March2024.xlsx', 
    sheet_name='Data')

In [3]:
from db_creation_data_cleaning import is_mine_active

In [4]:
# Derive the 'mine_status' column by evaluating is_mine_active on every row.
min_can_df["mine_status"] = min_can_df.apply(is_mine_active, axis="columns")

In [6]:
# Drop the eight per-commodity columns (commodity1 ... commodity8).
# NOTE(review): presumably redundant with the 'commodityall' column - confirm.
per_commodity_columns = [f'commodity{position}' for position in range(1, 9)]
min_can_df = min_can_df.drop(columns=per_commodity_columns)

In [7]:
# Display the MinCan table after adding 'mine_status' and dropping the per-commodity columns.
min_can_df

Unnamed: 0,company1,company2,company3,company4,company5,company6,namemine,town,province,latitude,...,close3,commodityall,information,source1,source2,source3,link1,link2,link3,mine_status
0,Quebec Copper Corp. Ltd,,,,,,,,Quebec,45.263786,...,,"Copper, Gold, Silver",,0,1,0,,,,Inactive
1,Noland Mines Ltd.,,,,,,Spruce Creek,Atlin,British Columbia,59.382720,...,,Gold,,0,1,1,https://www.mindat.org/feature-6088912.html,https://www.mindat.org/loc-257276.html,https://mrdata.usgs.gov/ardf/show-ardf.php?ard...,Inactive
2,Falconbridge Nickel Mines Ltd.,,,,,,Wesfrob (Tasu),Moresby Island,British Columbia,52.758340,...,,"Iron, Copper",,0,1,1,https://www.mindat.org/loc-26169.html,,,Inactive
3,Greenwood Coal Co. Ltd.,,,,,,,,Nova Scotia,45.550219,...,,Coal,The currently entered coordinates are for the ...,0,1,1,https://www.mindat.org/loc-290980.html,,,Inactive
4,Noranda Mines Ltd,Normetal Mining Corporation Ltd.,Falconbridge Ltd.,Xstrata,,,Horne,Noranda,Quebec,48.254270,...,,"Copper, Gold, Silver, Selenium, Tellurium, Pyr...",Alternative names: Horne Mine (1975) The mine ...,0,1,1,https://www.mindat.org/loc-609.html,,,Inactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
942,"Imperial Metals Corporation, Mitsubishi Materi...",Teck Resources Limited,Huckleberry Mines Ltd.,,,,Huckleberry,Kemano,British Columbia,53.681110,...,,"Copper, Molybdenum, Silver, Gold",,1,0,0,,,,Inactive
943,BHP Diamonds Inc.,Dia Met Minerals Ltd.,Charles Fipke and Stewart Blusson,Arctic Canadian Diamond Company,,,Panda (Ekati Project),Lac de Gras,Northwest Territories,64.716667,...,,Diamond,Brief closure between 2020 and 2021,1,0,0,,,,Active
944,Magnola Magnesium Inc.,Society generale de financement du Quebec,,,,,Ardorbec,Danville,Quebec,45.748462,...,,Magnesium,,1,0,0,,,,Active
945,Canadian Johns-Manville Co. Ltd.,Asbestos Inc.,,,,,Jeffrey,Asbestos (Val-des-Sources),Quebec,45.775100,...,,Chrysotile,"FR: En 1992, la mine est l'objet d'une importa...",0,0,1,https://miningwatch.ca/sites/default/files/asb...,https://niche-canada.org/2015/11/19/workers-as...,https://www.researchgate.net/publication/28664...,Inactive


In [None]:
from visualization_functions import plot_mine_status_mincan

In [None]:
# Plot the MinCan data with the project's plot_mine_status_mincan helper.
plot_mine_status_mincan(min_can_df)

In [8]:
# Save the cleaned MinCan table as CSV, without writing the DataFrame index.
min_can_df.to_csv(r'data/cleaned_data/min_can_cleaned_df.csv', index=False)

# National Pollutant Release Inventory (NPRI)

In [None]:
from db_creation_data_cleaning import clean_npri

In [None]:
# Load the raw NPRI workbook through the project's clean_npri helper.
npri_df = clean_npri("data/Emissions_accounts/Raw_data/NPRI/NPRI-INRP_DataDonnées_2023.xlsx")

In [None]:
# Display the table returned by clean_npri.
npri_df

In [None]:
# Build the NPRI -> ecoinvent (EI) mapping by going through the IMPACT World+ (IW+) names.
# Both existing concordance sheets live in the same workbook, so they are
# requested together and then pulled out of the returned dict by sheet name.
concordance_sheets = pd.read_excel(
    r'data/Concordances/openIO_IW_EI_concordance.xlsx',
    sheet_name=['NPRI_to_IW21', 'EI_to_IW+'])
mapping_npri_iw = concordance_sheets['NPRI_to_IW21']
mapping_ei_iw = concordance_sheets['EI_to_IW+']

In [None]:
# Give the IMPACT World+ name column the same label ('iw_name') in both
# concordance tables so they can be matched on it.
mapping_npri_iw = mapping_npri_iw.rename(columns={'IMPACT World+ flows': 'iw_name'})
mapping_ei_iw = mapping_ei_iw.rename(columns={'iw name': 'iw_name'})

In [None]:
# Add the 'EI' column to mapping_npri_iw by matching its 'iw_name' against
# mapping_ei_iw. When an IW+ name occurs several times in mapping_ei_iw, the
# 'ecoinvent name' of its first occurrence is used; IW+ names that do not occur
# in mapping_ei_iw at all get the placeholder 'No match found'.
# The lookup table is built once instead of re-filtering mapping_ei_iw for
# every row of mapping_npri_iw.
ei_name_by_iw = (
    mapping_ei_iw
    .dropna(subset=['iw_name'])  # a missing name must never count as a match
    .drop_duplicates(subset='iw_name', keep='first')
    .set_index('iw_name')['ecoinvent name']
)
has_ei_match = mapping_npri_iw['iw_name'].isin(ei_name_by_iw.index)
mapping_npri_iw['EI'] = (
    mapping_npri_iw['iw_name']
    .map(ei_name_by_iw)
    # Only unmatched rows receive the placeholder; a matched row whose
    # 'ecoinvent name' is itself empty keeps that empty value.
    .where(has_ei_match, 'No match found')
)


In [None]:
# Display the NPRI -> IW+ concordance table to inspect it.
mapping_npri_iw

In [None]:
# Relabel the 'OpenIO flows' column with the NPRI substance column name so the
# concordance can be merged onto the NPRI data using that key.
mapping_npri_iw = mapping_npri_iw.rename(
    columns={'OpenIO flows': 'Substance Name (English)'}
)

In [None]:
# Attach the ecoinvent name ('EI') to every NPRI row via a left join on the
# substance name, so NPRI rows without a mapping are kept.
# NOTE(review): if a substance name appears more than once in mapping_npri_iw,
# the join repeats the matching NPRI rows - confirm the key is unique there.
substance_to_ei = mapping_npri_iw[['Substance Name (English)', 'EI']]
npri_df = pd.merge(
    npri_df,
    substance_to_ei,
    how='left',
    on='Substance Name (English)',
)

In [None]:
# Drop the French-language label columns; their '(English)' counterparts are
# the ones used further down in this notebook.
french_label_columns = [
    'Terrestrial Ecozone (French)',
    'Watershed (French)',
    'NAICS 4 Sector Name (French)',
    'NAICS 6 Sector Name (French)',
    'Substance Name (French)',
]
npri_df = npri_df.drop(columns=french_label_columns)

In [None]:
# Split the NPRI table into two sector subsets:
#   - rows whose NAICS 4 sector name is exactly "Metal ore mining"
#   - rows whose NAICS 4 code, read as text, begins with "331"
is_metal_ore_mining = npri_df["NAICS 4 Sector Name (English)"].eq("Metal ore mining")
naics_starts_with_331 = npri_df["NAICS 4 Code"].astype(str).str.startswith("331")

npri_metal_mining_df = npri_df.loc[is_metal_ore_mining]
npri_metal_manufacturing_df = npri_df.loc[naics_starts_with_331]

In [None]:
# Write both NPRI sector subsets to the cleaned-data folder, without the index.
for npri_subset, csv_path in (
    (npri_metal_mining_df, r'data/cleaned_data/npri_metal_mining_df.csv'),
    (npri_metal_manufacturing_df, r'data/cleaned_data/npri_metal_manufacturing_df.csv'),
):
    npri_subset.to_csv(csv_path, index=False)

# GHG from large facilities datasets

In [None]:
# Load the raw "Greenhouse gas emissions from large facilities" CSV.
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
ghg_facility_df = pd.read_csv(
    r'data/Emissions_accounts/Raw_data/GHG_large_facilities/Greenhouse gas emissions from large facilities.csv',
    encoding='ISO-8859-1')

In [None]:
from constants import metal_ore_mining_naics, metal_manufacturing_naics

In [None]:
from db_creation_data_cleaning import filter_ghg_facility_naics

In [None]:
# Apply the project's NAICS filter once per code list: first the metal ore
# mining codes, then the metal manufacturing codes (both come from constants).
ghg_facility_metal_mining_df, ghg_facility_metal_manufacturing_df = (
    filter_ghg_facility_naics(ghg_facility_df, naics_codes)
    for naics_codes in (metal_ore_mining_naics, metal_manufacturing_naics)
)

In [None]:
# Write both filtered GHG facility tables to the cleaned-data folder, without the index.
for ghg_subset, csv_path in (
    (ghg_facility_metal_mining_df, r'data/cleaned_data/ghg_facility_metal_mining_df.csv'),
    (ghg_facility_metal_manufacturing_df, r'data/cleaned_data/ghg_facility_metal_manufacturing_df.csv'),
):
    ghg_subset.to_csv(csv_path, index=False)

# SUT

In [None]:
# Load the raw "Mining industries, energy consumption by NAICS" table (16100029.csv).
sut_df = pd.read_csv(
    r'data/Emissions_accounts/Raw_data/Mining industries, energy consumption by NAICS - 16100029-eng/16100029.csv'
) 

In [None]:
from db_creation_data_cleaning import clean_sut

In [None]:
# Clean the energy table with the project's clean_sut helper, passing year=2022
# and asking it to exclude the "Thousands of dollars" unit; then display the result.
sut_df_cleaned = clean_sut(sut_df, year=2022, units_to_exclude=["Thousands of dollars"])
sut_df_cleaned

In [None]:
# Count how many rows the cleaned table holds for each NAICS label.
sut_df_cleaned['North American Industry Classification System (NAICS)'].value_counts()

In [None]:
from db_creation_data_cleaning import split_by_naics

In [None]:
# Split the cleaned table with the project's split_by_naics helper.
# NOTE(review): the lookup below implies the result is keyed by NAICS label - confirm.
naics_dfs = split_by_naics(sut_df_cleaned)

# Example: pull out the table stored under the uranium ore mining label.
uranium_mining_df = naics_dfs['Uranium ore mining [212291]']

# Climate Trace

In [None]:
# Climate TRACE files for Canada (CAN), loaded separately for copper and iron mining.
# Emissions sources
ctrace_copper = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/copper-mining_emissions_sources.csv')
ctrace_iron = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/iron-mining_emissions_sources.csv')
# Confidence files that accompany the emissions sources
ctrace_copper_confidence = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/copper-mining_emissions_sources_confidence.csv')
ctrace_iron_confidence = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/iron-mining_emissions_sources_confidence.csv')
# Ownership files that accompany the emissions sources
ctrace_copper_ownership = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/copper-mining_emissions_sources_ownership.csv')
ctrace_iron_ownership = pd.read_csv(r'data/Sources/ClimateTRACE/CAN/DATA/mineral_extraction/iron-mining_emissions_sources_ownership.csv')

# GeoPackage with the geometries of all mineral extraction sources worldwide.
# NOTE(review): this path is laid out differently from the CSVs above
# (.../ClimateTRACE/mineral_extraction/DATA/... instead of
# .../ClimateTRACE/CAN/DATA/mineral_extraction/...) - confirm it is intended.
ctrace_gdf = gpd.read_file(
    "data/Sources/ClimateTRACE/mineral_extraction/DATA/mineral-extraction_geometries.gpkg")


In [None]:
# Stack the copper and iron tables of each Climate TRACE file type into one table.
ctrace = pd.concat([ctrace_copper, ctrace_iron])
ctrace_confidence = pd.concat([ctrace_copper_confidence, ctrace_iron_confidence])
# The combined ownership table gets its own name, like the two tables above.
# Writing it back into ctrace_copper_ownership would overwrite the copper-only
# frame and append the iron rows again every time this cell is re-run.
ctrace_ownership = pd.concat([ctrace_copper_ownership, ctrace_iron_ownership])

In [None]:
def plot_categorical_distributions(df, columns_of_interest, layout=(2, 3), save_path=None):
    """
    Plot one pie chart per categorical column, with colours shared across charts.

    Each chart shows the percentage share of every value found in that column.
    A given category value is drawn in the same colour in every chart; colours
    come from the 10-colour "tab10" palette and repeat when there are more
    than 10 distinct values across the selected columns.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        columns_of_interest (list): Categorical columns to visualize, one chart each.
            If empty, nothing is plotted.
        layout (tuple): Layout of the subplots (rows, cols). Columns beyond
            rows * cols are not plotted.
        save_path (str, optional): Path to save the image. If None, it is not saved.

    Returns:
        None. The figure is shown with plt.show().
    """
    import matplotlib.pyplot as plt

    columns_of_interest = list(columns_of_interest)
    if not columns_of_interest:
        # Nothing to draw. Without this guard the clean-up of unused subplots
        # would fail because no chart was ever drawn.
        return

    # Collect every category value across the selected columns so that each
    # value maps to one colour, whichever chart it appears in.
    unique_categories = set()
    for col in columns_of_interest:
        unique_categories.update(df[col].dropna().unique())

    color_map = plt.get_cmap("tab10")
    colors = {cat: color_map(i % 10) for i, cat in enumerate(sorted(unique_categories))}

    # squeeze=False always yields a 2-D array of axes, so flatten() also works
    # for a (1, 1) layout, where plt.subplots would otherwise return a bare Axes.
    n_rows, n_cols = layout[0], layout[1]
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 5, n_rows * 5), squeeze=False)
    axes = axes.flatten()

    # zip() stops at the shorter sequence: extra columns are ignored when the
    # grid is too small, extra axes stay empty when it is too large.
    for ax, col in zip(axes, columns_of_interest):
        shares = df[col].value_counts(normalize=True) * 100
        slice_colors = [colors[cat] for cat in shares.index]

        shares.plot(kind="pie", autopct="%1.1f%%", startangle=140, colors=slice_colors, ax=ax)
        ax.set_title(f"Distribution of {col}")
        ax.set_ylabel("")  # Hide the y-axis label for clarity

    # Remove the subplots that received no chart
    for unused_ax in axes[len(columns_of_interest):]:
        fig.delaxes(unused_ax)

    fig.tight_layout()

    # Save if a path is provided
    if save_path:
        fig.savefig(save_path, dpi=300)

    plt.show()

In [None]:
def compute_annual_figures(df, date_column, columns_to_sum, metadata_columns):
    """
    Aggregates the sum of selected columns by year while keeping relevant metadata.

    The year is taken from `date_column`, which is parsed as datetime. Rows are
    grouped by that year plus every metadata column, and `columns_to_sum` are
    summed within each group.

    The input DataFrame is not modified. Rows whose date cannot be parsed have
    no year and are excluded. Rows with missing values in a metadata column are
    kept (missing values form their own group) instead of being silently
    dropped by the grouping.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the data.
        date_column (str): The name of the column containing datetime values.
        columns_to_sum (list): List of column names to aggregate by year.
        metadata_columns (list): List of metadata columns to keep.

    Returns:
        pd.DataFrame: One row per (year, metadata) combination, with a "year"
        column, the metadata columns and the summed columns.
    """
    # Unparseable dates become NaT instead of raising
    parsed_dates = pd.to_datetime(df[date_column], errors="coerce")

    # assign() returns a new frame, so the caller's DataFrame is left untouched
    working = df.assign(**{date_column: parsed_dates})
    working["year"] = parsed_dates.dt.year

    # Rows without a usable date cannot be attributed to a year
    working = working[working["year"].notna()]

    # dropna=False keeps rows whose metadata is partly missing; by default
    # groupby discards every row that has a NaN in any grouping key.
    group_keys = ["year"] + list(metadata_columns)
    annual_sums = working.groupby(group_keys, as_index=False, dropna=False)[columns_to_sum].sum()

    return annual_sums

In [None]:
def plot_subsector_breakdown_stacked(df, subsector_column, subsector_value, year_column, breakdown_columns, value_columns, save_path=None):
    """
    Plots one stacked bar chart per value column for a specific subsector, per year.

    The data is filtered to `subsector_value`, summed per year and breakdown
    category, and drawn with one subplot per entry of `value_columns`. Bars are
    stacked by the FIRST breakdown column only; any further breakdown columns
    are summed over.

    Parameters:
        df (pd.DataFrame): The dataset containing relevant data.
        subsector_column (str): Column representing the subsector.
        subsector_value (str): The specific subsector to filter for visualization.
        year_column (str): Column representing the year.
        breakdown_columns (list): List of categorical columns used for breakdown
            (e.g., 'other1', 'other2'). The first one defines the stacked segments.
        value_columns (list): List of numerical columns to visualize
            (e.g., emissions, activity, capacity). One subplot is drawn per column.
        save_path (str, optional): Path to save the image. If None, it is not saved.

    Raises:
        ValueError: If `breakdown_columns` or `value_columns` is empty, or if no
            row matches `subsector_value`.
    """
    import matplotlib.pyplot as plt

    breakdown_columns = list(breakdown_columns)
    value_columns = list(value_columns)
    if not breakdown_columns:
        raise ValueError("breakdown_columns must contain at least one column name")
    if not value_columns:
        raise ValueError("value_columns must contain at least one column name")
    stack_column = breakdown_columns[0]

    # Filter data for the selected subsector
    df_filtered = df[df[subsector_column] == subsector_value]
    if df_filtered.empty:
        raise ValueError(f"No rows found where {subsector_column!r} == {subsector_value!r}")

    # Group data by year and breakdown columns, summing numerical values
    grouped_df = df_filtered.groupby([year_column] + breakdown_columns, as_index=False)[value_columns].sum()

    # Collapse onto (year, first breakdown column). With a single breakdown
    # column this changes nothing; with several it sums over the others, so
    # reshaping to wide form cannot hit duplicate (year, category) entries.
    stacked_totals = grouped_df.groupby([year_column, stack_column])[value_columns].sum()

    # One subplot per value column (5 inches tall each). squeeze=False keeps
    # `axes` indexable even when there is only one value column.
    n_plots = len(value_columns)
    fig, axes = plt.subplots(n_plots, 1, figsize=(12, 5 * n_plots), squeeze=False)
    axes = axes[:, 0]
    colors = plt.get_cmap("tab10")  # Consistent color scheme

    # Create a stacked bar plot for each value column
    for ax, value_col in zip(axes, value_columns):
        # Years as rows, categories of the first breakdown column as columns
        pivot_df = stacked_totals[value_col].unstack(stack_column)
        pivot_df.plot(kind="bar", stacked=True, ax=ax, colormap=colors)

        ax.set_title(f"{value_col} Breakdown for {subsector_value}")
        ax.set_xlabel("Year")
        ax.set_ylabel(value_col)
        ax.legend(title=stack_column)
        ax.tick_params(axis='x', rotation=45)

    fig.tight_layout()

    # Save the figure if a path is provided
    if save_path:
        fig.savefig(save_path, dpi=300, bbox_inches="tight")

    plt.show()

In [None]:
# Draw one pie chart per confidence column of the combined copper + iron
# confidence table, on a 2 x 3 grid, and save the figure under results/.
plot_categorical_distributions(
    ctrace_confidence, 
    columns_of_interest=["source_type", "capacity", "capacity_factor", "activity", "emissions_factor", "emissions_quantity"], 
    layout=(2, 3), 
    save_path="results/ctrace_confidence.png")

In [None]:
# Descriptive columns carried through the annual aggregation as grouping keys.
metadata_columns = [
    'source_id', 'source_name', 'source_type', 'iso3_country', 'sector',
    'subsector', 'lat', 'lon', 'geometry_ref',
    'gas', 'activity_units', 'emissions_factor', 'emissions_factor_units',
    'capacity_units', 'capacity_factor', 'other1', 'other1_def',
    'other2', 'other2_def', 'other3', 'other3_def', 'sector_id',
]

# Numeric columns that are summed for each year.
columns_to_sum = [
    'emissions_quantity',
    'activity',
    'capacity',
]

In [None]:
# Compute annual sums while keeping relevant metadata: the year is taken from
# 'start_time', and columns_to_sum are summed per year and metadata combination.
# The resulting table is then displayed.
ctrace_annual = compute_annual_figures(
    ctrace, 
    'start_time', 
    columns_to_sum, 
    metadata_columns)
ctrace_annual

In [None]:
# Activity = amount of ore extracted, in tonnes (e.g. t of copper ore, t of iron ore)
# Capacity = metal resource contained in the mine, in tonnes

In [None]:
# Save the annual Climate TRACE table as CSV, without writing the DataFrame index.
ctrace_annual.to_csv(r'data/cleaned_data/ctrace_annual.csv', index=False)