In [None]:
import pandas as pd
import os

# From World Mining Data 2024

In [None]:
# World Mining Data 2024 workbook: share of world mineral production (2022) by country
file_path_wmd = (
    'data/World_mining_data_2024/'
    '6.5. Share_of_World_Mineral_Production_2022_by_Countries.xlsx'
)

In [None]:
def get_canada_production_data_wmd(file_path):
    """
    Extract Canada's production figures from a workbook with one sheet per commodity.

    Every sheet is read with its first row skipped, so the second row supplies the
    column headers. The function looks up the first row whose 'Country' value
    contains "Canada" and reads its 'Production 2022' and 'Share in %' cells.
    The sheet name is used as the commodity name.

    Parameters:
    - file_path (str): Path to the Excel file containing the data.

    Returns:
    - pd.DataFrame: One row per sheet, sorted alphabetically by commodity, with columns
      ['Commodity', 'Production Volume', 'World Share (%)', 'Unit'].
      'Production Volume' and 'World Share (%)' are 0.0 when the sheet has no
      'Country' column or no row for Canada. 'Unit' is the first non-empty value of
      the sheet's 'unit' column, or 'tonnes' when the sheet states no unit.

    Raises:
    - KeyError: If a sheet lists Canada but lacks the 'Production 2022' or
      'Share in %' column.
    """
    output_columns = ['Commodity', 'Production Volume', 'World Share (%)', 'Unit']
    records = []

    # The context manager releases the workbook handle even if a sheet fails to parse
    with pd.ExcelFile(file_path) as excel_data:
        for sheet in excel_data.sheet_names:
            # Skip the first row so that the second row is used as the header
            df = excel_data.parse(sheet, skiprows=1)

            # Missing data is reported as 0.0 so both numeric columns stay float
            production = 0.0
            world_share = 0.0

            # A sheet without a 'Country' column (e.g. an empty sheet) has no Canada data
            if 'Country' in df.columns:
                # Cast to str so the .str accessor also works on columns holding no text
                is_canada = df['Country'].astype(str).str.contains("Canada", na=False)
                canada_rows = df[is_canada]
                if not canada_rows.empty:
                    first_match = canada_rows.iloc[0]
                    production = float(first_match['Production 2022'])
                    world_share = float(first_match['Share in %'])

            # Use the first stated unit; fall back to 'tonnes' when none is given
            unit = 'tonnes'
            if 'unit' in df.columns:
                stated_units = df['unit'].dropna()
                if not stated_units.empty:
                    unit = stated_units.iloc[0]

            records.append({
                'Commodity': sheet,
                'Production Volume': production,
                'World Share (%)': world_share,
                'Unit': unit,
            })

    # Explicit columns keep the frame well-formed even when there are no records
    canada_production_df = (
        pd.DataFrame(records, columns=output_columns)
        .sort_values(by="Commodity")
        .reset_index(drop=True)
    )

    return canada_production_df

In [None]:
# Extract Canada's figures, then keep only commodities with a non-zero production volume
canada_production_df = get_canada_production_data_wmd(file_path_wmd).loc[
    lambda frame: frame['Production Volume'] != 0
]
canada_production_df

In [None]:
# Save the non-zero production table (volumes still in their original units) as CSV
canada_production_df.to_csv(
    r'data/World_mining_data_2024/canada_production_wmd2024.csv',
    index=False,
)

In [None]:
# Drop the fossil fuels (FF) and the commodities for which no LCI
# (presumably life-cycle inventory) has been found.
# Names must match the workbook sheet names exactly, including any trailing space.
commodities_to_drop = [
    'Baryte',
    'Coking Coal',
    'Diamonds (Gem)',
    'Gypsum and Anhydrite',
    'Lignite',
    'Magnesite',
    'Natural Gas',
    'Oil Sands (part of Petroleum)',
    'Petroleum',
    'Salt (rock, brines, marine)',
    'Steam Coal ',
    'Sulfur (elementar & industrial)',
    'Talc, Steatite & Pyrophyllite',
]
canada_production_df = canada_production_df.loc[
    lambda frame: ~frame['Commodity'].isin(commodities_to_drop)
]

In [None]:
# Display the commodities left after removing those listed in commodities_to_drop
canada_production_df

In [None]:
# Express every production volume in kilograms.
# 'metr. t' is the tonne label matched in the Unit column, and 'tonnes' is the
# fallback that get_canada_production_data_wmd assigns to sheets without a unit
# column. Both must be scaled by 1000; otherwise tonne figures would be exported
# under the '(kg)' header once the Unit column is dropped.
# NOTE(review): rows in any other unit are left unscaled, as before. This assumes
# they are already in kg; confirm against the workbook.
tonne_labels = ['metr. t', 'tonnes']
canada_production_df_kg = canada_production_df.copy()
is_tonnes = canada_production_df_kg['Unit'].isin(tonne_labels)
canada_production_df_kg.loc[is_tonnes, 'Production Volume'] *= 1000
# Rename and drop without inplace=True so the cell gives the same result on re-run
canada_production_df_kg = (
    canada_production_df_kg
    .rename(columns={'Production Volume': 'Production Volume (kg)'})
    .drop(columns=['Unit'])
)
canada_production_df_kg

In [None]:
# Save the kilogram-based production table as CSV
canada_production_df_kg.to_csv(
    r'data/World_mining_data_2024/canada_production_wmd2024_kg.csv',
    index=False,
)

## From USGS's Mineral Commodity Summaries 2024

In [None]:
def get_canada_production_mcs_usgs(folder_path, delimiter=","):
    """
    Collect the Canada and World rows from every CSV file in `folder_path`.

    Files are processed in sorted filename order so the result is reproducible.
    For each `.csv` file (extension matched case-insensitively):

    1. Files with fewer than 5 columns are skipped.
    2. The first 5 columns are kept and renamed to
       [Source, Country, Type, Prod_2022, Prod_est_2023].
    3. Files with no 'Canada' entry in 'Country' (case-insensitive) are skipped.
    4. Otherwise, rows whose 'Country' contains 'Canada' or 'World' are kept.
    5. A 'FileName' column records which CSV file the row came from.
    6. A 'Unit' column stores the original name of the 4th column
       (e.g., "Prod_t_2022").

    Parameters:
    -----------
    folder_path : str
        Path to the folder containing .csv files.
    delimiter : str
        The delimiter for the CSV files (default ',').
        Change to ';' or '\\t' if your file is delimited differently.

    Returns:
    --------
    pd.DataFrame
        All kept rows combined, with columns
        [Source, Country, Type, Prod_2022, Prod_est_2023, FileName, Unit].
        Empty (but with these columns) when no file qualifies.
    """
    standard_columns = ["Source", "Country", "Type", "Prod_2022", "Prod_est_2023"]
    all_data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row order stable
    for filename in sorted(os.listdir(folder_path)):
        # Only process .csv files
        if not filename.lower().endswith(".csv"):
            continue

        file_path = os.path.join(folder_path, filename)

        # Read the CSV (adjust header or skiprows if your files have extra lines)
        df = pd.read_csv(file_path, delimiter=delimiter, header=0)

        # Make sure there are enough columns to map onto the standard names
        if df.shape[1] < len(standard_columns):
            continue

        # Capture the original name of the 4th column before it is renamed
        original_4th_col_name = df.columns[3]

        subset_df = df.iloc[:, :len(standard_columns)].copy()
        subset_df.columns = standard_columns

        # Cast to str so the .str accessor also works when the column holds no text
        country = subset_df["Country"].astype(str)

        # Skip the entire file when it has no Canada row
        if not country.str.contains("Canada", case=False, na=False).any():
            continue

        # Keep the Canada and World rows. assign() builds a new frame, which avoids
        # writing columns into a filtered slice (SettingWithCopyWarning)
        keep = country.str.contains("Canada|World", case=False, na=False)
        all_data.append(
            subset_df[keep].assign(FileName=filename, Unit=original_4th_col_name)
        )

    if not all_data:
        # Return an empty DataFrame with the expected columns
        return pd.DataFrame(columns=standard_columns + ["FileName", "Unit"])

    # Combine all filtered data
    return pd.concat(all_data, ignore_index=True)


In [None]:
# Folder holding the USGS 2024 CSV files scanned by get_canada_production_mcs_usgs
usgs_mcs_2024 = 'data/USGS/Mineral_commodities_2024'

In [None]:
# Gather the Canada and World rows from every CSV file in the USGS folder
canada_mcs24_df = get_canada_production_mcs_usgs(folder_path=usgs_mcs_2024)

In [None]:
# Display the combined Canada/World rows extracted from the USGS CSV files
canada_mcs24_df

In [None]:
# Save the processed USGS extract as CSV
canada_mcs24_df.to_csv(
    r'data/USGS/data_usgs_mineral_commodities_24_processed.csv',
    index=False,
)