In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import os 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from matplotlib.ticker import PercentFormatter

In [None]:
# Excel workbook read by the World Mining Data functions below (they parse every
# sheet of it); the path is relative to the notebook's working directory.
file_path = 'data/World_mining_data_2024/6.5. Share_of_World_Mineral_Production_2022_by_Countries.xlsx'  

# World Mining Data

In [None]:
def plot_canada_production_heatmap(file_path, commodity_list=None):
    """
    Plot Canada's 2022 production volume and world share as two side-by-side heatmaps.

    Every sheet of the workbook is treated as one commodity. Sheets without a row
    whose 'Country' contains "Canada" are left out. Row labels have the form
    "<sheet name> (<unit>)", or just "<sheet name>" when the sheet gives no unit.
    The figure is saved to 'results/production_resv/production_data_canada_wmd2024.png'
    (the directory is created if needed) and then shown.

    Parameters:
    - file_path (str): Path to the Excel file containing the data.
    - commodity_list (list, optional): Commodities to include, given either as sheet
      names (e.g. "Gold") or as full row labels (e.g. "Gold (kg)"). If None, all
      commodities are included.

    Raises:
    - ValueError: If no commodity with Canadian data remains after filtering.
    """
    output_path = Path('results/production_resv/production_data_canada_wmd2024.png')
    wanted = None if commodity_list is None else set(commodity_list)

    records = []
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(file_path) as excel_data:
        for sheet in excel_data.sheet_names:
            df = excel_data.parse(sheet, skiprows=1)  # Start reading from the second row with corrected column headers

            # Use the first non-empty cell of the 'unit' column, if there is one.
            unit = ''
            if 'unit' in df.columns:
                units = df['unit'].dropna()
                if not units.empty:
                    unit = str(units.iloc[0]).strip()
            label = f"{sheet} ({unit})" if unit else sheet

            # Match on the bare sheet name so callers need not know the unit suffix;
            # the full label is accepted too for backward compatibility.
            if wanted is not None and sheet not in wanted and label not in wanted:
                continue

            canada_row = df[df['Country'].str.contains("Canada", na=False)]
            if canada_row.empty:
                continue

            records.append({
                'Commodity': label,
                'Production Volume': float(canada_row['Production 2022'].values[0]),
                'World Share (%)': float(canada_row['Share in %'].values[0]),
            })

    if not records:
        raise ValueError("No Canadian production data found for the requested commodities.")

    # One row per commodity, alphabetical, indexed by the label shown on the y-axis
    heatmap_data = (
        pd.DataFrame(records, columns=['Commodity', 'Production Volume', 'World Share (%)'])
        .sort_values(by="Commodity")
        .set_index("Commodity")
    )

    # Two single-column heatmaps so that each quantity gets its own colour scale
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 12), gridspec_kw={'width_ratios': [1, 1]})

    sns.heatmap(heatmap_data[['Production Volume']], ax=ax1, cmap="Blues", cbar_kws={'label': 'Production Volume'}, annot=True, fmt=".2f", yticklabels=True)
    ax1.set_title('Canada Production Volume')
    ax1.set_ylabel("")  # The commodity labels speak for themselves

    sns.heatmap(heatmap_data[['World Share (%)']], ax=ax2, cmap="Reds", cbar_kws={'label': 'World Share (%)'}, annot=True, fmt=".2f", yticklabels=False)
    ax2.set_title('Canada as % World Share')
    ax2.set_ylabel("")  # Rows are labelled on the left panel only

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=300, format='png', bbox_inches='tight')

    plt.show()

In [None]:
# Plot every commodity in the workbook (no commodity_list filter)
plot_canada_production_heatmap(file_path)

In [None]:
def plot_canada_production_heatmap_standardized_units(file_path, commodity_list=None):
    """
    Plot Canada's 2022 production (converted to metric tonnes) and world share as heatmaps.

    Every sheet of the workbook is treated as one commodity. Sheets without a row
    whose 'Country' contains "Canada" are left out. Volumes given in carats ("ct"),
    kilograms ("kg") or million cubic metres ("Mio m3") are converted to tonnes;
    any other unit is assumed to be tonnes already and is left unchanged.
    The figure is saved to
    'results/production_resv/production_data_canada_wmd2024_standardised_units.png'
    (the directory is created if needed) and then shown.

    Parameters:
    - file_path (str): Path to the Excel file containing the data.
    - commodity_list (list, optional): List of commodities (sheet names) to include.
      If None, all commodities are included.

    Raises:
    - ValueError: If no commodity with Canadian data remains after filtering.
    """
    output_path = Path('results/production_resv/production_data_canada_wmd2024_standardised_units.png')
    commodity_col = 'Commodity (metr. t)'
    volume_col = 'Production Volume (tonnes)'
    share_col = 'World Share (%)'

    # Multipliers that turn one reported unit into metric tonnes
    unit_conversions = {
        "ct": 1 / 5_000_000,        # carats to tonnes (1 ct = 0.2 g)
        "kg": 1 / 1_000,            # kilograms to tonnes
        # Natural gas: ~0.717 kg per m3 (methane density), so one million m3
        # weighs ~717 t. The previous factor 0.717 was the per-m3 figure in kg
        # and understated the result by a factor of 1000.
        "Mio m3": 717.0,
    }

    wanted = None if commodity_list is None else set(commodity_list)

    records = []
    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(file_path) as excel_data:
        for sheet in excel_data.sheet_names:
            if wanted is not None and sheet not in wanted:
                continue

            df = excel_data.parse(sheet, skiprows=1)  # Start reading from the second row with corrected column headers

            canada_row = df[df['Country'].str.contains("Canada", na=False)]
            if canada_row.empty:
                continue

            # Use the first non-empty cell of the 'unit' column; default to tonnes.
            unit = 'tonnes'
            if 'unit' in df.columns:
                units = df['unit'].dropna()
                if not units.empty:
                    unit = str(units.iloc[0]).strip()

            production_volume = float(canada_row['Production 2022'].values[0])
            conversion_factor = unit_conversions.get(unit, 1)  # 1 when already in tonnes or unit unknown

            records.append({
                commodity_col: sheet,
                volume_col: production_volume * conversion_factor,
                share_col: float(canada_row['Share in %'].values[0]),
            })

    if not records:
        raise ValueError("No Canadian production data found for the requested commodities.")

    # One row per commodity, alphabetical, indexed by the label shown on the y-axis
    heatmap_data = (
        pd.DataFrame(records, columns=[commodity_col, volume_col, share_col])
        .sort_values(by=commodity_col)
        .set_index(commodity_col)
    )

    # Two single-column heatmaps so that each quantity gets its own colour scale
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 12), gridspec_kw={'width_ratios': [1, 1]})

    sns.heatmap(heatmap_data[[volume_col]], ax=ax1, cmap="Blues", cbar_kws={'label': 'Production Volume (tonnes)'}, annot=True, fmt=".2f", yticklabels=True)
    ax1.set_title('Canada Production Volume (in tonnes)')
    ax1.set_ylabel("")  # The commodity labels speak for themselves

    sns.heatmap(heatmap_data[[share_col]], ax=ax2, cmap="Reds", cbar_kws={'label': 'World Share (%)'}, annot=True, fmt=".2f", yticklabels=False)
    ax2.set_title('Canada as % World Share')
    ax2.set_ylabel("")  # Rows are labelled on the left panel only

    plt.tight_layout()
    # savefig does not create missing directories, so make sure the target exists.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=300, format='png', bbox_inches='tight')
    plt.show()

In [None]:
# Same plot with volumes converted to tonnes, for every commodity in the workbook
plot_canada_production_heatmap_standardized_units(file_path)

In [None]:
def get_canada_production_data(file_path):
    """
    Reads an Excel file with multiple sheets, each representing a commodity.
    Extracts Canada's production, world share, and unit for each commodity, returning a DataFrame.
    Commodities are listed in alphabetical order, with 0s for missing data.

    The unit is the first non-empty cell of the sheet's 'unit' column; when the
    column is absent or entirely empty the unit defaults to 'tonnes'.

    Parameters:
    - file_path (str): Path to the Excel file containing the data.

    Returns:
    - pd.DataFrame: DataFrame with columns: ['Commodity', 'Production Volume', 'World Share (%)', 'Unit']
    """
    columns = ['Commodity', 'Production Volume', 'World Share (%)', 'Unit']
    records = []

    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(file_path) as excel_data:
        for sheet in excel_data.sheet_names:
            df = excel_data.parse(sheet, skiprows=1)  # Start reading from the second row with corrected column headers
            canada_row = df[df['Country'].str.contains("Canada", na=False)]

            # Skip blank unit cells instead of blindly taking row 0 (which may be
            # NaN, or may not exist at all on an empty sheet).
            unit = 'tonnes'
            if 'unit' in df.columns:
                units = df['unit'].dropna()
                if not units.empty:
                    unit = str(units.iloc[0]).strip()

            if canada_row.empty:
                # Keep the commodity in the table, with zeros marking "no Canadian data"
                production, share = 0, 0
            else:
                production = float(canada_row['Production 2022'].values[0])
                share = float(canada_row['Share in %'].values[0])

            records.append({
                'Commodity': sheet,
                'Production Volume': production,
                'World Share (%)': share,
                'Unit': unit,
            })

    # Explicit column list keeps the documented layout even when there are no sheets
    canada_production_df = (
        pd.DataFrame(records, columns=columns)
        .sort_values(by="Commodity")
        .reset_index(drop=True)
    )

    return canada_production_df



In [None]:
# Extract Canada's production, world share and unit for every sheet of the workbook
canada_production_df = get_canada_production_data(file_path)

In [None]:
# Display the extracted table (one row per commodity)
canada_production_df

In [None]:
# Export the table without the index column.
# NOTE: to_excel does not create missing folders, so 'results/' must already exist.
canada_production_df.to_excel(r'results/canada_production_wmd.xlsx', index=False)

# OWID extraction from USGS

In [None]:
# List of commodities available in OWID
# Each name is used verbatim as a folder under data/USGS/OWID and, lower-cased,
# as the file-name prefix (e.g. <name>_reserves.csv) by get_canada_data below.
# NOTE(review): 'Antinomy' looks like a typo for 'Antimony'. If the folder on disk
# is spelled 'Antimony', every file lookup for this entry fails and the row is all
# missing values — confirm against the data directory before renaming.
commodities = [
               'Aluminum', 'Antinomy', 
               'Bismuth',
               'Chromium','Cobalt', 'Copper',
               'Fluorspar',
               'Gallium', 'Germanium', 'Graphite',
               'Helium',
               'Iridium','Iron', 'Iron_ore', 
               'Lithium',
               'Magnesium_compounds', 'Magnesium_metal', 'Manganese', 'Molybdenum', 
               'Nickel', 'Niobium', 
               'Palladium','PGMs', 'Platinum', 'Potash',
               'REEs', 'Rhodium', 'Ruthenium',
               'Silicon', 
               'Tantalum', 'Tin', 'Titanium', 'Tungsten', 
               'Uranium', 
               'Vanadium',
               'Zinc'
]

In [None]:
def _latest_canada_world_values(frame):
    """
    Return ``(canada, world, canada_share_pct)`` for the most recent year in *frame*.

    *frame* needs 'Entity' and 'Year' columns; the value is read from the LAST
    column. ``world`` is the 'World' row when present, otherwise the sum of the
    last column over all rows of that year.
    NOTE(review): that fallback sum would double count if the file also contains
    regional aggregates — confirm against the data.

    ``canada`` is None when there is no 'Canada' row. The share is None when it
    cannot be computed (no Canada value, NaN inputs, or a world total of 0). A
    Canada value of 0 is real data and yields a share of 0.0.
    All three are None when *frame* is None, empty, or has no usable year.
    """
    if frame is None or frame.empty:
        return None, None, None

    latest = frame[frame['Year'] == frame['Year'].max()]
    if latest.empty:
        return None, None, None

    world_rows = latest[latest['Entity'] == 'World']
    if world_rows.empty:
        world = latest.iloc[:, -1].sum()
    else:
        world = world_rows.iloc[0, -1]

    canada_rows = latest[latest['Entity'] == 'Canada']
    canada = None if canada_rows.empty else canada_rows.iloc[0, -1]

    # Explicit checks instead of truthiness, so that a genuine 0 is not mistaken
    # for missing data.
    if canada is None or pd.isna(canada) or pd.isna(world) or world == 0:
        share = None
    else:
        share = canada / world * 100

    return canada, world, share


def get_canada_data(commodities):
    """
    Build a table of Canada's reserves and production next to the world totals.

    For every commodity the files
    ``data/USGS/OWID/<commodity>/<commodity.lower()>_reserves.csv`` and
    ``..._production_<type>.csv`` (type in mine, refinery, processing, smelter)
    are read when they exist; a message is printed for each missing file. From
    each file only the latest year is used.

    Parameters:
    - commodities (list of str): Commodity names, spelled as the data folders.

    Returns:
    - pd.DataFrame: One row per commodity with the columns 'Commodity',
      'Reserves (Canada)', 'Reserves (World)', 'Reserves (%)' and, for each
      production type, '<Type> Production (Canada)', '<Type> Production (World)'
      and '<Type> Production (%)'. Values that are not available are None/NaN.
      The '(World)' columns hold the same total the percentage was computed from.
    """
    production_types = ['mine', 'refinery', 'processing', 'smelter']
    data_list = []

    for commodity in commodities:
        print(f"Processing {commodity}...")

        commodity_dir = os.path.join('data/USGS/OWID', commodity)
        file_stem = commodity.lower()

        # Reserves
        reserves_file = os.path.join(commodity_dir, f'{file_stem}_reserves.csv')
        try:
            reserves_df = pd.read_csv(reserves_file)
        except FileNotFoundError:
            print(f"Reserves file {reserves_file} not found.")
            reserves_df = None

        canada_reserves, world_reserves, reserves_pct = _latest_canada_world_values(reserves_df)
        data_row = {
            'Commodity': commodity,
            'Reserves (Canada)': canada_reserves,
            'Reserves (World)': world_reserves,
            'Reserves (%)': reserves_pct,
        }

        # Production, one file per production stage
        for ptype in production_types:
            pfile = os.path.join(commodity_dir, f'{file_stem}_production_{ptype}.csv')
            try:
                production_df = pd.read_csv(pfile)
            except FileNotFoundError:
                print(f"Production file {pfile} not found.")
                production_df = None

            canada_value, world_value, production_pct = _latest_canada_world_values(production_df)
            label = ptype.capitalize()
            data_row[f'{label} Production (Canada)'] = canada_value
            data_row[f'{label} Production (World)'] = world_value
            data_row[f'{label} Production (%)'] = production_pct

        data_list.append(data_row)

    return pd.DataFrame(data_list)

In [None]:
# Build the Canada-vs-world table for every commodity in `commodities` and display it.
# (get_canada_data takes no threshold; all listed commodities are kept.)
df = get_canada_data(commodities)
df

### Canada as % of world total

In [None]:
# Percentage columns of `df` (Canada's share of the world total) shown in the heatmap
percentage_columns = [
    'Reserves (%)', 
    'Mine Production (%)', 
    'Refinery Production (%)', 
    'Processing Production (%)', 
    'Smelter Production (%)'
]

# Keep only those columns, indexed by commodity, with short display labels.
# NaNs are left untouched so that missing data stays distinguishable, and
# rename() returns a new frame (no inplace mutation), so the cell can be re-run.
df_percentages = (
    df[['Commodity'] + percentage_columns]
    .set_index('Commodity')
    .rename(columns={
        'Reserves (%)': 'Reserves', 
        'Mine Production (%)': 'Mine', 
        'Refinery Production (%)': 'Refinery', 
        'Processing Production (%)': 'Processing', 
        'Smelter Production (%)': 'Smelter'
    })
)

# Draw the heatmap on an explicit Axes; NaN cells are masked (left blank)
fig, ax = plt.subplots(figsize=(12, 10))
heatmap = sns.heatmap(df_percentages, annot=True, cmap="Blues", fmt=".2f", linewidths=.5, cbar_kws={'format': '%.0f%%'},
                      annot_kws={"size": 10, "weight": "bold"}, mask=df_percentages.isna(), ax=ax)

# Mark every NaN cell with a red cross; cell (i, j) is centred at (j + 0.5, i + 0.5)
for i, j in zip(*np.where(df_percentages.isna().to_numpy())):
    heatmap.text(j + 0.5, i + 0.5, '✕', color='red', ha='center', va='center', fontsize=14, fontweight='bold')

# Customize x-axis labels
heatmap.set_xticklabels(heatmap.get_xticklabels(), fontsize=12, fontweight='bold')

# Remove the y-axis label 'Commodity'
heatmap.set_ylabel('')

# Set the title with more space using the 'pad' argument
heatmap.set_title('Canada as % of World reserves and production', fontweight='bold', fontsize=16, pad=20)

# Adjust layout and save the heatmap with high quality.
# savefig does not create missing folders, so make sure the target directory exists.
plt.tight_layout()
output_path = Path('results/production_resv/reserves_and_production_heatmap.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, format='png', dpi=300)

# Show the heatmap
plt.show()

### Production in tonnes

In [None]:
# Columns of `df` holding Canada's production per stage, shown in the heatmap
values_columns = [
    'Mine Production (Canada)', 
    'Refinery Production (Canada)', 
    'Processing Production (Canada)', 
    'Smelter Production (Canada)'
]

# Keep only those columns, indexed by commodity, with short display labels.
# NaNs are left untouched so that missing data stays distinguishable, and
# rename() returns a new frame (no inplace mutation), so the cell can be re-run.
df_values = (
    df[['Commodity'] + values_columns]
    .set_index('Commodity')
    .rename(columns={
        'Mine Production (Canada)': 'Mine', 
        'Refinery Production (Canada)': 'Refinery', 
        'Processing Production (Canada)': 'Processing', 
        'Smelter Production (Canada)': 'Smelter'
    })
)

# Draw the heatmap on an explicit Axes; NaN cells are masked (left blank)
fig, ax = plt.subplots(figsize=(12, 10))
heatmap = sns.heatmap(df_values, annot=True, cmap="Blues", fmt=".2f", linewidths=.5, cbar_kws={'label': 'Tonnes'},
                      annot_kws={"size": 10, "weight": "bold"}, mask=df_values.isna(), ax=ax)

# Mark every NaN cell with a red cross; cell (i, j) is centred at (j + 0.5, i + 0.5)
for i, j in zip(*np.where(df_values.isna().to_numpy())):
    heatmap.text(j + 0.5, i + 0.5, '✕', color='red', ha='center', va='center', fontsize=14, fontweight='bold')

# Customize x-axis labels
heatmap.set_xticklabels(heatmap.get_xticklabels(), fontsize=12, fontweight='bold')

# Remove the y-axis label 'Commodity'
heatmap.set_ylabel('')

# Set the title with more space using the 'pad' argument
heatmap.set_title('Canadian production in tonnes', fontweight='bold', fontsize=16, pad=20)

# Adjust layout and save the heatmap with high quality.
# savefig does not create missing folders, so make sure the target directory exists.
plt.tight_layout()
output_path = Path('results/production_resv/production_heatmap.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, format='png', dpi=300)

# Show the heatmap
plt.show()