In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Relative locations of the raw NRCan CSV exports read by this notebook.
production_file_path = (
    r'data/NRCan/'
    r'Production, shipments and value of shipments of metallic and non-metallic minerals - 16100022-eng/'
    r'16100022.csv'
)
energy_use_file_path = (
    r'data/NRCan/'
    r'Mining industries, energy consumption by NAICS - 16100029-eng/'
    r'16100029.csv'
)

# Production data from 2019 to 2023

Production, shipments and value of shipments of metallic and non-metallic minerals, annual. The data are provided by NRCan and can be downloaded from [here](https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=1610002201).

In [None]:
# Load the raw production/shipments table from the CSV path configured above.
production_df = pd.read_csv(filepath_or_buffer=production_file_path)

In [None]:
# Show the raw production table as this cell's output.
production_df

In [None]:
# Keep national-level physical quantities only: rows for Canada whose variable is
# 'Quantity produced', excluding anything reported in thousands of dollars.
is_canada = production_df['GEO'] == 'Canada'
is_quantity_produced = production_df['Variables'] == 'Quantity produced'
is_physical_unit = production_df['UOM'] != 'Thousands of dollars'
kept_columns = ['REF_DATE', 'GEO', 'Products', 'Variables', 'UOM', 'VALUE']

production_df_clean = (
    production_df.loc[is_canada & is_quantity_produced & is_physical_unit, kept_columns]
    # Sort products alphabetically. A stable sort keeps the source-file order of the
    # rows within each product; the default quicksort leaves tied rows in an
    # arbitrary order.
    .sort_values(by='Products', kind='stable')
    .reset_index(drop=True)
    # Missing quantities are written as 0. Chained (not in-place) so the whole cell
    # is a single expression that can be re-run safely.
    # NOTE(review): confirm that a missing VALUE really means zero production.
    .fillna({'VALUE': 0})
)
production_df_clean

In [None]:
# Keyword lists used to classify product names (matched as lowercase substrings)
metals = ['cobalt', 'copper', 'gold', 'ilmenite', 'iridium', 'iron', 'lead', 'lithium', 'molybdenum', 'nickel', 'niobium', 
          'palladium', 'platinum', 'platinum group', 'rhodium', 'ruthenium', 'silver', 'spodumene', 'tantalum', 'titanium', 'uranium', 'zinc']
non_metals = ['barite', 'diamonds', 'gemstones', 'graphite', 'gypsum', 'marl', 'mica', 'peat', 'potash', 
              'potassium', 'pumice', 'salt', 'serpentine', 'soapstone', 'sodium', 'wollastonite', 'zeolite']
aggregates = ['clay', 'lime', 'granite', 'magnesite', 'marble', 'nepheline', 'sand', 'slate', 'stone']


def categorize_product(product_name):
    """Return the product family used for the 'TYPE' column.

    The name is lower-cased and tested against the keyword lists in a fixed
    order: ``metals`` first, then ``non_metals``, then ``aggregates``. The first
    list containing a keyword that occurs anywhere in the name decides the
    result, so a name matching several lists gets the earliest family.

    Matching is by plain substring, not by whole word: a keyword also matches
    inside a longer word (for example 'mica' occurs inside 'chemical').

    Parameters
    ----------
    product_name : str
        Product label to classify. Non-string values (e.g. NaN or None for a
        missing label) are not matched against any list.

    Returns
    -------
    str
        'Metals', 'Non_metals', 'Aggregates', or 'Other' when nothing matches
        or the value is not a string.
    """
    # A missing label has no .lower(); classify it as 'Other' instead of raising.
    if not isinstance(product_name, str):
        return 'Other'

    lowered = product_name.lower()
    # Order matters: earlier families take precedence over later ones.
    families = (
        ('Metals', metals),
        ('Non_metals', non_metals),
        ('Aggregates', aggregates),
    )
    for label, keywords in families:
        if any(keyword in lowered for keyword in keywords):
            return label
    return 'Other'

In [None]:
# Label every row with the product family derived from its product name.
product_types = production_df_clean['Products'].apply(categorize_product)
production_df_clean['TYPE'] = product_types

In [None]:
# Show the cleaned production table, which now includes the TYPE column.
production_df_clean

In [None]:
# Persist the cleaned production table; the row index is not written to the file.
production_df_clean.to_csv(
    path_or_buf=r'data/NRCan/Cleaned/qty_produced_2019_2023.csv',
    index=False,
)

# Energy consumption 

Data on the quantity of energy purchased and the energy expenses are presented at the national level, by energy source (electricity, heavy fuel oil, diesel, natural gas, etc.) and by North American Industry Classification System (NAICS). The data are provided by NRCan and can be downloaded from [here](https://open.canada.ca/data/en/dataset/85727787-6faf-4fdd-865e-b80ee9760753).

In [None]:
# Load the raw energy-consumption table from the CSV path configured above.
energy_df = pd.read_csv(filepath_or_buffer=energy_use_file_path)

In [None]:
# Keep the rows for Canada whose unit of measure is not 'Thousands of dollars',
# restricted to the columns used later on.
energy_is_canada = energy_df['GEO'] == 'Canada'
energy_is_physical_unit = energy_df['UOM'] != 'Thousands of dollars'
energy_columns = [
    'REF_DATE',
    'GEO',
    'North American Industry Classification System (NAICS)',
    'Energy types',
    'UOM',
    'VALUE',
]
energy_df_clean = energy_df.loc[energy_is_canada & energy_is_physical_unit, energy_columns]

energy_df_clean

In [None]:
# Persist the cleaned energy table; the row index is not written to the file.
energy_df_clean.to_csv(
    path_or_buf=r'data/NRCan/Cleaned/energy_consumption_2019_2023.csv',
    index=False,
)

In [None]:
# List the distinct NAICS industry labels present in the cleaned energy table.
naics_column = 'North American Industry Classification System (NAICS)'
energy_df_clean[naics_column].unique()

It is possible to compute an energy recipe (e.g. production / energy consumption) for iron, diamond and potash. For the other products, co-production or by-production makes it difficult to know how much of the energy consumption to attribute to each product.