In [1]:
import pandas as pd
import numpy as np

In [2]:
import plotly.graph_objects as go

In [4]:
# Load the 'IDs' sheet of the extracted-data workbook; the path is relative to
# the notebook's working directory.
df = pd.read_excel(
    r'data/Tables/extracted_data.xlsx',
    sheet_name='IDs',
)

In [14]:
import os
import pandas as pd
import plotly.graph_objects as go

def create_sankey(
    df,
    save_path
):
    """
    Build a Sankey diagram, save it as HTML (and PDF when possible), and return it.

    Flows drawn:
    - Mining/Manufacturing: Facility Type → Company Name → Commodity
    - Project: Facility Type → Commodity

    Rows with a missing 'facility_type' are dropped and missing commodities are
    shown as 'Unknown'. Comma-separated commodity lists are split (whitespace
    around each entry is ignored) so every commodity gets its own flow. Because
    link sizes are counted after that split, a row listing k commodities adds k
    units to its facility-type → company link, which keeps each company node's
    inflow equal to its outflow.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'facility_type', 'company_name_folder', 'commodities_nrcan'
    save_path : str
        Path WITHOUT extension (e.g., 'results/sankey/facility_company_commodity').
        A bare file name (no directory part) is also accepted.
        '<save_path>.html' is always written; '<save_path>.pdf' is written only
        if the static image export succeeds.

    Returns
    -------
    The figure object created by ``go.Figure``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # os.makedirs('') raises FileNotFoundError, so only create the output
    # directory when save_path actually contains one.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # 1) Basic cleaning
    df_viz = df[['facility_type', 'company_name_folder', 'commodities_nrcan']].dropna(subset=['facility_type']).copy()
    # Split on bare commas and strip each token so 'Gold, Silver', 'Gold,Silver'
    # and 'Gold ,Silver' all produce the same two commodities. str() guards
    # against non-string cells, which would otherwise turn into NaN labels.
    df_viz['commodities_nrcan'] = (
        df_viz['commodities_nrcan']
        .fillna('Unknown')
        .map(lambda cell: [token.strip() for token in str(cell).split(',')])
    )
    df_viz = df_viz.explode('commodities_nrcan')

    # 2) Define nodes
    facility_types = df_viz['facility_type'].unique().tolist()
    companies = df_viz['company_name_folder'].dropna().unique().tolist()
    commodities = df_viz['commodities_nrcan'].unique().tolist()

    node_labels = facility_types + companies + commodities
    node_colors = (
        ["#66c2a5"] * len(facility_types) +
        ["#fc8d62"] * len(companies) +
        ["#8da0cb"] * len(commodities)
    )
    # Key nodes by (level, label) rather than label alone: a company that shares
    # its name with a commodity or facility type must stay a distinct node,
    # otherwise links get wired to the wrong node or become self-loops.
    node_levels = (
        ['facility'] * len(facility_types) +
        ['company'] * len(companies) +
        ['commodity'] * len(commodities)
    )
    node_indices = {key: idx for idx, key in enumerate(zip(node_levels, node_labels))}

    # 3) Build links
    sources = []
    targets = []
    values = []

    def add_links(frame, src_level, src_col, dst_level, dst_col):
        """Append one link per (src_col, dst_col) pair in `frame`, sized by row count."""
        # groupby skips rows whose key is NaN, so rows without a company simply
        # contribute no company links.
        pair_counts = frame.groupby([src_col, dst_col]).size()
        for (src_label, dst_label), count in pair_counts.items():
            sources.append(node_indices[(src_level, src_label)])
            targets.append(node_indices[(dst_level, dst_label)])
            values.append(int(count))

    # Separate data
    df_mine_manuf = df_viz[df_viz['facility_type'].isin(['mining', 'manufacturing'])]
    df_project = df_viz[df_viz['facility_type'] == 'project']

    # a) mining/manufacturing: facility_type → company
    add_links(df_mine_manuf, 'facility', 'facility_type', 'company', 'company_name_folder')
    # b) company → commodity
    add_links(df_mine_manuf, 'company', 'company_name_folder', 'commodity', 'commodities_nrcan')
    # c) project: facility_type → commodity
    add_links(df_project, 'facility', 'facility_type', 'commodity', 'commodities_nrcan')

    # 4) Build figure
    fig = go.Figure(go.Sankey(
        arrangement="snap",
        node=dict(
            pad=15, thickness=20, line=dict(color="black", width=1),
            label=node_labels, color=node_colors,
        ),
        link=dict(source=sources, target=targets, value=values)
    ))

    fig.update_layout(
        font_family="Arial", font_color='black', font_size=12,
        width=1400, height=800,
        paper_bgcolor="white", plot_bgcolor="white"
    )

    # 5) Save outputs
    fig.write_html(f"{save_path}.html")
    try:
        fig.write_image(f"{save_path}.pdf", format="pdf", width=1400, height=800, scale=2)
    except ValueError:
        # NOTE(review): this handler treats any ValueError from write_image as
        # "Kaleido missing" — confirm against the installed plotly version.
        print("Warning: Kaleido not installed, PDF export skipped.")

    return fig


In [15]:
# Build the diagram from the loaded frame; create_sankey writes
# 'results/sankey.html' and attempts 'results/sankey.pdf'.
sankey_fig = create_sankey(
    df,
    save_path='results/sankey',
)

