# Jupyter Notebook: ged_analysis.ipynb

## 1. Import necessary libraries

In [11]:
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install ipywidgets

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
%matplotlib inline

# Now import ipywidgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


## 2. Define utility functions (for loading, cleaning, computing metrics)

In [12]:
def load_approx_results(file_path, fill_n_density=None):
    """
    Load approximation results from an XLSX file.

    The sheet is read with `pd.read_excel`, rows that are entirely empty are
    dropped, and the known metric columns are coerced to numeric (cells that
    cannot be parsed become NaN). Rows without both graph IDs are removed.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XLSX file.
    fill_n_density : pandas.DataFrame, optional
        Reference frame (e.g., the SimGNN data) that has 'graph_id_1',
        'graph_id_2', 'graph1_n', 'graph2_n', 'graph1_density',
        'graph2_density'. Values from it are used wherever the loaded sheet
        has no value for the same graph pair, including when the sheet does
        not contain the column at all. Values already present in the sheet
        are never overwritten.

    Returns
    -------
    pandas.DataFrame
        The cleaned results, one row per row of the sheet.

    Raises
    ------
    KeyError
        If the sheet lacks 'graph_id_1' or 'graph_id_2'; the message lists
        the columns that were actually found.
    """
    id_cols = ['graph_id_1', 'graph_id_2']

    df = pd.read_excel(file_path)
    df = df.dropna(how='all')  # drop empty rows

    # Fail early with an actionable message instead of an opaque KeyError
    # from dropna/merge further down.
    missing_ids = [c for c in id_cols if c not in df.columns]
    if missing_ids:
        raise KeyError(
            f"{file_path}: missing column(s) {missing_ids}; "
            f"available columns: {list(df.columns)}"
        )

    # Convert to numeric where applicable
    numeric_cols = [
        'ged', 'accuracy', 'absolute_error', 'squared_error',
        'runtime', 'memory_usage_mb',
        'graph1_n', 'graph1_density', 'graph2_n', 'graph2_density'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if fill_n_density is not None:
        needed_cols = ['graph1_n', 'graph2_n', 'graph1_density', 'graph2_density']

        # One reference row per graph pair: duplicated pairs would otherwise
        # multiply the rows of the left join below.
        ref = fill_n_density[id_cols + needed_cols].drop_duplicates(subset=id_cols)

        # Make sure every target column exists, so the merge always produces
        # a '<col>_ref' twin to fill from.
        for c in needed_cols:
            if c not in df.columns:
                df[c] = np.nan

        # Left join so that the approximate results remain in place
        merged = pd.merge(df, ref, how='left', on=id_cols, suffixes=('', '_ref'))

        # Keep existing values; take the reference value only where missing
        for c in needed_cols:
            merged[c] = merged[c].where(merged[c].notna(), merged[f"{c}_ref"])

        df = merged.drop(columns=[f"{c}_ref" for c in needed_cols])

    # Rows without both graph IDs cannot be matched to anything downstream
    return df.dropna(subset=id_cols)


def load_exact_results(file_path):
    """
    Load exact GED results from an XLSX file and derive 'ged_exact'.

    'min_ged' and 'max_ged' are coerced to numeric (unparseable cells become
    NaN). 'ged_exact' is the midpoint of the two bounds when both are
    present (which equals either bound when they coincide); when the
    midpoint cannot be computed, 'min_ged' is used. Rows that end up without
    a graph ID or without 'ged_exact' are dropped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XLSX file.

    Returns
    -------
    pandas.DataFrame
        The sheet's rows with an added 'ged_exact' column.

    Raises
    ------
    KeyError
        If any of 'graph_id_1', 'graph_id_2', 'min_ged', 'max_ged' is absent;
        the message lists the columns that were actually found.
    """
    id_cols = ['graph_id_1', 'graph_id_2']
    bound_cols = ['min_ged', 'max_ged']

    df = pd.read_excel(file_path)
    df = df.dropna(how='all')

    missing = [c for c in id_cols + bound_cols if c not in df.columns]
    if missing:
        raise KeyError(
            f"{file_path}: missing column(s) {missing}; "
            f"available columns: {list(df.columns)}"
        )

    for col in bound_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Vectorized on purpose: DataFrame.apply(axis=1) hands back a DataFrame
    # for an empty frame, which cannot be assigned to a single column.
    midpoint = (df['min_ged'] + df['max_ged']) / 2.0
    # The midpoint is NaN whenever a bound is missing; fall back to min_ged.
    df['ged_exact'] = midpoint.fillna(df['min_ged'])

    return df.dropna(subset=id_cols + ['ged_exact'])

def compute_relative_accuracy(ged_approx, ged_exact):
    """
    Relative accuracy of one approximate GED against its exact value.

    Returns 1 minus the absolute deviation divided by `ged_exact`, or NaN
    when `ged_exact` is zero (the ratio is undefined there).
    Swap in your own definition here if you use a different measure.
    """
    if ged_exact == 0:
        return np.nan
    deviation = abs(ged_approx - ged_exact)
    return 1.0 - deviation / ged_exact

def mean_absolute_error(approx_values, exact_values):
    """Average of the element-wise absolute differences between the two inputs."""
    residuals = np.asarray(approx_values) - np.asarray(exact_values)
    return np.mean(np.absolute(residuals))

def mean_squared_error(approx_values, exact_values):
    """Average of the element-wise squared differences between the two inputs."""
    residuals = np.asarray(approx_values) - np.asarray(exact_values)
    return np.mean(np.square(residuals))

def compute_scalability(graph_sizes, runtimes, memory_usages):
    """
    Compute scalability as the change in runtime and memory usage per unit
    increase in graph size.

    Each slope comes from a degree-1 least-squares fit (numpy.polyfit).
    Pairs in which either the size or the measured value is NaN/inf are
    ignored for that fit, so values coerced to NaN upstream do not poison
    the regression.

    Parameters
    ----------
    graph_sizes, runtimes, memory_usages : array-like of numbers
        One entry per measurement; all three must have the same shape.

    Returns
    -------
    tuple (slope_runtime, slope_memory)
        A slope is NaN when fewer than two distinct graph sizes remain after
        discarding non-finite pairs, since no line can be fitted then.

    Raises
    ------
    TypeError
        If the inputs do not all have the same shape.
    """
    graph_sizes = np.asarray(graph_sizes, dtype=float)
    runtimes = np.asarray(runtimes, dtype=float)
    memory_usages = np.asarray(memory_usages, dtype=float)

    if graph_sizes.shape != runtimes.shape or graph_sizes.shape != memory_usages.shape:
        raise TypeError(
            "expected graph_sizes, runtimes and memory_usages to have the same length"
        )

    def _slope(x, y):
        # Keep only the pairs where both coordinates are usable numbers.
        usable = np.isfinite(x) & np.isfinite(y)
        x, y = x[usable], y[usable]
        # A slope needs at least two different x positions.
        if np.unique(x).size < 2:
            return np.nan
        return np.polyfit(x, y, 1)[0]

    return _slope(graph_sizes, runtimes), _slope(graph_sizes, memory_usages)

## 3. Define file paths for each dataset & method

In [15]:
# Root folder that holds every result spreadsheet. Set the GED_RESULTS_ROOT
# environment variable to relocate the data without editing the notebook;
# the fallback is the location this notebook was originally written for.
RESULTS_ROOT = os.environ.get("GED_RESULTS_ROOT", r"C:\project_data\results")

# Datasets to analyse, in the order they are loaded.
DATASET_NAMES = ("PROTEINS", "AIDS", "IMDB-BINARY")


def _dataset_paths(name, root=RESULTS_ROOT):
    """Return the method -> XLSX path mapping for dataset `name` under `root`."""
    return {
        "HED":    os.path.join(root, "gedlib", name, f"{name}_HED_results.xlsx"),
        "IPFP":   os.path.join(root, "gedlib", name, f"{name}_IPFP_results.xlsx"),
        "SimGNN": os.path.join(root, "simgnn", name, "performance.xlsx"),
        "Exact":  os.path.join(root, "exact_ged", name, "merged", "results.xlsx"),
    }


# dataset name -> {method name -> path of its results workbook}
datasets = {name: _dataset_paths(name) for name in DATASET_NAMES}

## 4. Load data for each dataset & method

In [16]:
def _add_metrics(df_approx):
    """
    Add error/accuracy and size/density columns to `df_approx` and return it.

    Error and accuracy columns are written only when both 'ged' and
    'ged_exact' are present; the size and density averages only need their
    own source columns.
    """
    cols = set(df_approx.columns)

    if {'ged', 'ged_exact'} <= cols:
        error = df_approx['ged'] - df_approx['ged_exact']
        df_approx['absolute_error'] = error.abs()
        df_approx['squared_error'] = error ** 2
        # Plain iteration instead of DataFrame.apply(axis=1): apply returns a
        # DataFrame for an empty frame, which cannot be stored in one column.
        df_approx['accuracy'] = pd.Series(
            [
                compute_relative_accuracy(approx, exact) if pd.notnull(exact) else np.nan
                for approx, exact in zip(df_approx['ged'], df_approx['ged_exact'])
            ],
            index=df_approx.index,
            dtype=float,
        )

    # Average node count / density of the two graphs in each pair
    if {'graph1_n', 'graph2_n'} <= cols:
        df_approx['graph_size'] = (df_approx['graph1_n'] + df_approx['graph2_n']) / 2.0
    if {'graph1_density', 'graph2_density'} <= cols:
        df_approx['graph_density'] = (
            df_approx['graph1_density'] + df_approx['graph2_density']
        ) / 2.0

    return df_approx


# dataset name -> {method name -> DataFrame of results joined with exact GED}
data = {}

for dset, paths in datasets.items():
    # SimGNN is loaded first because it doubles as the reference for
    # node counts and densities of the other methods.
    df_simgnn = load_approx_results(paths["SimGNN"])

    # Load exact GED
    df_exact = load_exact_results(paths["Exact"])

    approx_frames = {
        "HED": load_approx_results(paths["HED"], fill_n_density=df_simgnn),
        "IPFP": load_approx_results(paths["IPFP"], fill_n_density=df_simgnn),
        # SimGNN cannot fill gaps from itself, so the frame loaded above is
        # reused instead of reading the workbook a second time.
        "SimGNN": df_simgnn,
    }

    # Left-join each approximate result with the exact GED (in memory only)
    # and derive the per-pair metrics.
    data[dset] = {
        method: _add_metrics(
            pd.merge(
                frame, df_exact,
                on=['graph_id_1', 'graph_id_2'],
                how='left',
                suffixes=('', '_exact'),
            )
        )
        for method, frame in approx_frames.items()
    }


KeyError: ['graph_id_1', 'graph_id_2']

## 5. Generate correlation plots for each dataset

In [9]:
# (A) 3 plots to show correlation between accuracy & graph size (small, medium, large).
# We'll define a function that loops over the chosen dataset and method
# and makes scatter plots or box plots.

def plot_accuracy_vs_size_ranges(dset, method, small_max=50, large_min=100):
    """
    Scatter-plot accuracy against average graph size, one figure per size bin.

    For a given dataset (e.g. 'PROTEINS') and method (e.g. 'HED'), three
    separate figures are shown:
      1) small  (graph_size <  small_max)
      2) medium (small_max <= graph_size <= large_min)
      3) large  (graph_size >  large_min)

    Parameters
    ----------
    dset : str
        Key into the module-level `data` dict.
    method : str
        Key into `data[dset]`.
    small_max : number, default 50
        Upper bound (exclusive) of the small bin.
    large_min : number, default 100
        Lower bound (exclusive) of the large bin.

    Raises
    ------
    ValueError
        If `small_max` is greater than `large_min`.
    """
    if small_max > large_min:
        raise ValueError("small_max must not exceed large_min")

    df = data[dset][method]
    # Rows lacking either coordinate cannot be plotted
    df_valid = df.dropna(subset=['graph_size', 'accuracy'])
    sizes = df_valid['graph_size']

    # (label used in the title, row mask) for each size bin
    size_bins = [
        (f"Small <{small_max} nodes", sizes < small_max),
        (f"Medium {small_max}-{large_min} nodes", (sizes >= small_max) & (sizes <= large_min)),
        (f"Large >{large_min} nodes", sizes > large_min),
    ]

    for label, mask in size_bins:
        subset = df_valid[mask]
        fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
        ax.scatter(subset['graph_size'], subset['accuracy'], alpha=0.6)
        ax.set_title(f"{dset} - {method} ({label})")
        ax.set_xlabel("Average Graph Size")
        ax.set_ylabel("Relative Accuracy")
        ax.grid(True)
        plt.show()


# (B) 2 plots for graphs of varying densities (sparse vs. dense).
# We'll define a function that picks a threshold or uses some percentile to define "sparse" vs. "dense".

def plot_accuracy_vs_density(dset, method, density_threshold=0.1):
    """
    Show two scatter figures of accuracy against average graph density:
    first the "sparse" pairs (density < threshold), then the "dense" pairs
    (density >= threshold), for `data[dset][method]`.
    """
    frame = data[dset][method]
    # Only rows that have both coordinates can be drawn
    plottable = frame.dropna(subset=['graph_density', 'accuracy'])
    density = plottable['graph_density']

    # (row mask, figure title) in display order: sparse first, dense second
    panels = (
        (density < density_threshold,
         f"{dset} - {method} (Sparse < {density_threshold})"),
        (density >= density_threshold,
         f"{dset} - {method} (Dense >= {density_threshold})"),
    )

    for mask, title in panels:
        part = plottable[mask]
        fig, ax = plt.subplots(figsize=(6, 4), dpi=120)
        ax.scatter(part['graph_density'], part['accuracy'], alpha=0.6)
        ax.set_title(title)
        ax.set_xlabel("Average Graph Density")
        ax.set_ylabel("Relative Accuracy")
        ax.grid(True)
        plt.show()


## 6. Usage

In [None]:
# Example: produce the three size-correlation plots and the two density plots for PROTEINS/HED
# Size-bin figures are shown first, followed by the sparse/dense figures.
plot_accuracy_vs_size_ranges("PROTEINS", "HED")
plot_accuracy_vs_density("PROTEINS", "HED", 0.1)

# You can repeat for each method or dataset as needed:
# plot_accuracy_vs_size_ranges("PROTEINS", "IPFP")
# plot_accuracy_vs_density("PROTEINS", "IPFP", density_threshold=0.1)