# Jupyter Notebook: ged_analysis.ipynb

## 1. Import necessary libraries

In [5]:
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install ipywidgets

import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt
%matplotlib inline

# Now import ipywidgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


## 2. Define utility functions (for loading, cleaning, computing metrics)

In [6]:
def load_approx_results(file_path):
    """
    Load approximation results from an .xlsx file.
    Expected columns:
        'method', 'graph_id_1', 'graph_id_2', 'ged', 'accuracy', 'absolute_error', 'squared_error',
        'runtime', 'memory_usage_mb', 'graph1_n', 'graph1_density', 'graph2_n', 'graph2_density',
        'scalability'
    Returns a cleaned DataFrame.
    """
    df = pd.read_excel(file_path)

    # Example of dropping empty rows or ignoring certain columns if not found
    # You can adapt or refine these steps as needed:
    df.dropna(how='all', inplace=True)  # drop rows that are entirely NaN
    # You can also fill or ignore partial NaNs, e.g.:
    # df['accuracy'] = df['accuracy'].fillna(0.0)  # or any strategy

    # Convert columns to appropriate dtypes, if necessary
    numeric_cols = ['ged', 'accuracy', 'absolute_error', 'squared_error',
                    'runtime', 'memory_usage_mb', 'graph1_n', 'graph1_density',
                    'graph2_n', 'graph2_density']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Drop or ignore rows where critical values are missing
    df.dropna(subset=['graph_id_1', 'graph_id_2', 'method'], inplace=True)

    return df

def load_exact_results(file_path):
    """
    Load exact GED results from an .xlsx file.
    Relevant columns: 'graph_id_1', 'graph_id_2', 'min_ged', 'max_ged'
    The exact GED is:
       - median(min_ged, max_ged) if min_ged != max_ged
       - min_ged (or max_ged) if they are equal
    Returns a DataFrame with an additional 'ged_exact' column.
    """
    df = pd.read_excel(file_path)
    # Clean up, drop empty rows
    df.dropna(how='all', inplace=True)

    # Convert columns to numeric
    for col in ['min_ged', 'max_ged']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Compute exact GED
    df['ged_exact'] = df.apply(
        lambda row: (row['min_ged'] + row['max_ged']) / 2
                    if pd.notnull(row['min_ged'])
                       and pd.notnull(row['max_ged'])
                       and row['min_ged'] != row['max_ged']
                    else row['min_ged'],
        axis=1
    )

    df.dropna(subset=['graph_id_1', 'graph_id_2', 'ged_exact'], inplace=True)

    return df

def compute_relative_accuracy(ged_approx, ged_exact):
    """
    Relative accuracy measure (example).
    If you have your own definition, replace accordingly.
    """
    return 1.0 - abs(ged_approx - ged_exact) / ged_exact if ged_exact != 0 else np.nan

def mean_absolute_error(approx_values, exact_values):
    approx_values = np.array(approx_values)
    exact_values = np.array(exact_values)
    return np.mean(np.abs(approx_values - exact_values))

def mean_squared_error(approx_values, exact_values):
    approx_values = np.array(approx_values)
    exact_values = np.array(exact_values)
    return np.mean((approx_values - exact_values)**2)

def compute_scalability(graph_sizes, runtimes, memory_usages):
    """
    Compute scalability as the change in runtime and memory usage per unit increase in graph size.
    Returns a tuple (slope_runtime, slope_memory).

    Uses a simple linear regression (via numpy.polyfit).
    """
    graph_sizes = np.array(graph_sizes, dtype=float)
    runtimes = np.array(runtimes, dtype=float)
    memory_usages = np.array(memory_usages, dtype=float)

    slope_runtime, _ = np.polyfit(graph_sizes, runtimes, 1)
    slope_memory, _ = np.polyfit(graph_sizes, memory_usages, 1)
    return slope_runtime, slope_memory

## 3. Load and merge data

In [7]:
# Modify these file paths to match your environment
approx_file_hed   = "/home/mfilippov/ged_data/results/gedlib/PROTEINS/PROTEINS_HED_results.xlsx"
approx_file_ipfp  = "/home/mfilippov/ged_data/results/gedlib/PROTEINS/PROTEINS_IPFP_results.xlsx"
approx_file_simgnn= "/home/mfilippov/ged_data/results/neural/PROTEINS/performance_130325.xlsx"
exact_file        = "/home/mfilippov/ged_data/results/exact_ged/PROTEINS/merged/results.xlsx"

df_hed    = load_approx_results(approx_file_hed)
df_ipfp   = load_approx_results(approx_file_ipfp)
df_simgnn = load_approx_results(approx_file_simgnn)

df_exact  = load_exact_results(exact_file)

# Merge each approximate DataFrame with the exact GED DataFrame
# We'll keep track of which method each row belongs to so we can unify the data for plotting.
df_hed_merged = pd.merge(df_hed, df_exact, on=['graph_id_1','graph_id_2'], how='left')
df_ipfp_merged = pd.merge(df_ipfp, df_exact, on=['graph_id_1','graph_id_2'], how='left')
df_simgnn_merged = pd.merge(df_simgnn, df_exact, on=['graph_id_1','graph_id_2'], how='left')

# Combine all in one DataFrame for convenience if needed
df_all = pd.concat([df_hed_merged, df_ipfp_merged, df_simgnn_merged], ignore_index=True)

# Clean out any rows with missing critical fields
df_all.dropna(subset=['ged_exact','ged','method'], inplace=True)

## 4. Example: Compute or update metrics as needed

In [8]:
# If the columns 'accuracy', 'absolute_error', or 'squared_error' in the loaded data
# are not what you want, or you want to re-compute them, you can do that here:
df_all['absolute_error'] = abs(df_all['ged'] - df_all['ged_exact'])
df_all['squared_error']  = (df_all['ged'] - df_all['ged_exact'])**2
df_all['accuracy']       = df_all.apply(
    lambda row: compute_relative_accuracy(row['ged'], row['ged_exact']),
    axis=1
)

# We might also unify a "graph_size" for easy filtering/plotting
# e.g. we consider the average size of the two graphs in a pair:
df_all['graph_size'] = (df_all['graph1_n'] + df_all['graph2_n']) / 2.0

# Similarly for "graph_density":
df_all['graph_density'] = (df_all['graph1_density'] + df_all['graph2_density']) / 2.0


## 5. Interactive Plotting

In [9]:
def plot_accuracy_vs_size(df, node_range=(0,150)):
    """
    Plots accuracy vs. average graph size.
    Allows filtering by a node range (e.g. <50, 50-100, >100).
    """
    # Filter data
    df_filtered = df[(df['graph_size'] >= node_range[0]) & (df['graph_size'] <= node_range[1])]

    # Create high-resolution plot
    plt.figure(figsize=(10, 6), dpi=120)

    # We'll plot each method separately
    methods = df_filtered['method'].unique()
    for m in methods:
        sub = df_filtered[df_filtered['method'] == m]
        plt.scatter(sub['graph_size'], sub['accuracy'], label=m, alpha=0.7)

    plt.title(f"Accuracy vs. Graph Size\nNode range: {node_range}")
    plt.xlabel("Average Graph Size (# nodes)")
    plt.ylabel("Relative Accuracy")
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_runtime_vs_size(df, node_range=(0,150)):
    """
    Plots runtime vs. average graph size.
    """
    df_filtered = df[(df['graph_size'] >= node_range[0]) & (df['graph_size'] <= node_range[1])]

    plt.figure(figsize=(10, 6), dpi=120)

    methods = df_filtered['method'].unique()
    for m in methods:
        sub = df_filtered[df_filtered['method'] == m]
        plt.scatter(sub['graph_size'], sub['runtime'], label=m, alpha=0.7)

    plt.title(f"Runtime vs. Graph Size\nNode range: {node_range}")
    plt.xlabel("Average Graph Size (# nodes)")
    plt.ylabel("Runtime (s)")
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_accuracy_vs_density(df, density_range=(0,1.0)):
    """
    Plots accuracy vs. average graph density.
    """
    df_filtered = df[(df['graph_density'] >= density_range[0]) & (df['graph_density'] <= density_range[1])]

    plt.figure(figsize=(10, 6), dpi=120)

    methods = df_filtered['method'].unique()
    for m in methods:
        sub = df_filtered[df_filtered['method'] == m]
        plt.scatter(sub['graph_density'], sub['accuracy'], label=m, alpha=0.7)

    plt.title(f"Accuracy vs. Graph Density\nDensity range: {density_range}")
    plt.xlabel("Average Graph Density")
    plt.ylabel("Relative Accuracy")
    plt.legend()
    plt.grid(True)
    plt.show()


def plot_error_vs_runtime(df):
    """
    Trade-off plot: average error (distance from exact GED) vs. average runtime, by method.
    """
    plt.figure(figsize=(10, 6), dpi=120)

    methods = df['method'].unique()
    # We'll compute average error, average runtime by method
    for m in methods:
        sub = df[df['method'] == m]
        avg_error = sub['absolute_error'].mean()
        avg_runtime = sub['runtime'].mean()
        plt.scatter(avg_runtime, avg_error, label=m, s=100)  # bigger marker

    plt.title("Trade-off: Average Error vs. Average Runtime")
    plt.xlabel("Average Runtime (s)")
    plt.ylabel("Average Absolute Error")
    plt.legend()
    plt.grid(True)
    plt.show()

## 6. Add interactive controls (ipywidgets)

In [10]:
node_slider = widgets.IntRangeSlider(
    value=[0, 150],
    min=0,
    max=300,
    step=1,
    description='Node range:',
    continuous_update=False
)

density_slider = widgets.FloatRangeSlider(
    value=[0.0, 1.0],
    min=0.0,
    max=2.0,  # set upper bound as needed
    step=0.01,
    description='Density range:',
    continuous_update=False
)

# We'll define interactive wrappers so the user can adjust node/density range:
@interact(node_range=node_slider)
def interactive_accuracy_vs_size(node_range):
    return plot_accuracy_vs_size(df_all, node_range=node_range)

@interact(node_range=node_slider)
def interactive_runtime_vs_size(node_range):
    return plot_runtime_vs_size(df_all, node_range=node_range)

@interact(density_range=density_slider)
def interactive_accuracy_vs_density(density_range):
    return plot_accuracy_vs_density(df_all, density_range=density_range)


# The trade-off plot might not require a range slider:
plot_error_vs_runtime(df_all)

interactive(children=(IntRangeSlider(value=(0, 150), continuous_update=False, description='Node range:', max=3…

<Figure size 1200x720 with 0 Axes>

interactive(children=(IntRangeSlider(value=(0, 150), continuous_update=False, description='Node range:', max=3…

interactive(children=(FloatRangeSlider(value=(0.0, 1.0), continuous_update=False, description='Density range:'…

KeyError: 'runtime'

<Figure size 1200x720 with 0 Axes>