In [5]:
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install ipywidgets

import os
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [6]:
def load_approx_results(file_path, fill_n_density=None):
    """
    Load approximate GED results from an XLSX file.

    Parameters
    ----------
    file_path : str
        Workbook to read with ``pd.read_excel``.
    fill_n_density : pandas.DataFrame, optional
        Reference frame keyed by ``graph_id_1`` / ``graph_id_2`` that carries
        ``graph1_n``, ``graph2_n``, ``graph1_density`` and ``graph2_density``.
        Missing values of those columns in the loaded sheet are filled from it.
        The reference frame is never modified.

    Returns
    -------
    pandas.DataFrame
        Rows with both graph IDs present, IDs as strings, known metric columns
        coerced to numeric (unparseable cells become NaN).
    """
    key_cols = ['graph_id_1', 'graph_id_2']

    df = pd.read_excel(file_path).dropna(how='all')

    # Drop rows without a graph pair BEFORE the string cast: afterwards a missing
    # ID is the literal text 'nan' and dropna() can no longer see it.
    df = df.dropna(subset=key_cols)
    # Convert graph IDs to string to ensure consistent dtype across result files.
    df = df.assign(**{c: df[c].astype(str) for c in key_cols})

    numeric_cols = [
        'ged', 'accuracy', 'absolute_error', 'squared_error',
        'runtime', 'memory_usage_mb',
        'graph1_n', 'graph1_density', 'graph2_n', 'graph2_density'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if fill_n_density is not None:
        needed_cols = ['graph1_n', 'graph2_n', 'graph1_density', 'graph2_density']
        available = [c for c in needed_cols if c in fill_n_density.columns]

        if available:
            # Work on a private copy so the caller's frame keeps its dtypes.
            ref = fill_n_density[key_cols + available].dropna(subset=key_cols)
            ref = ref.assign(**{c: ref[c].astype(str) for c in key_cols})
            # One row per graph pair (first non-null value per column); duplicate
            # keys would otherwise multiply rows in the left merge below.
            ref = ref.groupby(key_cols, as_index=False, sort=False).first()

            merged = df.merge(ref, how='left', on=key_cols, suffixes=('', '_ref'))
            for c in available:
                ref_col = f"{c}_ref"
                # When the sheet had no column `c`, the merge already brought the
                # reference values in under the plain name and there is no suffix.
                if ref_col in merged.columns:
                    merged[c] = merged[c].fillna(merged[ref_col])
                    merged = merged.drop(columns=ref_col)
            df = merged

    return df

def load_exact_results(file_path):
    """
    Load exact GED results from an XLSX file.

    Adds a ``ged_exact`` column:
      * the midpoint ``(min_ged + max_ged) / 2`` when both bounds are present
        (equal to either bound when they coincide);
      * ``min_ged`` when only the lower bound is present.
    Rows without a graph pair or without a usable ``ged_exact`` are dropped.

    Parameters
    ----------
    file_path : str
        Workbook to read with ``pd.read_excel``; must contain ``graph_id_1``,
        ``graph_id_2``, ``min_ged`` and ``max_ged``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``min_ged`` or ``max_ged`` is missing from the sheet.
    """
    key_cols = ['graph_id_1', 'graph_id_2']
    bound_cols = ['min_ged', 'max_ged']

    df = pd.read_excel(file_path).dropna(how='all')

    missing = [c for c in bound_cols if c not in df.columns]
    if missing:
        raise KeyError(f"{file_path}: missing required column(s) {missing}")

    # Drop rows without a graph pair BEFORE the string cast: afterwards a missing
    # ID is the literal text 'nan' and dropna() can no longer see it.
    df = df.dropna(subset=key_cols)
    df = df.assign(
        **{c: df[c].astype(str) for c in key_cols},
        **{c: pd.to_numeric(df[c], errors='coerce') for c in bound_cols},
    )

    # Vectorised instead of a row-wise apply(): faster, and well defined on an
    # empty frame (apply(axis=1) hands back a DataFrame there, not a Series).
    lower, upper = df['min_ged'], df['max_ged']
    both_known = lower.notna() & upper.notna()
    df['ged_exact'] = ((lower + upper) / 2.0).where(both_known, lower)

    return df.dropna(subset=['ged_exact'])


def compute_relative_accuracy(ged_approx, ged_exact):
    """
    Relative accuracy of an approximate GED against the exact value.

    Returns ``1 - |ged_approx - ged_exact| / ged_exact``, clamped to 0 when the
    approximation is at least twice the exact value (so the score never goes
    negative for over-estimates).

    Special cases:
      * either argument is ``None``            -> 0
      * ``ged_exact == 0`` and exact match     -> 1.0 (perfect prediction)
      * ``ged_exact == 0`` and any other value -> 0 (relative error undefined)
    NaN inputs propagate to a NaN result.
    """
    # Must be checked first: comparing None with a number raises TypeError.
    if ged_approx is None or ged_exact is None:
        return 0
    # Avoid dividing by zero, and do not let the ">= 2x" clamp below turn a
    # perfect 0-vs-0 prediction into a score of 0.
    if ged_exact == 0:
        return 1.0 if ged_approx == 0 else 0
    if ged_approx >= ged_exact * 2:
        return 0
    return 1 - abs(ged_approx - ged_exact) / ged_exact


In [7]:
# Root folder that holds every result workbook. Override it without editing the
# notebook by setting the GED_RESULTS_ROOT environment variable.
RESULTS_ROOT = os.environ.get("GED_RESULTS_ROOT", r"C:\project_data\results")
DATASET_NAMES = ("AIDS", "IMDB-BINARY", "PROTEINS")

# Keep the separator style of the configured root so the default (Windows)
# layout yields exactly the paths that used to be spelled out by hand.
_PATH_SEP = "\\" if "\\" in RESULTS_ROOT else os.sep


def _result_path(*parts):
    """Join path components below RESULTS_ROOT."""
    return _PATH_SEP.join((RESULTS_ROOT,) + parts)


# dataset name -> {algorithm name -> workbook path}; "Exact" is the ground truth.
datasets = {
    name: {
        "HED":    _result_path("gedlib", name, f"{name}_HED_results.xlsx"),
        "IPFP":   _result_path("gedlib", name, f"{name}_IPFP_results.xlsx"),
        "SimGNN": _result_path("neural", name, "performance.xlsx"),
        "Exact":  _result_path("exact_ged", name, "merged", "results.xlsx"),
    }
    for name in DATASET_NAMES
}

In [8]:
data = {}

for dset, paths in datasets.items():
    # The unfilled SimGNN sheet serves as the size/density reference that every
    # approximate result (SimGNN itself included, for consistency) is filled from.
    df_simgnn = load_approx_results(paths["SimGNN"])
    df_exact = load_exact_results(paths["Exact"])

    approx_by_algo = {
        algo: load_approx_results(paths[algo], fill_n_density=df_simgnn)
        for algo in ("HED", "IPFP", "SimGNN")
    }

    # Keep only graph pairs that also have an exact GED (inner join on the pair).
    merged_by_algo = {
        algo: pd.merge(
            frame, df_exact,
            on=['graph_id_1', 'graph_id_2'],
            how='inner',
            suffixes=('', '_exact'),
        )
        for algo, frame in approx_by_algo.items()
    }

    # Derive error / accuracy / size metrics on each merged frame, in place.
    for df_approx in merged_by_algo.values():
        if 'ged_exact' not in df_approx.columns or 'ged' not in df_approx.columns:
            continue
        deviation = df_approx['ged'] - df_approx['ged_exact']
        df_approx['absolute_error'] = abs(deviation)
        df_approx['squared_error'] = deviation ** 2
        df_approx['accuracy'] = df_approx.apply(
            lambda row: compute_relative_accuracy(row['ged'], row['ged_exact'])
                        if pd.notnull(row['ged_exact']) else np.nan,
            axis=1
        )
        # Pair-level size and density are the averages over the two graphs.
        df_approx['graph_size'] = (df_approx['graph1_n'] + df_approx['graph2_n']) / 2.0
        df_approx['graph_density'] = (df_approx['graph1_density'] + df_approx['graph2_density']) / 2.0

    data[dset] = merged_by_algo


In [23]:
# Code Block 1: Descriptive Statistics Functions

import numpy as np
import pandas as pd
from scipy import stats

def compute_mae(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Absolute Error between ``pred_col`` and ``true_col``.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    deviation = df[pred_col] - df[true_col]
    return deviation.abs().mean()

def compute_mse(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Squared Error between ``pred_col`` and ``true_col``.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    deviation = df[pred_col] - df[true_col]
    return (deviation ** 2).mean()

def compute_median_metrics(df):
    """
    Medians of accuracy, runtime, absolute error and squared error.

    A key is only present when its source data is: the error medians use the
    precomputed ``absolute_error`` / ``squared_error`` columns when they exist
    and otherwise fall back to ``ged`` vs ``ged_exact``.

    Returns a dict with a subset of the keys ``median_accuracy``,
    ``median_runtime``, ``median_MAE`` and ``median_MSE``.
    """
    cols = df.columns
    can_derive_errors = 'ged' in cols and 'ged_exact' in cols
    medians = {}

    for source, key in (('accuracy', 'median_accuracy'), ('runtime', 'median_runtime')):
        if source in cols:
            medians[key] = df[source].median()

    if 'absolute_error' in cols:
        medians['median_MAE'] = df['absolute_error'].median()
    elif can_derive_errors:
        medians['median_MAE'] = (df['ged'] - df['ged_exact']).abs().median()

    if 'squared_error' in cols:
        medians['median_MSE'] = df['squared_error'].median()
    elif can_derive_errors:
        medians['median_MSE'] = ((df['ged'] - df['ged_exact']) ** 2).median()

    return medians

def runtime_frequency(df, bins=10):
    """
    Histogram of the non-missing ``runtime`` values.

    Returns ``(bin_edges, counts)`` as produced by ``np.histogram`` (note the
    swapped order), or ``(None, None)`` when ``df`` has no ``runtime`` column.
    """
    if 'runtime' not in df.columns:
        return None, None
    observed = df['runtime'].dropna()
    histogram = np.histogram(observed, bins=bins)
    return histogram[1], histogram[0]

def descriptive_stats(df):
    """
    Summarise the key metrics of one result frame.

    Returned keys (each group only when its source columns exist):
      - ``MAE_mean``, ``MAE_median``, ``MSE_mean``, ``MSE_median``: computed on
        rows where ``ged`` and ``ged_exact`` are present and
        ``ged < 2 * ged_exact``.
      - ``accuracy_mean``, ``accuracy_median``: in percent, over rows whose
        accuracy is not exactly 0.
      - ``runtime_mean``.
      - ``graph_size_mean``.
    """
    cols = df.columns
    summary = {}

    # GED error metrics.
    if 'ged' in cols and 'ged_exact' in cols:
        predicted, exact = df['ged'], df['ged_exact']
        usable = predicted.notna() & exact.notna() & (predicted < exact * 2)
        deviation = predicted[usable] - exact[usable]
        abs_err = deviation.abs()
        sq_err = deviation ** 2

        summary['MAE_mean'] = abs_err.mean()
        summary['MAE_median'] = abs_err.median()
        summary['MSE_mean'] = sq_err.mean()
        summary['MSE_median'] = sq_err.median()

    # Accuracy statistics, reported as percentages.
    if 'accuracy' in cols:
        accuracy = df['accuracy']
        scored = accuracy[accuracy != 0]
        summary['accuracy_mean'] = scored.mean() * 100
        summary['accuracy_median'] = scored.median() * 100

    # Runtime and graph size: mean only.
    for source, key in (('runtime', 'runtime_mean'), ('graph_size', 'graph_size_mean')):
        if source in cols:
            summary[key] = df[source].mean()

    return summary


In [25]:
# Code Block 2: Aggregating Statistics, Printing the Results, and Saving to Excel

import pandas as pd

# One summary row per (dataset, algorithm) pair.
# 'data' is the dictionary built earlier that maps dataset names to a dict
# of DataFrames for "HED", "IPFP", and "SimGNN".
results_list = []
for dset, algos in data.items():
    for algo, df in algos.items():
        # Descriptive statistics first, medians second (later keys win on clash).
        combined_stats = {**descriptive_stats(df), **compute_median_metrics(df)}
        combined_stats['Dataset'] = dset
        combined_stats['Algorithm'] = algo
        results_list.append(combined_stats)

# Create a DataFrame to display the results.
results_df = pd.DataFrame(results_list)

# Show 'Dataset' and 'Algorithm' first. reindex() keeps this working even when
# no rows were collected and the identifying columns do not exist yet.
id_cols = ['Dataset', 'Algorithm']
cols_order = id_cols + [col for col in results_df.columns if col not in id_cols]
results_df = results_df.reindex(columns=cols_order)

# Print the results table.
print("Descriptive Statistics Table:")
print(results_df)

# Save the results table to an Excel file.
output_file = r"C:\project_data\results\analysis\statistics_analysis_results.xlsx"
# to_excel() does not create missing folders; make sure the target exists.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
results_df.to_excel(output_file, index=False)
print(f"\nResults saved to {output_file}")

# Runtime frequency distribution per dataset/algorithm.
# NOTE(review): the heading below still announces a regression analysis, but
# this cell only computes the runtime histogram — confirm whether one is missing.
print("\nAdditional Analyses (Runtime Frequency and Regression Analysis):")
for dset, algos in data.items():
    for algo, df in algos.items():
        print(f"\n--- {dset} - {algo} ---")

        bin_edges, counts = runtime_frequency(df)
        if bin_edges is None:
            print("Runtime frequency distribution: Data not available.")
            continue
        print("Runtime Frequency Distribution:")
        print("Bin Edges:", bin_edges)
        print("Counts:", counts)


Descriptive Statistics Table:
       Dataset Algorithm   MAE_mean  MAE_median     MSE_mean   MSE_median  \
0         AIDS       HED  10.710920    4.500000   373.416978    20.250000   
1         AIDS      IPFP  11.461439    7.275000   329.065834    52.925625   
2         AIDS    SimGNN  10.638789    2.524324   633.160019     6.372210   
3  IMDB-BINARY       HED  53.922779   47.000000  4059.400631  2209.000000   
4  IMDB-BINARY      IPFP  41.040693   36.000000  3061.714286  1296.000000   
5  IMDB-BINARY    SimGNN  43.960600   36.983870  2991.955262  1367.806628   
6     PROTEINS       HED  43.988287   33.500000  3436.040700  1122.250000   
7     PROTEINS      IPFP  40.423695   29.000000  3454.564257   841.000000   
8     PROTEINS    SimGNN  47.749011   39.115412  3536.471128  1530.015466   

   accuracy_mean  accuracy_median  runtime_mean  graph_size_mean  \
0      44.637909        41.666667      3.059070        29.000000   
1      54.097316        51.107143      0.057332        29.00000

In [26]:
# Group the per-dataset frames by algorithm: {algorithm -> [frame per dataset]}.
method_data = {}
for dset, algos in data.items():
    for algo, df in algos.items():
        method_data.setdefault(algo, []).append(df)

# Pool every dataset of an algorithm into one frame and summarise it.
print("Overall Statistics per Algorithm:")
for algo, dfs in method_data.items():
    combined_df = pd.concat(dfs, ignore_index=True)
    overall_stats = descriptive_stats(combined_df)
    print(f"\nAlgorithm: {algo}")
    for key in overall_stats:
        value = overall_stats[key]
        print(f"{key}: {value}")

Overall Statistics per Algorithm:

Algorithm: HED
MAE_mean: 48.364164420485174
MAE_median: 40.5
MSE_mean: 3664.9065869272235
MSE_median: 1640.25
accuracy_mean: 38.21868400828582
accuracy_median: 32.14285714285714
runtime_mean: 0.9125057429693403
graph_size_mean: 29.159307778069255

Algorithm: IPFP
MAE_mean: 38.526677735057675
MAE_median: 29.0
MSE_mean: 3064.369436167424
MSE_median: 841.0
accuracy_mean: 54.818076576951135
accuracy_median: 55.90275590551181
runtime_mean: 0.7197468748895979
graph_size_mean: 24.024723709695795

Algorithm: SimGNN
MAE_mean: 40.432714612807075
MAE_median: 32.5645583593405
MSE_mean: 2832.611941546251
MSE_median: 1060.4504611388934
accuracy_mean: 31.63372777460078
accuracy_median: 23.88185965448112
runtime_mean: 0.0039047157109765905
graph_size_mean: 16.078317748469434


In [34]:
# For each algorithm, compute Pearson and Spearman correlations per dataset and overall across the three datasets
import numpy as np
import pandas as pd
from scipy import stats

def _ged_correlations(df):
    """
    Pearson and Spearman correlation between ``ged`` and ``ged_exact``.

    Only rows where both values are present are used. Returns ``(nan, nan)``
    when fewer than two such rows exist or when either column is constant:
    the coefficients are undefined there, and handing such input to scipy
    only produces a warning plus NaN.
    """
    valid = df[df['ged'].notna() & df['ged_exact'].notna()]
    if (
        len(valid) < 2
        or valid['ged'].nunique() < 2
        or valid['ged_exact'].nunique() < 2
    ):
        return np.nan, np.nan
    pearson_corr, _ = stats.pearsonr(valid['ged'], valid['ged_exact'])
    spearman_corr, _ = stats.spearmanr(valid['ged'], valid['ged_exact'])
    return pearson_corr, spearman_corr


# Dataset-level correlations: one row per (dataset, algorithm).
correlation_results = []
for dset, algos in data.items():
    for algo, df in algos.items():
        pearson_corr, spearman_corr = _ged_correlations(df)
        correlation_results.append({
            'Dataset': dset,
            'Algorithm': algo,
            'Pearson': pearson_corr,
            'Spearman': spearman_corr
        })

dataset_corr_df = pd.DataFrame(correlation_results)
print("Dataset-Level Correlations:")
print(dataset_corr_df)

# Overall correlations: pool all datasets of an algorithm before correlating.
method_data = {}
for dset, algos in data.items():
    for algo, df in algos.items():
        method_data.setdefault(algo, []).append(df)

print("\nOverall Correlations per Algorithm:")
overall_results = []
for algo, dfs in method_data.items():
    combined_df = pd.concat(dfs, ignore_index=True)
    pearson_corr, spearman_corr = _ged_correlations(combined_df)
    overall_results.append({
        'Algorithm': algo,
        'Pearson': pearson_corr,
        'Spearman': spearman_corr
    })

overall_corr_df = pd.DataFrame(overall_results)
print(overall_corr_df)

  pearson_corr, _ = stats.pearsonr(valid_df['ged'], valid_df['ged_exact'])
  spearman_corr, _ = stats.spearmanr(valid_df['ged'], valid_df['ged_exact'])


Dataset-Level Correlations:
       Dataset Algorithm   Pearson  Spearman
0         AIDS       HED -0.017702  0.016701
1         AIDS      IPFP  0.016252 -0.007989
2         AIDS    SimGNN  0.976259  0.387280
3  IMDB-BINARY       HED       NaN       NaN
4  IMDB-BINARY      IPFP  0.177652  0.250381
5  IMDB-BINARY    SimGNN  0.621308  0.627997
6     PROTEINS       HED -0.006096  0.065098
7     PROTEINS      IPFP -0.216342 -0.170353
8     PROTEINS    SimGNN  0.906692  0.879707

Overall Correlations per Algorithm:
  Algorithm   Pearson  Spearman
0       HED -0.010129 -0.079907
1      IPFP  0.037261  0.340787
2    SimGNN  0.725411  0.775921


In [33]:
import pandas as pd
import numpy as np

def compute_fraction_within_margin(df, margin=0.1):
    """
    Compute the fraction of GED predictions within a relative margin of the true GED.

    A prediction is considered within margin if:
      |ged - ged_exact| / ged_exact <= margin
    For ``ged_exact == 0`` the relative error is undefined, so such a row counts
    as within margin only when the prediction is exactly 0 as well.

    Rows where ``ged`` or ``ged_exact`` is missing are ignored; returns NaN when
    no row remains.
    """
    valid_df = df[(df['ged'].notna()) & (df['ged_exact'].notna())]
    if len(valid_df) == 0:
        return np.nan

    exact = valid_df['ged_exact']
    abs_error = (valid_df['ged'] - exact).abs()

    # Divide only where it is defined; x/0 would give inf (or NaN for 0/0) and
    # silently count a perfect prediction of a zero distance as a miss.
    nonzero = exact != 0
    relative_error = abs_error[nonzero] / exact[nonzero]
    hits = (relative_error <= margin).sum() + (abs_error[~nonzero] == 0).sum()
    return hits / len(valid_df)

# Group the frames of the global dictionary `data` by algorithm.
method_data = {}
for dset, algos in data.items():
    for algo, df in algos.items():
        method_data.setdefault(algo, []).append(df)

# Relative tolerances to report: 5%, 10% and 20%.
margins = [0.05, 0.10, 0.20]

print("Fraction of predictions within margin for each algorithm:")
for algo in method_data:
    # Pool all datasets of this algorithm before counting hits.
    dfs = method_data[algo]
    combined_df = pd.concat(dfs, ignore_index=True)
    print(f"\nAlgorithm: {algo}")
    for margin in margins:
        fraction = compute_fraction_within_margin(combined_df, margin=margin)
        print(f"Fraction within {int(margin*100)}% margin: {fraction:.2%}")

Fraction of predictions within margin for each algorithm:

Algorithm: HED
Fraction within 5% margin: 0.78%
Fraction within 10% margin: 1.58%
Fraction within 20% margin: 3.33%

Algorithm: IPFP
Fraction within 5% margin: 1.96%
Fraction within 10% margin: 3.74%
Fraction within 20% margin: 6.95%

Algorithm: SimGNN
Fraction within 5% margin: 1.36%
Fraction within 10% margin: 2.91%
Fraction within 20% margin: 5.62%
