In [13]:
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install ipywidgets

import os
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [14]:
def load_approx_results(file_path, fill_n_density=None):
    """
    Load approximate GED results from an XLSX file.

    Rows that are entirely empty, or that lack either graph ID, are dropped.
    Graph IDs are converted to strings and the known metric columns are
    coerced to numeric (unparseable values become NaN).

    Parameters
    ----------
    file_path : path of the spreadsheet passed to ``pd.read_excel``.
    fill_n_density : optional DataFrame with 'graph_id_1', 'graph_id_2' and
        any of 'graph1_n', 'graph2_n', 'graph1_density', 'graph2_density'.
        When given, missing size/density values in the loaded results are
        filled from it, matched on the graph-ID pair. The frame passed in is
        never modified.

    Returns
    -------
    DataFrame with one row per row of the loaded file that has both IDs.
    """
    key_cols = ['graph_id_1', 'graph_id_2']
    numeric_cols = [
        'ged', 'accuracy', 'absolute_error', 'squared_error',
        'runtime', 'memory_usage_mb',
        'graph1_n', 'graph1_density', 'graph2_n', 'graph2_density'
    ]
    needed_cols = ['graph1_n', 'graph2_n', 'graph1_density', 'graph2_density']

    df = pd.read_excel(file_path).dropna(how='all')
    # Drop rows with missing IDs *before* the string cast: afterwards a
    # missing ID would be a regular string and could no longer be detected.
    df = df.dropna(subset=key_cols)
    df = df.astype({col: str for col in key_cols})

    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if fill_n_density is None:
        return df

    # Only the columns the reference frame actually provides can be filled.
    available = [c for c in needed_cols if c in fill_n_density.columns]
    if not available:
        return df

    # Build the lookup table on a derived frame so the caller's DataFrame is
    # left untouched. One row per ID pair keeps the left merge from
    # multiplying result rows when the reference has conflicting duplicates.
    ref = (
        fill_n_density[key_cols + available]
        .dropna(subset=key_cols)
        .astype({col: str for col in key_cols})
        .drop_duplicates(subset=key_cols)
        .rename(columns={c: f"{c}_ref" for c in available})
    )

    merged = pd.merge(df, ref, how='left', on=key_cols)
    for c in available:
        ref_col = f"{c}_ref"
        if c in merged.columns:
            # Keep values already present; use the reference only for gaps.
            merged[c] = merged[c].fillna(merged[ref_col])
        else:
            # The loaded file had no such column at all: take it wholesale.
            merged[c] = merged[ref_col]

    return merged.drop(columns=[f"{c}_ref" for c in available])

def load_exact_results(file_path):
    """
    Load exact GED results from an XLSX file.

    Adds a 'ged_exact' column: the midpoint of 'min_ged' and 'max_ged' when
    both are present (which equals either value when they agree), otherwise
    'min_ged'. Rows that are entirely empty, lack a graph ID, or end up
    without a 'ged_exact' value are dropped. Graph IDs are returned as strings.

    Parameters
    ----------
    file_path : path of the spreadsheet passed to ``pd.read_excel``.

    Returns
    -------
    DataFrame containing the loaded columns plus 'ged_exact'.

    Raises
    ------
    KeyError
        If the file has no 'min_ged' or no 'max_ged' column.
    """
    key_cols = ['graph_id_1', 'graph_id_2']
    bound_cols = ['min_ged', 'max_ged']

    df = pd.read_excel(file_path).dropna(how='all')

    missing = [col for col in bound_cols if col not in df.columns]
    if missing:
        raise KeyError(
            f"Exact results file {file_path!r} is missing column(s): {missing}"
        )

    # Drop rows with missing IDs *before* the string cast: afterwards a
    # missing ID would be a regular string and could no longer be detected.
    df = df.dropna(subset=key_cols)
    df = df.astype({col: str for col in key_cols})

    for col in bound_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Vectorised form of the per-row rule: midpoint where both bounds are
    # known, min_ged otherwise (NaN when min_ged itself is missing).
    both_known = df['min_ged'].notna() & df['max_ged'].notna()
    midpoint = (df['min_ged'] + df['max_ged']) / 2.0
    df['ged_exact'] = df['min_ged'].where(~both_known, midpoint)

    return df.dropna(subset=['ged_exact'])


def compute_relative_accuracy(ged_approx, ged_exact):
    """
    Relative accuracy of an approximate GED against the exact GED.

    Returns min(ged_approx, ged_exact) / max(ged_approx, ged_exact) for
    scalar inputs. Returns NaN when either value is missing (None or NaN)
    or when ged_exact is 0.
    """
    # Missing values must be rejected explicitly: Python's min()/max() give
    # order-dependent results with NaN, so (5, nan) would otherwise yield 1.0.
    if pd.isna(ged_approx) or pd.isna(ged_exact) or ged_exact == 0:
        return np.nan
    return min(ged_approx, ged_exact) / max(ged_approx, ged_exact)


In [15]:
# Result-file locations, keyed by dataset name and then by result type.
# Every dataset uses the same folder layout, so the paths are generated from
# one template per result type ("{name}" is replaced by the dataset name).
path_templates = {
    "HED":    r"C:\project_data\results\gedlib\{name}\{name}_HED_results.xlsx",
    "IPFP":   r"C:\project_data\results\gedlib\{name}\{name}_IPFP_results.xlsx",
    "SimGNN": r"C:\project_data\results\neural\{name}\performance.xlsx",
    "Exact":  r"C:\project_data\results\exact_ged\{name}\merged\results.xlsx",
}

datasets = {
    name: {
        kind: template.format(name=name)
        for kind, template in path_templates.items()
    }
    for name in ("AIDS", "IMDB-BINARY", "PROTEINS")
}

In [16]:
data = {}

for dset, paths in datasets.items():
    # The plain SimGNN load serves as the size/density reference for all
    # approximate result sets of this dataset.
    df_simgnn = load_approx_results(paths["SimGNN"])
    df_exact = load_exact_results(paths["Exact"])

    # Load each approximate result set with the reference fill applied
    # (SimGNN is re-loaded and filled from itself, for consistency).
    loaded = {
        algo: load_approx_results(paths[algo], fill_n_density=df_simgnn)
        for algo in ("HED", "IPFP", "SimGNN")
    }

    # Inner join: keep only the graph pairs that also have an exact result.
    joined = {
        algo: pd.merge(
            frame, df_exact,
            on=['graph_id_1', 'graph_id_2'],
            how='inner',
            suffixes=('', '_exact'),
        )
        for algo, frame in loaded.items()
    }
    df_hed = joined["HED"]
    df_ipfp = joined["IPFP"]
    df_simgnn_filled = joined["SimGNN"]

    # Derive the per-pair metrics on each merged frame (in place).
    for df_approx in (df_hed, df_ipfp, df_simgnn_filled):
        if not {'ged_exact', 'ged'}.issubset(df_approx.columns):
            continue

        signed_error = df_approx['ged'] - df_approx['ged_exact']
        df_approx['absolute_error'] = signed_error.abs()
        df_approx['squared_error'] = signed_error ** 2
        df_approx['accuracy'] = df_approx.apply(
            lambda row: (
                compute_relative_accuracy(row['ged'], row['ged_exact'])
                if pd.notnull(row['ged_exact']) else np.nan
            ),
            axis=1,
        )
        # Pair-level size/density = average over the two graphs of the pair.
        df_approx['graph_size'] = (df_approx['graph1_n'] + df_approx['graph2_n']) / 2.0
        df_approx['graph_density'] = (
            df_approx['graph1_density'] + df_approx['graph2_density']
        ) / 2.0

    data[dset] = {
        "HED": df_hed,
        "IPFP": df_ipfp,
        "SimGNN": df_simgnn_filled,
    }


In [17]:
# Code Block 1: Descriptive Statistics Functions

import numpy as np
import pandas as pd
from scipy import stats

def compute_mae(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Absolute Error between the predicted and the reference GED column.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    return (df[pred_col] - df[true_col]).abs().mean()

def compute_mse(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Squared Error between the predicted and the reference GED column.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    difference = df[pred_col] - df[true_col]
    return (difference ** 2).mean()

def compute_median_metrics(df):
    """
    Median of accuracy, runtime, absolute error and squared error.

    Each entry is included only when its source data is available. The error
    medians use the 'absolute_error' / 'squared_error' columns when present
    and are otherwise derived from 'ged' and 'ged_exact'.
    Returns a dict with the keys 'median_accuracy', 'median_runtime',
    'median_MAE' and 'median_MSE' (any of which may be absent).
    """
    columns = df.columns
    can_derive_errors = 'ged' in columns and 'ged_exact' in columns
    metrics = {}

    # Direct column medians.
    for source, key in (('accuracy', 'median_accuracy'),
                        ('runtime', 'median_runtime')):
        if source in columns:
            metrics[key] = df[source].median()

    # Absolute error: stored column first, derived value as fallback.
    if 'absolute_error' in columns:
        metrics['median_MAE'] = df['absolute_error'].median()
    elif can_derive_errors:
        metrics['median_MAE'] = (df['ged'] - df['ged_exact']).abs().median()

    # Squared error: stored column first, derived value as fallback.
    if 'squared_error' in columns:
        metrics['median_MSE'] = df['squared_error'].median()
    elif can_derive_errors:
        metrics['median_MSE'] = ((df['ged'] - df['ged_exact']) ** 2).median()

    return metrics

def runtime_frequency(df, bins=10):
    """
    Histogram of the non-missing values in the 'runtime' column.

    ``bins`` is forwarded to ``np.histogram``.
    Returns (bin_edges, frequency_counts), or (None, None) when ``df`` has
    no 'runtime' column.
    """
    if 'runtime' not in df.columns:
        return None, None

    observed = df['runtime'].dropna()
    frequency_counts, edges = np.histogram(observed, bins=bins)
    return edges, frequency_counts

def descriptive_stats(df):
    """
    Mean, median and standard deviation of the key metrics in ``df``.

    Returns a dict whose keys are '<name>_mean', '<name>_median' and
    '<name>_std' for each available group:
      - 'MAE' and 'MSE', from the absolute / squared difference of 'ged'
        and 'ged_exact' (only when both columns exist)
      - 'accuracy', 'runtime' and 'graph_size', for each column that exists.
    """
    def summarize(label, values):
        # Three summary figures for one series, keyed by the given label.
        return {
            f"{label}_mean": values.mean(),
            f"{label}_median": values.median(),
            f"{label}_std": values.std(),
        }

    stats_dict = {}

    # GED error metrics.
    if 'ged' in df.columns and 'ged_exact' in df.columns:
        difference = df['ged'] - df['ged_exact']
        stats_dict.update(summarize('MAE', difference.abs()))
        stats_dict.update(summarize('MSE', difference ** 2))

    # Plain per-column statistics, in reporting order.
    for column in ('accuracy', 'runtime', 'graph_size'):
        if column in df.columns:
            stats_dict.update(summarize(column, df[column]))

    return stats_dict


In [18]:
# Code Block 2: Aggregating Statistics, Printing the Results, and Saving to Excel

import pandas as pd

# Build one summary record per (dataset, algorithm) pair.
# 'data' maps each dataset name to a dict of DataFrames keyed by algorithm.
results_list = []
for dset, algos in data.items():
    for algo, df in algos.items():
        # Descriptive statistics first, then the median metrics on top.
        combined_stats = dict(descriptive_stats(df))
        combined_stats.update(compute_median_metrics(df))
        combined_stats.update({'Dataset': dset, 'Algorithm': algo})
        results_list.append(combined_stats)

# Tabulate the records, with the identifying columns moved to the front.
results_df = pd.DataFrame(results_list)
leading_cols = ['Dataset', 'Algorithm']
cols_order = leading_cols + [col for col in results_df.columns if col not in leading_cols]
results_df = results_df[cols_order]

# Show the table.
print("Descriptive Statistics Table:")
print(results_df)

# Persist the table as an Excel workbook.
output_file = r"C:\project_data\results\analysis\statistics_analysis_results.xlsx"
results_df.to_excel(output_file, index=False)
print(f"\nResults saved to {output_file}")

# Per-frame runtime frequency distributions.
# NOTE(review): the heading printed below also mentions regression analysis,
# but no regression step is performed in this cell.
print("\nAdditional Analyses (Runtime Frequency and Regression Analysis):")
for dset, algos in data.items():
    for algo, df in algos.items():
        print(f"\n--- {dset} - {algo} ---")

        bin_edges, counts = runtime_frequency(df)
        if bin_edges is None:
            print("Runtime frequency distribution: Data not available.")
            continue

        print("Runtime Frequency Distribution:")
        print("Bin Edges:", bin_edges)
        print("Counts:", counts)


Descriptive Statistics Table:
       Dataset Algorithm    MAE_mean  MAE_median     MAE_std       MSE_mean  \
0         AIDS       HED   11.533584    7.750000   11.869579     273.888950   
1         AIDS      IPFP   36.071261   22.637500   43.622441    3202.862371   
2         AIDS    SimGNN    9.591837    2.814440   20.766443     523.227366   
3  IMDB-BINARY       HED   52.608168   46.000000   34.537662    3960.434431   
4  IMDB-BINARY      IPFP  119.892653   95.000000   90.888705   22633.034351   
5  IMDB-BINARY    SimGNN   42.837130   35.117490   32.635882    2900.089245   
6     PROTEINS       HED   44.055700   33.500000   38.742625    3441.821948   
7     PROTEINS      IPFP  198.323067   54.000000  338.307431  153745.640107   
8     PROTEINS    SimGNN   47.747376   39.111853   35.447956    3536.307833   

    MSE_median        MSE_std  accuracy_mean  accuracy_median  ...  \
0    60.062500     898.179628       0.424218         0.400000  ...   
1   512.457813    9196.694039       0.2