In [1]:
import sys

!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install ipywidgets

import os
import pandas as pd
import numpy as np
from scipy import stats

import matplotlib.pyplot as plt
%matplotlib inline

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
def load_approx_results(file_path, fill_n_density=None):
    """
    Load approximate GED results from an XLSX file.

    Rows that are completely empty, or that lack either graph ID, are dropped.
    Graph IDs are converted to strings, and the known metric/size columns are
    coerced to numeric (unparseable cells become NaN).

    Parameters
    ----------
    file_path : str or path-like
        Workbook location, passed to ``pd.read_excel``.
    fill_n_density : pandas.DataFrame, optional
        Reference frame keyed by ``graph_id_1``/``graph_id_2`` that supplies
        ``graph1_n``, ``graph2_n``, ``graph1_density`` and ``graph2_density``.
        Values that are missing in the loaded sheet (including columns that
        are absent altogether) are taken from it. The reference frame itself
        is never modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned results with string graph IDs.
    """
    id_cols = ['graph_id_1', 'graph_id_2']

    df = pd.read_excel(file_path)
    df = df.dropna(how='all')
    # Drop rows without an ID *before* the string cast: afterwards a missing
    # ID would be an ordinary string and could no longer be detected.
    df = df.dropna(subset=id_cols).copy()
    for col in id_cols:
        df[col] = df[col].astype(str)

    numeric_cols = [
        'ged', 'accuracy', 'absolute_error', 'squared_error',
        'runtime', 'memory_usage_mb',
        'graph1_n', 'graph1_density', 'graph2_n', 'graph2_density'
    ]
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    if fill_n_density is not None:
        needed_cols = ['graph1_n', 'graph2_n', 'graph1_density', 'graph2_density']
        available = [c for c in needed_cols if c in fill_n_density.columns]

        # Work on a copy so the caller's frame keeps its original ID dtype.
        ref = fill_n_density[id_cols + available].copy()
        for col in id_cols:
            ref[col] = ref[col].astype(str)
        # One reference row per graph pair, otherwise the left merge below
        # would duplicate result rows.
        ref = ref.drop_duplicates(subset=id_cols)
        # Rename explicitly: merge suffixes only apply to overlapping names,
        # so a column missing from `df` would not get the '_ref' suffix.
        ref = ref.rename(columns={c: f"{c}_ref" for c in available})

        merged = df.merge(ref, how='left', on=id_cols)
        for c in available:
            ref_col = f"{c}_ref"
            if c in merged.columns:
                merged[c] = merged[c].fillna(merged[ref_col])
            else:
                merged[c] = merged[ref_col]
        df = merged.drop(columns=[f"{c}_ref" for c in available])

    return df

def load_exact_results(file_path):
    """
    Load exact GED results from an XLSX file.

    ``ged_exact`` is the midpoint of ``min_ged`` and ``max_ged`` when both
    bounds are present (which equals either bound when they agree). When
    ``max_ged`` is missing it falls back to ``min_ged``; when ``min_ged`` is
    missing the result is NaN. Rows that are completely empty, lack a graph
    ID, or end up without a ``ged_exact`` value are dropped.

    Parameters
    ----------
    file_path : str or path-like
        Workbook location, passed to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        The cleaned results with string graph IDs and a ``ged_exact`` column.
    """
    id_cols = ['graph_id_1', 'graph_id_2']

    df = pd.read_excel(file_path)
    df = df.dropna(how='all')
    # Drop rows without an ID *before* the string cast: afterwards a missing
    # ID would be an ordinary string and could no longer be detected.
    df = df.dropna(subset=id_cols).copy()
    for col in id_cols:
        df[col] = df[col].astype(str)

    for col in ['min_ged', 'max_ged']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # Vectorised form of the per-row rule: midpoint when both bounds exist,
    # otherwise the lower bound (which may itself be NaN).
    lower, upper = df['min_ged'], df['max_ged']
    both_known = lower.notna() & upper.notna()
    df['ged_exact'] = ((lower + upper) / 2.0).where(both_known, lower)

    return df.dropna(subset=['ged_exact'])

def compute_relative_accuracy(ged_approx, ged_exact):
    """
    Relative accuracy of one approximate GED against the exact GED.

    Returns NaN when the exact value is zero or either argument is None,
    0 when the approximation is at least twice the exact value, and
    ``1 - |ged_approx - ged_exact| / ged_exact`` otherwise.
    """
    undefined = ged_exact == 0 or ged_approx is None or ged_exact is None
    if undefined:
        return np.nan

    # Overshooting by 100% or more is clamped to zero accuracy.
    if ged_approx >= (ged_exact * 2):
        return 0

    deviation = abs(ged_approx - ged_exact)
    return 1 - deviation / ged_exact

In [3]:
# Root folder that holds every result workbook. Set the GED_RESULTS_ROOT
# environment variable to run the notebook against another location; the
# default is the original hard-coded folder.
RESULTS_ROOT = os.environ.get("GED_RESULTS_ROOT", r"C:\project_data\results")

DATASET_NAMES = ["AIDS", "IMDB-BINARY", "PROTEINS"]

# dataset name -> {result kind -> path of its workbook}
datasets = {
    name: {
        "HED":    os.path.join(RESULTS_ROOT, "gedlib", name, f"{name}_HED_results.xlsx"),
        "IPFP":   os.path.join(RESULTS_ROOT, "gedlib", name, f"{name}_IPFP_results.xlsx"),
        "SimGNN": os.path.join(RESULTS_ROOT, "neural", name, "performance.xlsx"),
        "Exact":  os.path.join(RESULTS_ROOT, "exact_ged", name, "merged", "results.xlsx"),
    }
    for name in DATASET_NAMES
}

In [4]:
def adjust_ids(df, offset=-1):
    """
    Shift the graph IDs of ``df`` by ``offset``.

    ``graph_id_1`` and ``graph_id_2`` are parsed as integers, shifted, and
    stored back as strings. With the default ``offset=-1`` every ID is
    decreased by one.

    ``df`` is modified in place and also returned, so take a copy beforehand
    if the unshifted IDs are still needed.

    Raises
    ------
    ValueError
        If an ID cannot be parsed as an integer.
    """
    for col in ['graph_id_1', 'graph_id_2']:
        df[col] = (df[col].astype(int) + offset).astype(str)
    return df

# Per-dataset results: dataset name -> {"HED" / "IPFP" / "SimGNN" -> DataFrame
# of approximate results joined with the exact GED and derived metrics}.
data = {}

for dset, paths in datasets.items():
    # SimGNN results are loaded first because they serve as the reference for
    # filling graph size/density in the HED and IPFP results below.
    df_simgnn = load_approx_results(paths["SimGNN"])
    df_exact  = load_exact_results(paths["Exact"])

    df_hed  = load_approx_results(paths["HED"], fill_n_density=df_simgnn)
    df_ipfp = load_approx_results(paths["IPFP"], fill_n_density=df_simgnn)

    # adjust_ids decreases the exact-result IDs by one and rewrites df_exact in
    # place, so an unshifted copy is taken first. The shifted frame is joined
    # with HED/IPFP, the unshifted copy with SimGNN.
    # NOTE(review): presumably the HED/IPFP sheets use IDs one lower than the
    # exact/SimGNN sheets -- confirm against the result files.
    df_exact_copy = df_exact.copy()
    df_exact = adjust_ids(df_exact)

    # Re-read the SimGNN sheet and fill it from the SimGNN frame loaded above,
    # so it goes through the same fill path as HED and IPFP.
    df_simgnn_filled = load_approx_results(paths["SimGNN"], fill_n_density=df_simgnn)

    # Inner joins: only graph pairs present in both the approximate and the
    # exact results are kept. Overlapping exact columns get the '_exact' suffix.
    df_hed  = pd.merge(df_hed,  df_exact, on=['graph_id_1','graph_id_2'], how='inner', suffixes=('', '_exact'))
    df_ipfp = pd.merge(df_ipfp, df_exact, on=['graph_id_1','graph_id_2'], how='inner', suffixes=('', '_exact'))
    df_simgnn_filled = pd.merge(df_simgnn_filled, df_exact_copy, on=['graph_id_1','graph_id_2'], how='inner', suffixes=('', '_exact'))

    # Compute per-pair error metrics against the exact GED, overwriting any
    # metric columns that came from the sheets.
    for df_approx in [df_hed, df_ipfp, df_simgnn_filled]:
        if 'ged_exact' in df_approx.columns and 'ged' in df_approx.columns:
            df_approx['absolute_error'] = abs(df_approx['ged'] - df_approx['ged_exact'])
            df_approx['squared_error']  = (df_approx['ged'] - df_approx['ged_exact'])**2
            df_approx['accuracy'] = df_approx.apply(
                lambda row: compute_relative_accuracy(row['ged'], row['ged_exact'])
                            if pd.notnull(row['ged_exact']) else np.nan,
                axis=1
            )
            # Pair-level size/density: the average over the two graphs.
            df_approx['graph_size'] = (df_approx['graph1_n'] + df_approx['graph2_n']) / 2.0
            df_approx['graph_density'] = (df_approx['graph1_density'] + df_approx['graph2_density']) / 2.0

    data[dset] = {
        "HED":    df_hed,
        "IPFP":   df_ipfp,
        "SimGNN": df_simgnn_filled
    }

In [13]:
# Code Block 1: Descriptive Statistics Functions

import numpy as np
import pandas as pd
from scipy import stats

def compute_mae(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Absolute Error between ``pred_col`` and ``true_col``.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    deviation = df[pred_col] - df[true_col]
    return deviation.abs().mean()

def compute_mse(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Squared Error between ``pred_col`` and ``true_col``.

    Returns NaN when either column is absent from ``df``.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan
    deviation = df[pred_col] - df[true_col]
    return (deviation ** 2).mean()

def compute_mre(df, pred_col='ged', true_col='ged_exact'):
    """
    Mean Relative Error of ``pred_col`` with respect to ``true_col``.

    Only rows whose true value is strictly positive take part, which keeps
    the division well defined. Returns NaN when either column is absent or
    no such row exists.
    """
    if pred_col not in df.columns or true_col not in df.columns:
        return np.nan

    positive_truth = df[df[true_col] > 0]
    if positive_truth.empty:
        return np.nan

    truth = positive_truth[true_col]
    return ((positive_truth[pred_col] - truth).abs() / truth).mean()

def compute_median_metrics(df):
    """
    Medians of runtime, absolute error, squared error and relative error.

    Returns a dict that may contain ``median_runtime``, ``median_MAE``,
    ``median_MSE`` and ``median_MRE``; a key is only present when the columns
    it needs exist. Pre-computed ``absolute_error`` / ``squared_error``
    columns are preferred over recomputing from ``ged`` and ``ged_exact``.
    The relative error only uses rows with a strictly positive ``ged_exact``.
    """
    cols = df.columns
    has_ged_pair = 'ged' in cols and 'ged_exact' in cols
    metrics = {}

    if 'runtime' in cols:
        metrics['median_runtime'] = df['runtime'].median()

    if 'absolute_error' in cols:
        metrics['median_MAE'] = df['absolute_error'].median()
    elif has_ged_pair:
        metrics['median_MAE'] = (df['ged'] - df['ged_exact']).abs().median()

    if 'squared_error' in cols:
        metrics['median_MSE'] = df['squared_error'].median()
    elif has_ged_pair:
        metrics['median_MSE'] = ((df['ged'] - df['ged_exact']) ** 2).median()

    if has_ged_pair:
        positive_truth = df[df['ged_exact'] > 0]
        if len(positive_truth) > 0:
            truth = positive_truth['ged_exact']
            metrics['median_MRE'] = ((positive_truth['ged'] - truth).abs() / truth).median()

    return metrics

def runtime_frequency(df, bins=10):
    """
    Histogram of the non-missing ``runtime`` values.

    Returns ``(bin_edges, counts)`` as produced by ``np.histogram``, or
    ``(None, None)`` when ``df`` has no ``runtime`` column.
    """
    if 'runtime' not in df.columns:
        return None, None

    observed = df['runtime'].dropna()
    frequency, edges = np.histogram(observed, bins=bins)
    return edges, frequency

def descriptive_stats(df):
    """
    Summary statistics for the key metrics of one result frame.

    The returned dict contains, when the required columns exist:
      - MRE_mean / MRE_median / MRE_std: relative error over rows where both
        ``ged`` and ``ged_exact`` are present and ``ged_exact`` is positive
        (only set when at least one such row exists).
      - MAE_mean / MAE_median / MSE_mean / MSE_median: absolute and squared
        error over rows where both values are present and
        ``ged < 2 * ged_exact``.
      - runtime_mean / runtime_median / runtime_std.
      - graph_size_mean / graph_size_median / graph_size_std.
    """
    summary = {}

    if 'ged' in df.columns and 'ged_exact' in df.columns:
        both_present = df['ged'].notna() & df['ged_exact'].notna()

        # Relative error needs a positive denominator.
        positive_truth = df[both_present & (df['ged_exact'] > 0)]
        if len(positive_truth) > 0:
            rel_err = (
                (positive_truth['ged'] - positive_truth['ged_exact']).abs()
                / positive_truth['ged_exact'].abs()
            )
            summary['MRE_mean'] = rel_err.mean()
            summary['MRE_median'] = rel_err.median()
            summary['MRE_std'] = rel_err.std()

        # Absolute/squared error leave out predictions of at least twice the
        # exact value.
        bounded = df[both_present & (df['ged'] < (df['ged_exact'] * 2))]
        deviation = bounded['ged'] - bounded['ged_exact']
        abs_dev = deviation.abs()
        sq_dev = deviation ** 2

        summary['MAE_mean'] = abs_dev.mean()
        summary['MAE_median'] = abs_dev.median()
        summary['MSE_mean'] = sq_dev.mean()
        summary['MSE_median'] = sq_dev.median()

    # Runtime and graph size share the same mean/median/std triple.
    for col in ('runtime', 'graph_size'):
        if col in df.columns:
            summary[f'{col}_mean'] = df[col].mean()
            summary[f'{col}_median'] = df[col].median()
            summary[f'{col}_std'] = df[col].std()

    return summary

In [14]:
# Code Block 2: Aggregating Statistics, Printing the Results, and Saving to Excel

import os

import pandas as pd

# One record of summary statistics per (dataset, algorithm) pair.
results_list = []

# 'data' is the dictionary built earlier that maps dataset names to a dict
# of DataFrames for "HED", "IPFP", and "SimGNN".
for dset, algos in data.items():
    for algo, df in algos.items():
        stats_dict = descriptive_stats(df)
        median_metrics = compute_median_metrics(df)
        # Median metrics are merged last, so they win on any shared key.
        combined_stats = {**stats_dict, **median_metrics}
        combined_stats['Dataset'] = dset
        combined_stats['Algorithm'] = algo
        results_list.append(combined_stats)

# Build the table once from the collected records.
results_df = pd.DataFrame(results_list)

# Show 'Dataset' and 'Algorithm' first, keep the remaining column order.
cols_order = ['Dataset', 'Algorithm'] + [col for col in results_df.columns if col not in ['Dataset', 'Algorithm']]
results_df = results_df[cols_order]

# Print the results table.
print("Descriptive Statistics Table:")
print(results_df)

# Save the results table to an Excel file.
output_file = r"C:\project_data\results\analysis\statistics_analysis_results.xlsx"
# to_excel does not create missing folders, so make sure the target exists.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
results_df.to_excel(output_file, index=False)
print(f"\nResults saved to {output_file}")

# Runtime frequency distribution per dataset and algorithm. (The heading used
# to announce a regression analysis as well, but none is computed here.)
print("\nAdditional Analyses (Runtime Frequency):")
for dset, algos in data.items():
    for algo, df in algos.items():
        print(f"\n--- {dset} - {algo} ---")

        bin_edges, counts = runtime_frequency(df)
        if bin_edges is not None:
            print("Runtime Frequency Distribution:")
            print("Bin Edges:", bin_edges)
            print("Counts:", counts)
        else:
            print("Runtime frequency distribution: Data not available.")


Descriptive Statistics Table:
       Dataset Algorithm  MRE_mean  MRE_median   MRE_std   MAE_mean  \
0         AIDS       HED  1.149731    0.833333  1.137125   7.251887   
1         AIDS      IPFP  2.618761    2.166667  2.111480  12.812565   
2         AIDS    SimGNN  0.698819    0.544637  0.774850  10.099670   
3  IMDB-BINARY       HED  1.000000    1.000000  0.000000  53.922779   
4  IMDB-BINARY      IPFP  0.000226    0.000000  0.005786   0.005361   
5  IMDB-BINARY    SimGNN  0.735452    0.784702  0.408576  43.960600   
6     PROTEINS       HED  0.741790    0.776923  0.128479  47.319380   
7     PROTEINS      IPFP  0.114160    0.081967  0.120760   4.837084   
8     PROTEINS    SimGNN  0.744223    0.783402  0.120351  47.749011   

   MAE_median     MSE_mean   MSE_median  runtime_mean  runtime_median  \
0    4.125000   162.265039    17.015625      2.732141        2.397600   
1   12.000000   232.216361   144.000000      0.013975        0.005370   
2    2.641075   522.250851     6.975278 

In [8]:
# For each algorithm, compute the Pearson and Spearman correlations between predicted and exact GED, per dataset and overall across the three datasets
import numpy as np
import pandas as pd
from scipy import stats

def _pearson_spearman(df, x_col='ged', y_col='ged_exact'):
    """
    Pearson and Spearman correlation between two columns of ``df``.

    Rows with a missing value in either column are ignored. Returns
    ``(nan, nan)`` when fewer than two rows remain or when either column is
    constant: the coefficients are undefined there, and SciPy would return
    NaN with a constant-input warning.
    """
    valid = df[df[x_col].notna() & df[y_col].notna()]
    too_few = len(valid) < 2
    if too_few or valid[x_col].nunique() < 2 or valid[y_col].nunique() < 2:
        return np.nan, np.nan
    pearson_corr, _ = stats.pearsonr(valid[x_col], valid[y_col])
    spearman_corr, _ = stats.spearmanr(valid[x_col], valid[y_col])
    return pearson_corr, spearman_corr


# List to accumulate dataset-level correlation results.
correlation_results = []

# Compute per-dataset correlations.
for dset, algos in data.items():
    for algo, df in algos.items():
        pearson_corr, spearman_corr = _pearson_spearman(df)
        correlation_results.append({
            'Dataset': dset,
            'Algorithm': algo,
            'Pearson': pearson_corr,
            'Spearman': spearman_corr
        })

# Create a DataFrame from the dataset-level results.
dataset_corr_df = pd.DataFrame(correlation_results)
print("Dataset-Level Correlations:")
print(dataset_corr_df)

# Now compute overall correlations per algorithm across all datasets.
overall_results = []
# Collect every dataset's frame under its algorithm name.
method_data = {}
for dset, algos in data.items():
    for algo, df in algos.items():
        method_data.setdefault(algo, []).append(df)

print("\nOverall Correlations per Algorithm:")
for algo, dfs in method_data.items():
    # Pool the rows of all datasets before correlating.
    combined_df = pd.concat(dfs, ignore_index=True)
    pearson_corr, spearman_corr = _pearson_spearman(combined_df)
    overall_results.append({
        'Algorithm': algo,
        'Pearson': pearson_corr,
        'Spearman': spearman_corr
    })

overall_corr_df = pd.DataFrame(overall_results)
print(overall_corr_df)

Dataset-Level Correlations:
       Dataset Algorithm   Pearson  Spearman
0         AIDS       HED  0.944368  0.343902
1         AIDS      IPFP  0.979375  0.571994
2         AIDS    SimGNN  0.981745  0.389424
3  IMDB-BINARY       HED       NaN       NaN
4  IMDB-BINARY      IPFP  0.999995  0.999994
5  IMDB-BINARY    SimGNN  0.621308  0.627997
6     PROTEINS       HED  0.835628  0.802561
7     PROTEINS      IPFP  0.998009  0.994016
8     PROTEINS    SimGNN  0.906692  0.879707

Overall Correlations per Algorithm:
  Algorithm   Pearson  Spearman
0       HED  0.222073  0.035582
1      IPFP  0.985443  0.880676
2    SimGNN  0.691586  0.781774


  pearson_corr, _ = stats.pearsonr(valid_df['ged'], valid_df['ged_exact'])
  spearman_corr, _ = stats.spearmanr(valid_df['ged'], valid_df['ged_exact'])


In [9]:
import pandas as pd
import numpy as np

def compute_fraction_within_margin(df, margin=0.1):
    """
    Fraction of GED predictions within a relative margin of the true GED.

    A prediction counts as within the margin if
      |ged - ged_exact| / ged_exact <= margin
    When ``ged_exact`` is 0 the ratio is undefined; such a row counts as
    within the margin only if the prediction is exactly 0 as well.

    Rows with a missing ``ged`` or ``ged_exact`` are ignored. Returns NaN
    when no row remains.
    """
    valid_df = df[(df['ged'].notna()) & (df['ged_exact'].notna())]
    if len(valid_df) == 0:
        return np.nan

    truth = valid_df['ged_exact']
    abs_error = (valid_df['ged'] - truth).abs()
    zero_truth = truth == 0

    # Mask zero denominators so the ratio is NaN there (never <= margin) and
    # decide those rows by exact match instead.
    relative_error = abs_error / truth.where(~zero_truth)
    within = (relative_error <= margin) | (zero_truth & (abs_error == 0))
    return within.sum() / len(valid_df)

# Relative margins to report: 5%, 10% and 20%
margins = [0.05, 0.10, 0.20]

# Report the within-margin fraction for every dataset/algorithm/margin combination
print("Fraction of predictions within margin for each dataset and algorithm:")
for dset, algos in data.items():
    print(f"\nDataset: {dset}")
    for algo, df in algos.items():
        print(f"  Algorithm: {algo}")
        for margin in margins:
            fraction = compute_fraction_within_margin(df, margin=margin)
            margin_pct = int(margin*100)
            print(f"    Fraction within {margin_pct}% margin: {fraction:.2%}")

Fraction of predictions within margin for each dataset and algorithm:

Dataset: AIDS
  Algorithm: HED
    Fraction within 5% margin: 0.99%
    Fraction within 10% margin: 2.93%
    Fraction within 20% margin: 7.36%
  Algorithm: IPFP
    Fraction within 5% margin: 0.06%
    Fraction within 10% margin: 1.75%
    Fraction within 20% margin: 5.49%
  Algorithm: SimGNN
    Fraction within 5% margin: 5.24%
    Fraction within 10% margin: 10.98%
    Fraction within 20% margin: 20.39%

Dataset: IMDB-BINARY
  Algorithm: HED
    Fraction within 5% margin: 0.00%
    Fraction within 10% margin: 0.00%
    Fraction within 20% margin: 0.00%
  Algorithm: IPFP
    Fraction within 5% margin: 97.71%
    Fraction within 10% margin: 97.85%
    Fraction within 20% margin: 97.88%
  Algorithm: SimGNN
    Fraction within 5% margin: 0.68%
    Fraction within 10% margin: 1.82%
    Fraction within 20% margin: 3.64%

Dataset: PROTEINS
  Algorithm: HED
    Fraction within 5% margin: 0.31%
    Fraction within 10% mar

In [10]:
import pandas as pd
import numpy as np

# Memory-usage summary in MB: one record per dataset/algorithm pair.
memory_stats = []

# Walk every result frame in the global 'data' dictionary.
for dset, algos in data.items():
    for algo, df in algos.items():
        # Frames without a memory column report NaN for every statistic.
        mean_mem = max_mem = std_mem = np.nan
        if 'memory_usage_mb' in df.columns:
            usage = df['memory_usage_mb']
            mean_mem, max_mem, std_mem = usage.mean(), usage.max(), usage.std()

        record = {'Dataset': dset, 'Algorithm': algo}
        record['Mean Memory Usage (MB)'] = mean_mem
        record['Max Memory Usage (MB)'] = max_mem
        record['Std Memory Usage (MB)'] = std_mem
        memory_stats.append(record)

# Build the table from the collected records and show it.
memory_stats_df = pd.DataFrame(memory_stats)
print("Memory Usage Statistics:")
print(memory_stats_df)

Memory Usage Statistics:
       Dataset Algorithm  Mean Memory Usage (MB)  Max Memory Usage (MB)  \
0         AIDS       HED                0.000157               0.257812   
1         AIDS      IPFP             1566.556049            1580.175781   
2         AIDS    SimGNN                0.000102               0.070312   
3  IMDB-BINARY       HED                0.000008               0.257812   
4  IMDB-BINARY      IPFP               41.433127              47.156250   
5  IMDB-BINARY    SimGNN                0.023756               0.453125   
6     PROTEINS       HED                0.000038               0.257812   
7     PROTEINS      IPFP                0.000000               0.000000   
8     PROTEINS    SimGNN                0.001087              10.718750   

   Std Memory Usage (MB)  
0               0.006369  
1               6.157758  
2               0.002157  
3               0.001396  
4               4.092402  
5               0.050127  
6               0.003128  
7       

In [11]:
import pandas as pd
import numpy as np

def format_value(x):
    """
    Render a number as text for the memory table.

    Missing values become "NaN". Magnitudes below 1 are written as
    "<mantissa> * 10^(<exponent>)" with a one-decimal mantissa; everything
    else is shown with two decimal places.
    """
    if pd.isna(x):
        return "NaN"

    # Values of magnitude 1 or larger: plain fixed-point, two decimals.
    if not abs(x) < 1:
        return f"{x:.2f}"

    # Smaller values: split the scientific form (e.g. "1.2e-01") at the 'e'.
    mantissa, _, exponent = f"{x:.1e}".partition("e")
    return f"{mantissa} * 10^({int(exponent)})"

# Memory-usage summary in MB with formatted values: one record per dataset/algorithm pair.
memory_stats = []

# Walk every result frame in the global `data` dictionary.
for dset, algos in data.items():
    for algo, df in algos.items():
        has_memory = 'memory_usage_mb' in df.columns
        # Frames without a memory column report NaN for every statistic.
        mean_mem = df['memory_usage_mb'].mean() if has_memory else np.nan
        max_mem = df['memory_usage_mb'].max() if has_memory else np.nan
        std_mem = df['memory_usage_mb'].std() if has_memory else np.nan

        # Values are stored as display strings, not numbers.
        memory_stats.append(dict([
            ('Dataset', dset),
            ('Algorithm', algo),
            ('Mean Memory Usage (MB)', format_value(mean_mem)),
            ('Max Memory Usage (MB)', format_value(max_mem)),
            ('Std Memory Usage (MB)', format_value(std_mem)),
        ]))

# Build the table from the collected records and show it.
memory_stats_df = pd.DataFrame(memory_stats)
print("Memory Usage Statistics:")
print(memory_stats_df)

Memory Usage Statistics:
       Dataset Algorithm Mean Memory Usage (MB) Max Memory Usage (MB)  \
0         AIDS       HED          1.6 * 10^(-4)         2.6 * 10^(-1)   
1         AIDS      IPFP                1566.56               1580.18   
2         AIDS    SimGNN          1.0 * 10^(-4)         7.0 * 10^(-2)   
3  IMDB-BINARY       HED          7.6 * 10^(-6)         2.6 * 10^(-1)   
4  IMDB-BINARY      IPFP                  41.43                 47.16   
5  IMDB-BINARY    SimGNN          2.4 * 10^(-2)         4.5 * 10^(-1)   
6     PROTEINS       HED          3.8 * 10^(-5)         2.6 * 10^(-1)   
7     PROTEINS      IPFP           0.0 * 10^(0)          0.0 * 10^(0)   
8     PROTEINS    SimGNN          1.1 * 10^(-3)                 10.72   

  Std Memory Usage (MB)  
0         6.4 * 10^(-3)  
1                  6.16  
2         2.2 * 10^(-3)  
3         1.4 * 10^(-3)  
4                  4.09  
5         5.0 * 10^(-2)  
6         3.1 * 10^(-3)  
7          0.0 * 10^(0)  
8         

In [12]:
import pandas as pd
import numpy as np

# Memory-usage summary in KB: one record per dataset/algorithm pair.
memory_stats = []

# Walk every result frame in the global 'data' dictionary.
for dset, algos in data.items():
    for algo, df in algos.items():
        if 'memory_usage_mb' not in df.columns:
            # No memory column: every statistic is reported as NaN.
            mean_mem, max_mem, std_mem = np.nan, np.nan, np.nan
        else:
            usage_mb = df['memory_usage_mb']
            # MB -> KB: multiply each statistic by 1024.
            mean_mem = usage_mb.mean() * 1024
            max_mem = usage_mb.max() * 1024
            std_mem = usage_mb.std() * 1024

        record = {
            'Dataset': dset,
            'Algorithm': algo,
            'Mean Memory Usage (KB)': mean_mem,
            'Max Memory Usage (KB)': max_mem,
            'Std Memory Usage (KB)': std_mem,
        }
        memory_stats.append(record)

# Build the table from the collected records and show it.
memory_stats_df = pd.DataFrame(memory_stats)
print("Memory Usage Statistics (in KB):")
print(memory_stats_df)

Memory Usage Statistics (in KB):
       Dataset Algorithm  Mean Memory Usage (KB)  Max Memory Usage (KB)  \
0         AIDS       HED            1.611965e-01           2.639995e+02   
1         AIDS      IPFP            1.604153e+06           1.618100e+06   
2         AIDS    SimGNN            1.040513e-01           7.200000e+01   
3  IMDB-BINARY       HED            7.745100e-03           2.639995e+02   
4  IMDB-BINARY      IPFP            4.242752e+04           4.828800e+04   
5  IMDB-BINARY    SimGNN            2.432612e+01           4.640000e+02   
6     PROTEINS       HED            3.885774e-02           2.639995e+02   
7     PROTEINS      IPFP            0.000000e+00           0.000000e+00   
8     PROTEINS    SimGNN            1.113335e+00           1.097600e+04   

   Std Memory Usage (KB)  
0               6.521985  
1            6305.544152  
2               2.208633  
3               1.429931  
4            4190.620095  
5              51.330161  
6               3.202720  
