In [1]:
# Environment setup (Colab): install PyTorch Geometric and its compiled
# extensions, KarateClub, and the scientific Python stack.

# Extension wheels are taken from the PyG index for torch-2.2.0+cu121.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-sparse  -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-geometric

# KarateClub and the scientific stack are installed with pip's legacy resolver.
!pip install --use-deprecated=legacy-resolver karateclub networkx numpy pandas matplotlib scikit-learn

# NOTE(review): torch is installed here, AFTER the extension wheels above were
# selected for torch-2.2.0+cu121 -- confirm the resulting torch version matches.
!pip install torch torchvision torchaudio
# NOTE(review): torch-geometric was already installed above; this repeats the
# install using a wheel index derived from the currently installed torch version.
!pip install torch-geometric \
    -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html


# NOTE(review): karateclub is installed a second time below (this time with the
# default resolver); none of these installs pin a version.
!pip install optuna
!pip install karateclub

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_scatter-2.1.2%2Bpt22cu121-cp312-cp312-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt22cu121
Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_sparse-0.6.18%2Bpt22cu121-cp312-cp312-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt22cu121
Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━

In [2]:
import matplotlib.pyplot as plt
import os, time, copy
import numpy as np
import pandas as pd
import torch
import joblib
import networkx as nx

from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv, global_add_pool
import torch.nn as nn
import torch.nn.functional as F

from karateclub import Graph2Vec, NetLSD
from sklearn.metrics import accuracy_score

  import torch_geometric.typing
  import torch_geometric.typing


In [3]:
# Mount Google Drive and set up the directories used by this notebook.
from google.colab import drive

drive.mount('/content/drive')

# Read-only inputs produced by the classification notebook.
CLASS_DIR = "/content/drive/MyDrive/InformationSystems/Classification"

# Outputs of this stability study.
BASE_DIR = "/content/drive/MyDrive/InformationSystems/Stability"
STAB_RESULTS_DIR = f"{BASE_DIR}/results"
PLOTS_DIR = f"{BASE_DIR}/plots"

# Create the output tree up front (no-op when the folders already exist).
for _out_dir in (BASE_DIR, STAB_RESULTS_DIR, PLOTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)

Mounted at /content/drive
Device: cuda


In [34]:
# Dataset
def load_dataset(name, root="/content/data"):
    """Load the TU dataset called ``name`` and report how many graphs it has.

    Args:
        name: Dataset name passed to ``TUDataset``.
        root: Directory passed to ``TUDataset`` as its ``root``.

    Returns:
        The constructed ``TUDataset`` instance.
    """
    dataset = TUDataset(name=name, root=root)
    print(f"Loaded {name}: {len(dataset)} graphs")
    return dataset


def get_valid_indices(ds):
    """Return the positions of the graphs in ``ds`` that have more than two nodes.

    Entries whose ``num_nodes`` is falsy (``None`` or ``0``) are skipped.
    """
    valid = []
    for idx in range(len(ds)):
        if not ds[idx].num_nodes:
            continue
        if ds[idx].num_nodes > 2:
            valid.append(idx)
    return valid

In [33]:
# Graph Perturbations

def perturb_edges(data, pct, seed):
    """
    Randomly remove a fraction of edges and add the same number of random edges.

    Each column of ``edge_index`` is treated as one (directed) edge.
    ``int(pct * E)`` columns are dropped uniformly at random, then random
    non-self-loop edges are inserted until the edge count is back to ``E``
    (or to the largest count the graph can hold, ``n * (n - 1)``, if that is
    smaller). Inserted edges are added in both directions when there is room.

    Args:
        data: Graph object exposing ``edge_index`` (a ``[2, E]`` tensor) and
            ``num_nodes``. It is never modified.
        pct: Fraction of edges to replace, in ``[0, 1]``.
        seed: Seed for the NumPy random generator (results are reproducible).

    Returns:
        A NEW deep-copied object. When no edge is selected for removal
        (``int(pct * E) == 0``, which includes empty graphs) the copy is
        returned untouched; otherwise its ``edge_index`` is a ``[2, E']`` long
        tensor of unique edges sorted by (source, target), on the same device
        as the input ``edge_index``.

    Raises:
        ValueError: If ``pct`` is outside ``[0, 1]``.
    """
    if not 0.0 <= pct <= 1.0:
        raise ValueError(f"pct must be in [0, 1], got {pct}")

    rng = np.random.default_rng(seed)
    d = copy.deepcopy(data)

    ei = d.edge_index.cpu().numpy().T
    E = ei.shape[0]
    n_remove = int(pct * E)

    # Nothing to perturb: keep the copy exactly as it is. This also keeps the
    # [2, 0] shape of an empty edge_index instead of collapsing it to 1-D.
    if n_remove == 0:
        return d

    keep = rng.choice(E, size=E - n_remove, replace=False)
    edges = set(map(tuple, ei[keep].tolist()))

    # Cap the target so the refill loop always terminates, even when the input
    # contains duplicate edges or the graph is too small to hold E distinct
    # non-self-loop edges.
    num_nodes = int(d.num_nodes)
    target = min(E, num_nodes * (num_nodes - 1))

    while len(edges) < target:
        u = int(rng.integers(0, num_nodes))
        v = int(rng.integers(0, num_nodes))
        if u == v:
            continue
        edges.add((u, v))
        # Add the reverse direction only while there is room, so the edge
        # count never overshoots the target.
        if len(edges) < target:
            edges.add((v, u))

    # NOTE(review): per-edge attributes (e.g. an ``edge_attr`` field), if the
    # object carries any, are NOT realigned with the new edge_index -- confirm
    # that downstream code ignores them.
    new_ei = np.array(sorted(edges), dtype=np.int64).reshape(-1, 2)
    d.edge_index = (
        torch.from_numpy(new_ei).t().contiguous().to(data.edge_index.device)
    )
    return d


def shuffle_node_features(data, seed):
    """Return a copy of ``data`` whose rows of ``x`` are randomly permuted.

    The permutation is drawn from a NumPy generator seeded with ``seed``.
    When ``data.x`` is ``None`` there is nothing to shuffle and the SAME
    object (not a copy) is returned.
    """
    if data.x is not None:
        shuffled = copy.deepcopy(data)
        order = np.random.default_rng(seed).permutation(shuffled.x.size(0))
        shuffled.x = shuffled.x[order]
        return shuffled
    return data



In [32]:
# Load Artifacts

def load_baseline_embeddings(method, dataset, exp):
    """Load the stored embeddings and labels of one classification experiment.

    Reads ``embeddings.npy`` and ``labels.npy`` from
    ``{CLASS_DIR}/embeddings/{method}/{dataset}/{exp}``.

    Returns:
        Tuple ``(embeddings, labels)`` of the two loaded arrays.
    """
    artifact_dir = f"{CLASS_DIR}/embeddings/{method}/{dataset}/{exp}"
    embeddings = np.load(f"{artifact_dir}/embeddings.npy")
    labels = np.load(f"{artifact_dir}/labels.npy")
    return embeddings, labels


def load_svm(method, dataset, exp):
    """Load the classifier stored at ``{CLASS_DIR}/models/{method}_SVM_{dataset}_{exp}.joblib``.

    Returns whatever object ``joblib.load`` reconstructs from that file.
    """
    # joblib.load unpickles the file: only use it on trusted artifacts.
    return joblib.load(f"{CLASS_DIR}/models/{method}_SVM_{dataset}_{exp}.joblib")


def load_karate_model(method, dataset, exp):
    """Load the model stored at ``{CLASS_DIR}/models/{method}_{dataset}_{exp}.joblib``.

    Returns whatever object ``joblib.load`` reconstructs from that file.
    """
    # joblib.load unpickles the file: only use it on trusted artifacts.
    return joblib.load(f"{CLASS_DIR}/models/{method}_{dataset}_{exp}.joblib")

In [35]:
class GINClassifier(nn.Module):
    """Graph classifier: stacked GINConv layers, sum pooling, two-layer head.

    Each GIN layer wraps a ``Linear -> ReLU -> Linear`` MLP and is followed by
    batch norm, ReLU and dropout. Node states are summed per graph with
    ``global_add_pool`` and passed through ``fc1`` (ReLU + dropout) and ``fc2``.

    Args:
        in_dim: Input node-feature size (used by the first layer only).
        hidden_dim: Width of every hidden layer.
        num_layers: Number of GIN layers.
        num_classes: Size of the output logits.
        dropout: Dropout probability, active only in training mode.
    """

    def __init__(self, in_dim, hidden_dim, num_layers, num_classes, dropout):
        super().__init__()
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.dropout = dropout

        # The first layer consumes raw node features; later ones hidden states.
        width_in = in_dim
        for _ in range(num_layers):
            inner_mlp = nn.Sequential(
                nn.Linear(width_in, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
            )
            self.convs.append(GINConv(inner_mlp))
            self.bns.append(nn.BatchNorm1d(hidden_dim))
            width_in = hidden_dim

        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """Return ``(logits, graph_embedding)`` for a batch of graphs.

        ``graph_embedding`` is the output of ``fc1`` after ReLU and dropout,
        i.e. the representation fed to the final linear layer.
        """
        node_state = x
        for layer_idx in range(len(self.convs)):
            node_state = self.convs[layer_idx](node_state, edge_index)
            node_state = self.bns[layer_idx](node_state)
            node_state = F.relu(node_state)
            node_state = F.dropout(
                node_state, p=self.dropout, training=self.training
            )

        pooled = global_add_pool(node_state, batch)
        hidden = F.dropout(
            F.relu(self.fc1(pooled)), p=self.dropout, training=self.training
        )
        logits = self.fc2(hidden)
        return logits, hidden


def load_gin_model(dataset, exp, device):
    """Rebuild a trained ``GINClassifier`` from its checkpoint, in eval mode.

    The checkpoint at ``{CLASS_DIR}/models/GIN_{dataset}_{exp}.pth`` supplies
    both the constructor hyper-parameters and the ``state_dict``.

    Args:
        dataset: Dataset name used in the checkpoint file name.
        exp: Experiment identifier used in the checkpoint file name.
        device: Device the checkpoint is mapped to and the model is moved to.

    Returns:
        The model with weights loaded, switched to ``eval()`` and on ``device``.
    """
    checkpoint_path = f"{CLASS_DIR}/models/GIN_{dataset}_{exp}.pth"
    ckpt = torch.load(checkpoint_path, map_location=device)

    # Constructor arguments, in the positional order GINClassifier expects.
    hyper_keys = (
        "num_node_features",
        "hidden_dim",
        "num_layers",
        "num_classes",
        "dropout",
    )
    model = GINClassifier(*(ckpt[key] for key in hyper_keys))
    model.load_state_dict(ckpt["state_dict"])
    model.eval()
    model.to(device)
    return model

In [36]:
# Embeddings

def pyg_to_nx(d):
    """Convert a graph with ``num_nodes`` / ``edge_index`` to an undirected ``nx.Graph``.

    Nodes ``0 .. num_nodes - 1`` are always added, so nodes without edges are
    kept. Every column of ``edge_index`` becomes one edge.
    """
    graph = nx.Graph()
    graph.add_nodes_from(range(d.num_nodes))
    sources, targets = d.edge_index.cpu().numpy()
    graph.add_edges_from(zip(sources, targets))
    return graph


def compute_embeddings_fixed(method, graphs, dataset, exp):
    """Return graph embeddings for ``graphs`` under a previously trained model.

    * ``graph2vec`` / ``netlsd`` (case-insensitive): the stored baseline
      embeddings are returned as-is, to preserve the embedding space, because
      the KarateClub methods do not support stable out-of-sample inference.
      NOTE(review): ``graphs`` is therefore ignored for these methods, so the
      result is identical for every perturbation level.
    * ``gin``: the saved GIN model is loaded and run over ``graphs`` in batches
      of 128; the second output of the model (its graph-level hidden vector)
      is collected.

    Raises:
        ValueError: For any other ``method``.
    """
    key = method.lower()

    if key == "graph2vec" or key == "netlsd":
        baseline_emb, _labels = load_baseline_embeddings(method, dataset, exp)
        return baseline_emb

    if key != "gin":
        raise ValueError(method)

    model = load_gin_model(dataset, exp, DEVICE)
    chunks = []
    with torch.no_grad():
        for batch in DataLoader(graphs, batch_size=128, shuffle=False):
            batch = batch.to(DEVICE)
            _logits, hidden = model(batch.x, batch.edge_index, batch.batch)
            chunks.append(hidden.cpu().numpy())
    return np.vstack(chunks)

In [37]:
# Stability Metrics

def embedding_change(a, b):
    """Summarise the row-wise change between two embedding matrices.

    Both inputs are L2-normalised per row first (a ``1e-12`` term guards
    against division by zero). For each pair of corresponding rows the cosine
    similarity and the Euclidean distance of the normalised vectors are
    computed.

    Returns:
        Dict with ``mean_cosine``, ``std_cosine``, ``mean_l2`` and ``std_l2``.
    """
    def _unit_rows(matrix):
        row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / (row_norms + 1e-12)

    a_hat = _unit_rows(a)
    b_hat = _unit_rows(b)

    cosine = (a_hat * b_hat).sum(axis=1)
    distance = np.linalg.norm(a_hat - b_hat, axis=1)

    stats = {}
    stats["mean_cosine"] = cosine.mean()
    stats["std_cosine"] = cosine.std()
    stats["mean_l2"] = distance.mean()
    stats["std_l2"] = distance.std()
    return stats

In [38]:
# Plots
def plot_stability_curves(df, method, dataset):
    """Save the stability figures of one (method, dataset) pair.

    Rows of ``df`` matching ``method`` and ``dataset`` are averaged per
    ``perturb_pct`` (typically across seeds) and two PNGs are written to
    ``{PLOTS_DIR}/{method}/{dataset}``:

    * ``cosine_stability.png`` -- mean of ``mean_cosine`` with its standard
      deviation across rows as error bars. Skipped when no finite cosine value
      exists for this pair (the figure would be blank).
    * ``accuracy_drop.png`` -- mean ``acc_drop``.

    ``perturb_pct`` is stored as a fraction; it is multiplied by 100 so the
    x-axis really is in percent, as its label says.

    Args:
        df: Results frame with columns ``method``, ``dataset``,
            ``perturb_pct``, ``mean_cosine`` and ``acc_drop``.
        method: Method name to select.
        dataset: Dataset name to select.
    """
    subset = df[(df["method"] == method) & (df["dataset"] == dataset)]
    if subset.empty:
        print(f"No stability results for {method} / {dataset}; skipping plots.")
        return

    grouped = subset.groupby("perturb_pct").agg(
        mean_cosine=("mean_cosine", "mean"),
        std_cosine=("mean_cosine", "std"),
        acc_drop=("acc_drop", "mean"),
    ).reset_index()

    # Stored values are fractions (0.05 == 5 %).
    level_percent = grouped["perturb_pct"] * 100.0

    plot_dir = os.path.join(PLOTS_DIR, method, dataset)
    os.makedirs(plot_dir, exist_ok=True)

    # --- Cosine similarity plot ---
    if grouped["mean_cosine"].notna().any():
        fig, ax = plt.subplots()
        ax.errorbar(
            level_percent,
            grouped["mean_cosine"],
            # std is NaN when only a single row exists for a level.
            yerr=grouped["std_cosine"].fillna(0.0),
            marker="o",
            capsize=4,
        )
        ax.set_xlabel("Perturbation level (%)")
        ax.set_ylabel("Mean cosine similarity")
        ax.set_title(f"{method} – {dataset} – Embedding Stability")
        ax.grid(True)
        fig.savefig(
            os.path.join(plot_dir, "cosine_stability.png"), bbox_inches="tight"
        )
        plt.close(fig)
    else:
        print(f"No cosine data for {method} / {dataset}; skipping cosine plot.")

    # --- Accuracy drop plot ---
    fig, ax = plt.subplots()
    ax.plot(level_percent, grouped["acc_drop"], marker="o")
    ax.set_xlabel("Perturbation level (%)")
    ax.set_ylabel("Accuracy drop")
    ax.set_title(f"{method} – {dataset} – Accuracy Degradation")
    ax.grid(True)
    fig.savefig(os.path.join(plot_dir, "accuracy_drop.png"), bbox_inches="tight")
    plt.close(fig)

    print(f"Saved plots to {plot_dir}")



In [39]:
# Method comparison plots

def plot_multi_method(df, dataset, metric, ylabel, fname,
                      methods=("Graph2Vec", "NetLSD", "GIN")):
    """Plot one metric against the perturbation level for several methods.

    For every method in ``methods`` the rows of ``df`` for ``dataset`` are
    grouped by ``perturb_pct`` and the mean of ``metric`` is drawn with its
    standard deviation as error bars. Methods that have no rows, or only NaN
    values for ``metric``, are skipped so they do not produce empty legend
    entries. The figure is saved to ``{PLOTS_DIR}/comparison/{dataset}/{fname}``.

    Args:
        df: Results frame with columns ``dataset``, ``method``,
            ``perturb_pct`` and ``metric``.
        dataset: Dataset name to select.
        metric: Name of the column to plot.
        ylabel: Y-axis label, also used in the title.
        fname: Output file name.
        methods: Methods to compare; defaults to the three used in this study.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    drew_any = False

    for method in methods:
        sub = (
            df[(df["dataset"] == dataset) & (df["method"] == method)]
            .groupby("perturb_pct")
            .agg(mean=(metric, "mean"), std=(metric, "std"))
            .reset_index()
        )
        if sub.empty or not sub["mean"].notna().any():
            continue

        ax.errorbar(
            sub["perturb_pct"],
            sub["mean"],
            # std is NaN when only a single row exists for a level.
            yerr=sub["std"].fillna(0.0),
            marker="o",
            capsize=4,
            label=method,
        )
        drew_any = True

    ax.set_xlabel("Perturbation level")
    ax.set_ylabel(ylabel)
    ax.set_title(f"{dataset} – {ylabel}")
    if drew_any:
        # legend() warns when there are no labelled artists.
        ax.legend()
    ax.grid(True)

    out_dir = f"{PLOTS_DIR}/comparison/{dataset}"
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(f"{out_dir}/{fname}", bbox_inches="tight")
    plt.close(fig)

In [53]:
def ensure_node_features(graphs, feat_dim):
    """
    Give every feature-less graph a constant all-ones feature matrix.

    Graphs whose ``x`` is ``None`` are deep-copied and the copy receives a
    float tensor of ones with shape ``(num_nodes, feat_dim)``; ``feat_dim``
    must match the dimensionality used during GIN classification. Graphs that
    already have features are passed through as the same object.

    Returns:
        A new list, in the same order as ``graphs``.
    """
    def _with_features(graph):
        if graph.x is not None:
            return graph
        patched = copy.deepcopy(graph)
        patched.x = torch.ones((patched.num_nodes, feat_dim), dtype=torch.float)
        return patched

    return [_with_features(graph) for graph in graphs]

In [60]:
def _gin_predict(model, graphs, batch_size=128):
    """Return the predicted class index of every graph under a GIN model."""
    preds = []
    with torch.no_grad():
        for batch in DataLoader(graphs, batch_size=batch_size, shuffle=False):
            batch = batch.to(DEVICE)
            logits, _ = model(batch.x, batch.edge_index, batch.batch)
            preds.append(logits.argmax(dim=1).cpu().numpy())
    return np.concatenate(preds)


def run_stability(
    method,
    dataset,
    seed,
    perturb_levels=(0.0, 0.05, 0.10, 0.20)
):
    """Measure how one trained method reacts to graph perturbations.

    For each level in ``perturb_levels`` every graph of ``dataset`` gets its
    edges perturbed and its node features shuffled, embeddings are recomputed
    with ``compute_embeddings_fixed`` and the classifier accuracy is compared
    with the accuracy on the unperturbed data. One row per level is appended
    to ``{STAB_RESULTS_DIR}/stability_results.csv`` as soon as it is available.

    Args:
        method: ``"Graph2Vec"``, ``"NetLSD"`` or ``"GIN"`` (case-insensitive).
        dataset: TU dataset name.
        seed: Seed of the run; ``str(seed)`` is also the experiment id used to
            locate the saved classification artifacts.
        perturb_levels: Fractions of edges to perturb.

    Raises:
        ValueError: If ``method`` is not one of the supported methods.

    NOTE(review): node features are shuffled at every level, including 0.0,
    so the 0.0 row is not a fully unperturbed reference -- confirm intent.
    NOTE(review): rows are always appended, so re-running the same
    configuration duplicates its rows in the CSV.
    """
    method_key = method.lower()
    if method_key not in {"graph2vec", "netlsd", "gin"}:
        # Fail before any file is touched instead of later on a missing
        # artifact or an undefined variable.
        raise ValueError(method)
    is_gin = method_key == "gin"

    experiment_id = str(seed)
    ds = load_dataset(dataset)

    # IMPORTANT: use exactly the same graphs as in classification
    graphs = [ds[i] for i in range(len(ds))]

    gin_feat_dim = None
    gin_model = None
    if is_gin:
        # The checkpoint records the feature size the model was trained with.
        ckpt = torch.load(
            f"{CLASS_DIR}/models/GIN_{dataset}_{experiment_id}.pth",
            map_location=DEVICE,
        )
        gin_feat_dim = ckpt["num_node_features"]
        graphs = ensure_node_features(graphs, feat_dim=gin_feat_dim)

    emb_orig, y = load_baseline_embeddings(method, dataset, experiment_id)

    # Reference accuracy on the unperturbed data.
    if is_gin:
        gin_model = load_gin_model(dataset, experiment_id, DEVICE)
        acc_orig = accuracy_score(y, _gin_predict(gin_model, graphs))
    else:
        svm = load_svm(method, dataset, experiment_id)
        acc_orig = accuracy_score(y, svm.predict(emb_orig))

    out = f"{STAB_RESULTS_DIR}/stability_results.csv"
    # Column order matters: rows are appended to an existing CSV without header.
    change_keys = ("mean_cosine", "std_cosine", "mean_l2", "std_l2")

    for p in perturb_levels:
        pert = []
        for i, g in enumerate(graphs):
            # Per-graph seeds keep every perturbation reproducible.
            g2 = perturb_edges(g, p, seed + i)
            g2 = shuffle_node_features(g2, seed + 10000 + i)
            pert.append(g2)

        if is_gin:
            pert = ensure_node_features(pert, feat_dim=gin_feat_dim)

        emb_pert = compute_embeddings_fixed(method, pert, dataset, experiment_id)

        if is_gin:
            acc_pert = accuracy_score(y, _gin_predict(gin_model, pert))
            change = embedding_change(emb_orig, emb_pert)
        else:
            acc_pert = accuracy_score(y, svm.predict(emb_pert))
            # Embedding-change metrics are only reported for GIN.
            change = dict.fromkeys(change_keys, np.nan)

        row = {
            "method": method,
            "dataset": dataset,
            "seed": seed,
            "perturb_pct": p,
            "acc_orig": acc_orig,
            "acc_pert": acc_pert,
            "acc_drop": acc_orig - acc_pert,
        }
        row.update({key: change[key] for key in change_keys})

        # Append immediately so finished levels survive an interrupted run.
        pd.DataFrame([row]).to_csv(
            out, mode="a", header=not os.path.exists(out), index=False
        )

    print("Stability completed.")


In [61]:
DATASETS = ["ENZYMES", "MUTAG", "IMDB-MULTI"]
METHODS = ["Graph2Vec", "NetLSD", "GIN"]

# Full sweep: every dataset x method x seed combination, datasets outermost
# and seeds innermost.
_sweep = (
    (d, m, s)
    for d in DATASETS
    for m in METHODS
    for s in (42, 43, 44)
)

for dataset, method, seed in _sweep:
    print(f"[RUN] {method} | {dataset} | seed={seed}")
    run_stability(method=method, dataset=dataset, seed=seed)


[RUN] Graph2Vec | ENZYMES | seed=42
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] Graph2Vec | ENZYMES | seed=43
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] Graph2Vec | ENZYMES | seed=44
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] NetLSD | ENZYMES | seed=42
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] NetLSD | ENZYMES | seed=43
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] NetLSD | ENZYMES | seed=44
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] GIN | ENZYMES | seed=42
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] GIN | ENZYMES | seed=43
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] GIN | ENZYMES | seed=44
Loaded ENZYMES: 600 graphs
Stability completed.
[RUN] Graph2Vec | MUTAG | seed=42
Loaded MUTAG: 188 graphs
Stability completed.
[RUN] Graph2Vec | MUTAG | seed=43
Loaded MUTAG: 188 graphs
Stability completed.
[RUN] Graph2Vec | MUTAG | seed=44
Loaded MUTAG: 188 graphs
Stability completed.
[RUN] NetLSD | MUTAG | seed=42


In [62]:

# Load every stability row written so far and preview it.
df = pd.read_csv(f"{STAB_RESULTS_DIR}/stability_results.csv")
print(df.head())

# Per-method figures for each dataset
for dataset in DATASETS:
    for method in METHODS:
        plot_stability_curves(df, method, dataset)

# Cross-method comparison figures: one per (dataset, metric) pair
for dataset in DATASETS:
    for metric, ylabel, fname in (
        ("mean_cosine", "Mean cosine similarity", "embedding_stability.png"),
        ("acc_drop", "Accuracy drop", "accuracy_drop.png"),
    ):
        plot_multi_method(
            df,
            dataset,
            metric=metric,
            ylabel=ylabel,
            fname=fname,
        )

      method  dataset  seed  perturb_pct  acc_orig  acc_pert  acc_drop  \
0  Graph2Vec  ENZYMES    42         0.00  0.161937  0.161937       0.0   
1  Graph2Vec  ENZYMES    42         0.05  0.161937  0.161937       0.0   
2  Graph2Vec  ENZYMES    42         0.10  0.161937  0.161937       0.0   
3  Graph2Vec  ENZYMES    42         0.20  0.161937  0.161937       0.0   
4  Graph2Vec  ENZYMES    43         0.00  0.163606  0.163606       0.0   

   mean_cosine  std_cosine  mean_l2  std_l2  
0          NaN         NaN      NaN     NaN  
1          NaN         NaN      NaN     NaN  
2          NaN         NaN      NaN     NaN  
3          NaN         NaN      NaN     NaN  
4          NaN         NaN      NaN     NaN  
Saved plots to /content/drive/MyDrive/InformationSystems/Stability/plots/Graph2Vec/ENZYMES
Saved plots to /content/drive/MyDrive/InformationSystems/Stability/plots/NetLSD/ENZYMES
Saved plots to /content/drive/MyDrive/InformationSystems/Stability/plots/GIN/ENZYMES
Saved plots to 