In [1]:
# PyG extension wheels taken from the index built for torch 2.2.0 + CUDA 12.1.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-sparse  -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-geometric

# Remaining analysis dependencies, resolved with pip's deprecated legacy resolver.
!pip install --use-deprecated=legacy-resolver karateclub networkx numpy pandas matplotlib scikit-learn

# NOTE(review): torch is (re)installed unpinned here, after the extensions above
# were fetched for torch 2.2.0+cu121 -- confirm the resulting versions still match.
!pip install torch torchvision torchaudio
# Second torch-geometric install; the wheel index URL is derived from the
# torch version that is importable at this point.
!pip install torch-geometric \
    -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html


# NOTE(review): karateclub is already requested by the legacy-resolver line
# above, so the install below looks redundant -- confirm before removing.
!pip install optuna
!pip install karateclub

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_scatter-2.1.2%2Bpt22cu121-cp312-cp312-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt22cu121
Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_sparse-0.6.18%2Bpt22cu121-cp312-cp312-linux_x86_64.whl (5.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt22cu121
Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━

In [2]:
import os, time, copy
import numpy as np
import pandas as pd
import torch
import joblib
import networkx as nx

from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GINConv, global_add_pool
import torch.nn as nn
import torch.nn.functional as F

from karateclub import Graph2Vec, NetLSD
from sklearn.metrics import accuracy_score

  import torch_geometric.typing
  import torch_geometric.typing


In [3]:
# Mount Google Drive so that saved artifacts can be read and results written.
from google.colab import drive
drive.mount('/content/drive')

# CLASS_DIR holds the artifacts read by the loaders in this notebook;
# BASE_DIR is where this notebook writes its own results and plots.
CLASS_DIR = "/content/drive/MyDrive/InformationSystems/Classification"
BASE_DIR = "/content/drive/MyDrive/InformationSystems/Stability"
RESULTS_DIR = f"{BASE_DIR}/results"
PLOTS_DIR = f"{BASE_DIR}/plots"

# Create the output tree up front; existing directories are left untouched.
for _out_dir in (BASE_DIR, RESULTS_DIR, PLOTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

Mounted at /content/drive
Device: cpu


In [4]:
# Dataset
def load_dataset(name, root="/content/data"):
    """Load a TUDataset by name and report how many graphs it contains.

    Args:
        name: Dataset name passed to ``TUDataset``.
        root: Directory passed to ``TUDataset`` as its ``root``.

    Returns:
        The constructed ``TUDataset`` instance.
    """
    dataset = TUDataset(name=name, root=root)
    n_graphs = len(dataset)
    print(f"Loaded {name}: {n_graphs} graphs")
    return dataset


def get_valid_indices(ds):
    """Return the positions of all graphs in ``ds`` that have more than two nodes.

    Each element is indexed exactly once; the previous version evaluated
    ``ds[i]`` twice per graph, doubling the per-item access cost.

    Args:
        ds: Indexable, sized collection whose items expose ``num_nodes``.

    Returns:
        List of integer indices ``i`` for which ``ds[i].num_nodes`` is set
        and strictly greater than 2.
    """
    valid = []
    for i in range(len(ds)):
        n_nodes = ds[i].num_nodes
        # A missing (None) or zero node count is treated as invalid.
        if n_nodes and n_nodes > 2:
            valid.append(i)
    return valid

In [5]:
# Graph Perturbations

def perturb_edges(data, pct, seed):
    """
    Randomly remove a fraction of edges and add random new edges in their place.

    ``int(pct * E)`` columns of ``edge_index`` are dropped uniformly at random
    (``E`` = number of columns). Random node pairs ``u != v`` are then inserted
    in both directions until the number of distinct edges is back at the
    number of distinct edges of the input.

    Args:
        data: Graph object exposing ``edge_index`` (2 x E tensor) and
            ``num_nodes``. It is deep-copied, never modified.
        pct: Fraction of edge columns to remove (``0.0`` keeps all edges).
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        A NEW object (deep copy of ``data``) whose ``edge_index`` is a
        ``(2, E')`` long tensor on the same device as the input, with columns
        in sorted order. ``E'`` can exceed the target by one because new edges
        are inserted as ``(u, v)`` / ``(v, u)`` pairs.

    Notes:
        Only ``edge_index`` is rewritten. NOTE(review): any per-edge attribute
        carried by ``data`` (e.g. ``edge_attr``) is copied as-is and will no
        longer line up with the new edges -- confirm callers do not rely on it.
    """
    rng = np.random.default_rng(seed)
    d = copy.deepcopy(data)

    ei = d.edge_index.cpu().numpy().T
    E = ei.shape[0]
    # Number of distinct edges in the input; duplicate columns must not
    # trigger additions (previously they did, even for pct == 0).
    n_unique = len(set(map(tuple, ei.tolist())))
    n_remove = int(pct * E)

    if n_remove > 0:
        keep = rng.choice(E, size=E - n_remove, replace=False)
        ei = ei[keep]

    edges = set(map(tuple, ei.tolist()))

    # Upper bound on the edge-set size reachable by adding u != v pairs: every
    # ordered non-loop pair plus whatever self-loops survived removal. Capping
    # the target guarantees the loop below terminates.
    n_nodes = int(d.num_nodes or 0)
    n_self_loops = sum(1 for u, v in edges if u == v)
    capacity = n_nodes * (n_nodes - 1) + n_self_loops
    target = min(n_unique, capacity)

    while len(edges) < target:
        u = int(rng.integers(0, n_nodes))
        v = int(rng.integers(0, n_nodes))
        if u != v:
            edges.add((u, v))
            edges.add((v, u))

    # reshape(-1, 2) keeps the (2, 0) layout when no edges remain.
    d.edge_index = (
        torch.tensor(sorted(edges), dtype=torch.long, device=d.edge_index.device)
        .reshape(-1, 2)
        .t()
        .contiguous()
    )
    return d


def shuffle_node_features(data, seed):
    """Return a copy of ``data`` whose node-feature rows are randomly permuted.

    Args:
        data: Graph object with an ``x`` attribute (node feature matrix or None).
        seed: Seed for ``numpy.random.default_rng``.

    Returns:
        ``data`` itself (not a copy) when ``data.x`` is None; otherwise a deep
        copy whose ``x`` rows are reordered by a seeded random permutation.
    """
    if data.x is None:
        return data

    shuffled = copy.deepcopy(data)
    n_rows = shuffled.x.size(0)
    order = np.random.default_rng(seed).permutation(n_rows)
    shuffled.x = shuffled.x[order]
    return shuffled



In [6]:
# Load Artifacts

def load_baseline_embeddings(method, dataset, exp):
    """Read the saved baseline embeddings and labels for one experiment.

    Args:
        method: Method name used as a path component.
        dataset: Dataset name used as a path component.
        exp: Experiment identifier used as a path component.

    Returns:
        Tuple ``(embeddings, labels)`` loaded with ``np.load`` from
        ``embeddings.npy`` and ``labels.npy`` in the experiment's
        ``baseline`` directory under ``CLASS_DIR``.
    """
    baseline_dir = f"{CLASS_DIR}/embeddings/{method}/{dataset}/{exp}/baseline"
    embeddings = np.load(f"{baseline_dir}/embeddings.npy")
    labels = np.load(f"{baseline_dir}/labels.npy")
    return embeddings, labels


def load_svm(method, dataset, exp):
    """Load the saved SVM artifact for a method/dataset/experiment triple.

    The file is ``{method}_SVM_{dataset}_{exp}.joblib`` under
    ``CLASS_DIR/models``. ``joblib.load`` unpickles the file, so only load
    artifacts from a trusted location.

    Returns:
        Whatever object ``joblib.load`` restores from that file.
    """
    svm_file = f"{CLASS_DIR}/models/{method}_SVM_{dataset}_{exp}.joblib"
    classifier = joblib.load(svm_file)
    return classifier


def load_karate_model(method, dataset, exp):
    """Load the saved embedding-model artifact for one experiment.

    The file is ``{method}_{dataset}_{exp}.joblib`` under ``CLASS_DIR/models``.
    ``joblib.load`` unpickles the file, so only load artifacts from a trusted
    location.

    Returns:
        Whatever object ``joblib.load`` restores from that file.
    """
    model_file = f"{CLASS_DIR}/models/{method}_{dataset}_{exp}.joblib"
    embedder = joblib.load(model_file)
    return embedder

In [7]:
class GINEncoderClassifier(nn.Module):
    """GIN graph encoder with a linear classification head.

    Stacks ``num_layers`` ``GINConv`` layers, each wrapping a
    Linear -> ReLU -> Linear MLP and followed by batch norm, ReLU and dropout.
    Node states are sum-pooled per graph and the pooled vector is passed to a
    linear classifier.

    Args:
        in_dim: Input feature size of the first layer.
        hidden_dim: Width of every GIN layer and of the pooled embedding.
        num_layers: Number of GIN layers.
        num_classes: Output size of the classifier.
        dropout: Dropout probability applied after each layer.
    """

    def __init__(self, in_dim, hidden_dim, num_layers, num_classes, dropout):
        super().__init__()
        self.convs = nn.ModuleList()
        self.bns = nn.ModuleList()
        self.dropout = dropout

        for depth in range(num_layers):
            # Only the first layer consumes the raw node features.
            width_in = hidden_dim if depth > 0 else in_dim
            update_mlp = nn.Sequential(
                nn.Linear(width_in, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
            )
            self.convs.append(GINConv(update_mlp))
            self.bns.append(nn.BatchNorm1d(hidden_dim))

        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """Return ``(logits, graph_embedding)`` for a batch of graphs.

        Args:
            x: Node feature tensor fed to the first GIN layer.
            edge_index: Edge index tensor passed to every ``GINConv``.
            batch: Node-to-graph assignment passed to ``global_add_pool``.
        """
        h = x
        for gin_layer, norm in zip(self.convs, self.bns):
            h = gin_layer(h, edge_index)
            h = norm(h)
            h = F.relu(h)
            h = F.dropout(h, p=self.dropout, training=self.training)

        graph_emb = global_add_pool(h, batch)
        logits = self.classifier(graph_emb)
        return logits, graph_emb


def load_gin_model(dataset, exp, device):
    """Rebuild a saved GIN model from its checkpoint, in eval mode on ``device``.

    The checkpoint ``GIN_{dataset}_{exp}.pth`` under ``CLASS_DIR/models`` is
    read with ``torch.load``. It must provide the architecture entries
    ``num_node_features``, ``hidden_dim``, ``num_layers``, ``num_classes`` and
    ``dropout``, plus the weights under ``state_dict``.

    Args:
        dataset: Dataset name used in the checkpoint file name.
        exp: Experiment identifier used in the checkpoint file name.
        device: Device used as ``map_location`` and as the model's target.

    Returns:
        A ``GINEncoderClassifier`` with loaded weights, switched to eval mode
        and moved to ``device``.
    """
    ckpt_path = f"{CLASS_DIR}/models/GIN_{dataset}_{exp}.pth"
    checkpoint = torch.load(ckpt_path, map_location=device)

    # Constructor arguments, in the positional order the class expects.
    arch_keys = ("num_node_features", "hidden_dim", "num_layers", "num_classes", "dropout")
    arch_args = [checkpoint[key] for key in arch_keys]

    net = GINEncoderClassifier(*arch_args)
    net.load_state_dict(checkpoint["state_dict"])
    net.eval()
    net.to(device)
    return net

In [8]:
# Embeddings

def pyg_to_nx(d):
    """Convert a graph object with ``edge_index`` into an undirected NetworkX graph.

    Nodes ``0 .. d.num_nodes - 1`` are always created, so isolated nodes are
    kept. Every column of ``d.edge_index`` becomes one edge; because the
    result is an ``nx.Graph``, both directions of a pair collapse into one edge.

    Args:
        d: Object exposing ``num_nodes`` and a ``edge_index`` tensor.

    Returns:
        The resulting ``nx.Graph``; node features are not transferred.
    """
    graph = nx.Graph()
    graph.add_nodes_from(range(d.num_nodes))
    edge_rows = d.edge_index.cpu().numpy().T  # one (u, v) row per edge column
    graph.add_edges_from(edge_rows)
    return graph


def compute_embeddings_fixed(method, graphs, dataset, exp):
    """Embed ``graphs`` with the previously saved model for ``method``.

    Args:
        method: ``"Graph2Vec"``, ``"NetLSD"`` or ``"GIN"`` (case-insensitive
            for dispatch; passed unchanged to the artifact loaders).
        graphs: Sequence of graph objects to embed.
        dataset: Dataset name identifying the saved model.
        exp: Experiment identifier identifying the saved model.

    Returns:
        2-D numpy array with one embedding row per input graph.

    Raises:
        ValueError: If ``method`` is not one of the supported names.

    Notes:
        For the karateclub methods the loaded, already-fitted model is kept
        fixed whenever it offers ``infer``: re-fitting on the perturbed graphs
        would learn a new embedding space, so the result could not be compared
        with the baseline embeddings or scored by a classifier trained on them.
        Models without ``infer`` are re-fitted as before; for Graph2Vec this
        fallback emits a warning because the comparison is then unreliable.
        NOTE(review): the refit fallback is assumed harmless for NetLSD
        (no state learned across graphs) -- confirm against karateclub.
    """
    key = method.lower()

    if key in {"graph2vec", "netlsd"}:
        model = load_karate_model(method, dataset, exp)
        nx_graphs = [pyg_to_nx(g) for g in graphs]

        # Prefer inference with the frozen model over fitting a new one.
        infer = getattr(model, "infer", None)
        if callable(infer):
            return np.asarray(infer(nx_graphs))

        if key == "graph2vec":
            import warnings

            warnings.warn(
                "Loaded Graph2Vec model has no infer(); re-fitting on the "
                "perturbed graphs. The embeddings are not in the baseline "
                "embedding space.",
                RuntimeWarning,
                stacklevel=2,
            )
        model.fit(nx_graphs)
        return model.get_embedding()

    if key == "gin":
        model = load_gin_model(dataset, exp, DEVICE)
        loader = DataLoader(graphs, batch_size=128, shuffle=False)
        embs = []
        with torch.no_grad():
            for batch in loader:
                batch = batch.to(DEVICE)
                # The model returns (logits, graph_embedding); keep the embedding.
                _, g = model(batch.x, batch.edge_index, batch.batch)
                embs.append(g.cpu().numpy())
        return np.vstack(embs)

    raise ValueError(method)

In [9]:
# Stability Metrics

def embedding_change(a, b):
    """Summarise the row-wise change between two embedding matrices.

    Both matrices are L2-normalised per row (with a ``1e-12`` guard against
    zero norms). The cosine similarity and the Euclidean distance are then
    computed between matching rows of the normalised matrices.

    Args:
        a: 2-D numpy array, one embedding per row.
        b: 2-D numpy array with rows matching those of ``a``.

    Returns:
        Dict with ``mean_cosine``, ``std_cosine``, ``mean_l2`` and ``std_l2``.
    """
    eps = 1e-12
    a_unit = a / (np.linalg.norm(a, axis=1, keepdims=True) + eps)
    b_unit = b / (np.linalg.norm(b, axis=1, keepdims=True) + eps)

    cosine = (a_unit * b_unit).sum(axis=1)
    distance = np.linalg.norm(a_unit - b_unit, axis=1)

    stats = {}
    stats["mean_cosine"] = cosine.mean()
    stats["std_cosine"] = cosine.std()
    stats["mean_l2"] = distance.mean()
    stats["std_l2"] = distance.std()
    return stats

In [10]:
def run_stability(
    method,
    dataset,
    experiment,
    perturb_levels=(0.0, 0.05, 0.10, 0.20),
    seeds=(42,),
):
    """Measure how embeddings and SVM accuracy react to graph perturbations.

    For every ``(perturbation level, seed)`` pair, each valid graph gets its
    edges perturbed and its node features shuffled, the graphs are re-embedded,
    and the saved SVM is scored on the new embeddings. One row per pair is
    appended to ``{RESULTS_DIR}/{method}_{dataset}_stability.csv``.

    Note that node features are shuffled at every level, including ``0.0``.

    Args:
        method: Embedding method name (used for artifact lookup and dispatch).
        dataset: Dataset name to load.
        experiment: Identifier of the experiment whose saved artifacts are used.
        perturb_levels: Edge perturbation fractions to evaluate.
        seeds: Base seeds; per-graph seeds are derived by adding the graph index.

    Returns:
        None. Results are written to the CSV file; re-running appends more rows.
    """
    ds = load_dataset(dataset)
    valid_idx = get_valid_indices(ds)
    graphs = [ds[i] for i in valid_idx]

    # Baseline: stored embeddings scored with the stored classifier.
    emb_orig, labels = load_baseline_embeddings(method, dataset, experiment)
    clf = load_svm(method, dataset, experiment)
    acc_orig = accuracy_score(labels, clf.predict(emb_orig))

    csv_path = f"{RESULTS_DIR}/{method}_{dataset}_stability.csv"

    for level in perturb_levels:
        for seed in seeds:
            # Distinct per-graph seeds for the edge and the feature perturbation.
            perturbed = [
                shuffle_node_features(
                    perturb_edges(graph, level, seed + i),
                    seed + 10000 + i,
                )
                for i, graph in zip(valid_idx, graphs)
            ]

            emb_pert = compute_embeddings_fixed(method, perturbed, dataset, experiment)
            acc_pert = accuracy_score(labels, clf.predict(emb_pert))

            record = {
                "method": method,
                "dataset": dataset,
                "experiment": experiment,
                "perturb_pct": level,
                "seed": seed,
                "acc_orig": acc_orig,
                "acc_pert": acc_pert,
                "acc_drop": acc_orig - acc_pert,
            }
            record.update(embedding_change(emb_orig, emb_pert))

            # Append immediately so finished rows survive a later failure;
            # the header is written only when the file does not exist yet.
            write_header = not os.path.exists(csv_path)
            pd.DataFrame([record]).to_csv(
                csv_path, mode="a", header=write_header, index=False
            )

    print("Stability completed.")


In [11]:
# Run the stability sweep for one method/dataset pair with the default
# perturbation levels and seed, using the artifacts of the given experiment id.
run_stability(
    method="Graph2Vec",        # Graph2Vec | NetLSD | GIN
    dataset="MUTAG",
    experiment="21012026_1714",
)

Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!


Loaded MUTAG: 188 graphs
Stability completed.
