In [14]:
!pip install torch-scatter -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-sparse  -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch-geometric

!pip install --use-deprecated=legacy-resolver karateclub networkx numpy pandas matplotlib scikit-learn

!pip install torch torchvision torchaudio
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric \
    -f https://data.pyg.org/whl/torch-$(python -c "import torch; print(torch.__version__)").html


!pip install optuna
!pip install karateclub

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Looking in links: https://data.pyg.org/whl/torch-2.8.0+cu126.html
Collecting numpy<1.23.0 (from karateclub)
  Using cached numpy-1.22.4.zip (11.5 MB)
  Installing build dependencies ... [?25l[?25hdone
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Getting requirements to build wheel ... [?25l[?25herror
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
[31m│[0m exit code: [1;36m1[0m
[31m╰─>[0m See above for output.

[1;35mnote[0m: This error originates from a subprocess, and is likely not a

In [15]:
# Imports

import torch
import torch.nn.functional as F
from torch.nn import Linear, Sequential, ReLU, BatchNorm1d
from torch_geometric.nn import GINConv, global_add_pool
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from sklearn.metrics import accuracy_score, f1_score
from torch_geometric.transforms import OneHotDegree
import optuna
import pandas as pd
import time, os, psutil
from sklearn.metrics import roc_auc_score
import numpy as np
from karateclub import NetLSD, Graph2Vec
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from torch_geometric.utils import to_networkx
import networkx as nx
import warnings
warnings.filterwarnings("ignore")


In [16]:
# Connect to google drive
from google.colab import drive
drive.mount('/content/drive')
BASE_DIR = "/content/drive/MyDrive/InformationSystems/Classification"
RESULTS_DIR = f"{BASE_DIR}/results"
MODELS_DIR = f"{BASE_DIR}/models"
EMBEDDINGS_DIR = f"{BASE_DIR}/embeddings"

os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
def sanitize_embeddings(embeddings: np.ndarray) -> np.ndarray:
    """
    Replace NaN/Inf values in embeddings and ensure a clean float32 array.
    This is useful for karateclub embeddings that may occasionally produce
    unstable values on some graphs.
    """
    emb = np.asarray(embeddings, dtype=np.float32)
    # Replace NaN and +/- Inf with 0.0
    emb = np.nan_to_num(emb, nan=0.0, posinf=0.0, neginf=0.0)
    return emb

In [18]:
def filter_enzymes_graphs(graphs, labels, min_nodes: int = 3):
    """
    Special handling for ENZYMES: remove very small graphs that can cause
    numerical issues for NetLSD / Graph2Vec.
    Returns filtered (graphs, labels) and prints how many were removed.
    """
    if len(graphs) == 0:
        return graphs, labels

    mask = [g.number_of_nodes() >= min_nodes for g in graphs]
    if not any(mask):
        print("WARNING: All ENZYMES graphs would be filtered out. Skipping filtering.")
        return graphs, labels

    filtered_graphs = [g for g, keep in zip(graphs, mask) if keep]
    if isinstance(labels, np.ndarray):
        filtered_labels = labels[np.array(mask)]
    else:
        filtered_labels = [y for y, keep in zip(labels, mask) if keep]

    removed = len(graphs) - len(filtered_graphs)
    print(f"ENZYMES filtering: removed {removed} graphs with < {min_nodes} nodes, kept {len(filtered_graphs)} graphs.")
    return filtered_graphs, filtered_labels

In [19]:
# GIN Model Definition

class GIN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim, num_classes, num_layers=5, dropout=0.5):
        super(GIN, self).__init__()
        layers = []
        in_dim = num_features
        for _ in range(num_layers):
            nn = Sequential(Linear(in_dim, hidden_dim), ReLU(), Linear(hidden_dim, hidden_dim))
            layers.append(GINConv(nn))
            in_dim = hidden_dim
        self.convs = torch.nn.ModuleList(layers)
        self.bns = torch.nn.ModuleList([BatchNorm1d(hidden_dim) for _ in range(num_layers)])
        self.fc1 = Linear(hidden_dim, hidden_dim)
        self.fc2 = Linear(hidden_dim, num_classes)
        self.dropout = dropout

    def forward(self, x, edge_index, batch):
        for conv, bn in zip(self.convs, self.bns):
            x = F.relu(conv(x, edge_index))
            x = bn(x)
        x = global_add_pool(x, batch)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc2(x)
        return x

In [20]:
# Training

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(DEVICE)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Evaluation

def evaluate(model, loader, criterion):
    model.eval()
    preds, labels, probs = [], [], []
    total_loss = 0.0
    num_batches = 0

    with torch.no_grad():
        for data in loader:
            data = data.to(DEVICE)
            out = model(data.x, data.edge_index, data.batch)

            loss = criterion(out, data.y)
            total_loss += loss.item()
            num_batches += 1

            pred = out.argmax(dim=1)
            preds.extend(pred.cpu().numpy())
            labels.extend(data.y.cpu().numpy())
            probs.extend(F.softmax(out, dim=1).cpu().numpy())  # probabilities for AUC

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    avg_loss = total_loss / max(1, num_batches)

    try:
        if len(np.unique(labels)) == 2:
            auc = roc_auc_score(labels, np.array(probs)[:, 1])
        else:
            auc = roc_auc_score(labels, probs, multi_class='ovr')
    except ValueError:
        auc = np.nan  # if there are not enough samples for AUC

    return acc, f1, auc, avg_loss

In [21]:
def get_gin_embeddings(model, loader):
    """Return graph-level embeddings (after global_add_pool) and labels."""
    model.eval()
    all_emb = []
    all_labels = []
    with torch.no_grad():
        for data in loader:
            data = data.to(DEVICE)
            x, edge_index, batch = data.x, data.edge_index, data.batch
            # forward μέχρι το pooling
            for conv, bn in zip(model.convs, model.bns):
                x = F.relu(conv(x, edge_index))
                x = bn(x)
            x = global_add_pool(x, batch)
            all_emb.append(x.cpu().numpy())
            all_labels.extend(data.y.cpu().numpy())
    embeddings = np.concatenate(all_emb, axis=0)
    labels = np.array(all_labels)
    return embeddings, labels

In [22]:
def run_gin_pipeline(
    dataset_name,
    use_optuna,
    w_acc,
    w_f1,
    w_auc,
    hidden_dim,
    epochs,
    batch_size=32,
    n_trials=15,
):

    # Experiment ID (used in logs and embeddings path)
    experiment_num = time.strftime("%d%m%Y_%H%M", time.localtime())
    # Load dataset
    dataset = TUDataset(root='data/TUDataset', name=dataset_name).shuffle()

    # If no node features use one-hot degree features
    if dataset.num_features == 0 or dataset[0].x is None:
        print("Dataset has no node features. Applying OneHotDegree transform...")

        # Find maximum degree across all graphs
        max_degree = 0
        for data in dataset:
            deg = torch.bincount(data.edge_index[0], minlength=data.num_nodes)
            max_degree = max(max_degree, int(deg.max()))

        # Apply transform
        oh_transform = OneHotDegree(max_degree=max_degree)
        dataset = TUDataset(
            root='data/TUDataset',
            name=dataset_name,
            transform=oh_transform
        ).shuffle()

        num_node_features = max_degree + 1
    else:
        num_node_features = dataset.num_features

    # Train/test split
    train_dataset = dataset[:int(0.8 * len(dataset))]
    test_dataset = dataset[int(0.8 * len(dataset)):]
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    print(f"Loaded dataset {dataset_name}: {len(dataset)} graphs, {num_node_features} node features, {dataset.num_classes} classes")

    def objective(trial):
        num_layers = trial.suggest_int("num_layers", 3, 6)
        dropout = trial.suggest_float("dropout", 0.0, 0.6)
        lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
        weight_decay = trial.suggest_loguniform("weight_decay", 1e-6, 1e-3)

        model = GIN(num_node_features, hidden_dim, dataset.num_classes, num_layers, dropout).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = torch.nn.CrossEntropyLoss()

        for epoch in range(5):  # fewer epochs for fast tuning
            train(model, train_loader, optimizer, criterion)

        acc, f1, auc, _ = evaluate(model, test_loader, criterion)
        score = (w_acc * acc) + (w_f1 * f1) + (w_auc * (0 if np.isnan(auc) else auc))
        return score

    start_generation = time.time()
    if use_optuna:
        print("Running Optuna for hyperparameter tuning...")
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        best_params = study.best_params
        print(f"Best hyperparameters: {best_params}")
    else:
      best_params = { "num_layers": 5, "dropout": 0.5, "lr": 0.001, "weight_decay": 1e-4}
      print(f"Using default hyperparameters: {best_params}")

    generation_time = time.time() - start_generation

    # Final Training with best parameters

    print("\nRunning final training GIN...")
    print(best_params)

    model = GIN(num_node_features, hidden_dim, dataset.num_classes,
                num_layers=best_params["num_layers"], dropout=best_params["dropout"]).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"])
    criterion = torch.nn.CrossEntropyLoss()

    history = []
    start_time = time.time()
    eval_acc, eval_f1, eval_auc, eval_loss, eval_epoch = 0, 0, 0, 1e9, 0
    best_loss_for_best_epoch = 1e9
    for epoch in range(1, epochs + 1):
        loss = train(model, train_loader, optimizer, criterion)
        acc, f1, auc, e_loss = evaluate(model, test_loader, criterion)
        if acc > eval_acc:
          #edo mipos to allakso na einai kai edo sindiasmos me weights poy eixe kai sto optuna
          eval_acc, eval_f1, eval_auc, eval_loss, eval_epoch = acc, f1, auc, e_loss, epoch
          best_loss_for_best_epoch = e_loss

        elapsed = time.time() - start_time
        print(f"Epoch {epoch:03d} | Loss={loss:.4f} | TestAcc={acc:.3f} | F1={f1:.3f} | AUC={auc:.3f} | Time={elapsed:.2f}s")
        history.append([epoch, loss, acc, f1, auc, elapsed])

    training_time = time.time() - start_time
    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / (1024 ** 2)  # in MB


    # Save GIN embeddings for the whole dataset

    full_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    gin_embeddings, gin_labels = get_gin_embeddings(model, full_loader)

    gin_exp_dir = os.path.join(EMBEDDINGS_DIR, "GIN", experiment_num)
    os.makedirs(gin_exp_dir, exist_ok=True)
    np.save(os.path.join(gin_exp_dir, "embeddings.npy"), gin_embeddings)
    np.save(os.path.join(gin_exp_dir, "labels.npy"), gin_labels)

    # Log file save

    summary_path = f"{RESULTS_DIR}/training_log.csv"
    os.makedirs("results", exist_ok=True)


    summary_data = {
        "experiment_num": experiment_num,
        "dataset": dataset_name,
        "optimization_enabled": "yes" if use_optuna else "no",
        "embedding_dimension": hidden_dim,
        "objective_weights": f"({w_acc},{w_f1},{w_auc})",
        "num_layers": best_params["num_layers"],
        "dropout": best_params["dropout"],
        "lr": best_params["lr"],
        "weight_decay": best_params["weight_decay"],
        "epochs": epochs,
        "best_epoch": eval_epoch,
        "best_loss": round(float(best_loss_for_best_epoch), 4),
        "eval_loss": round(float(eval_loss), 4),
        "eval_acc": round(eval_acc, 4),
        "eval_f1": round(eval_f1, 4),
        "eval_auc": round(eval_auc, 4),
        "training_time (s)": round(training_time, 2),
        "generation_time (s)": round(generation_time, 2),
        "memory_usage (MB)": round(memory_usage, 2)
    }

    df = pd.DataFrame([summary_data])

    # Append mode (keep all trainings)
    if os.path.exists(summary_path):
        df.to_csv(summary_path, mode='a', index=False, header=False)
    else:
        df.to_csv(summary_path, index=False)

    print(f"\nTraining summary stored in : {summary_path}")
    print(df)

    # Save best model
    torch.save(model.state_dict(), f"{MODELS_DIR}/GIN_{dataset_name}.pth")
    print(f"Saved model: results/GIN_{dataset_name}.pth")


In [23]:
def run_graph2vec_pipeline(
    dataset_name,
    w_acc=0.5,
    w_f1=0.3,
    w_auc=0.2,
    embedding_dim=128,
    epochs=50,
    test_size=0.2,
    use_optuna=True,
    n_trials=20,
):
    """
    Pipeline for graph classification using Graph2Vec embeddings + SVM,
    with optional Optuna-based hyperparameter tuning and special handling
    for ENZYMES + embedding sanitization.
    """
    # Experiment ID
    experiment_num = time.strftime("%d%m%Y_%H%M", time.localtime())

    # Load dataset
    dataset = TUDataset(root='data/TUDataset', name=dataset_name).shuffle()
    print(f"Loaded dataset {dataset_name} for Graph2Vec: {len(dataset)} graphs, {dataset.num_classes} classes")

    # Convert PyG graphs to NetworkX graphs
    graphs = []
    labels = []
    for data in dataset:
        g = to_networkx(data, to_undirected=True)
        graphs.append(g)
        labels.append(int(data.y.item()))

    labels = np.array(labels)

    # Special handling for ENZYMES (filter very small graphs)
    if dataset_name.upper() == "ENZYMES":
        graphs, labels = filter_enzymes_graphs(graphs, labels, min_nodes=3)

    # Outer train/test split on graphs
    train_graphs, test_graphs, y_train, y_test = train_test_split(
        graphs,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    opt_time = 0.0

    def objective(trial):
        # Hyperparameters for the SVM classifier
        C = trial.suggest_loguniform("C", 1e-2, 1e2)
        gamma = trial.suggest_loguniform("gamma", 1e-4, 1e1)

        # Inner train/validation split on graphs
        inner_tr_graphs, inner_val_graphs, y_tr, y_val = train_test_split(
            train_graphs,
            y_train,
            test_size=0.2,
            random_state=42,
            stratify=y_train,
        )

        all_graphs = inner_tr_graphs + inner_val_graphs

        # Fit Graph2Vec on all (transductive setting) and slice embeddings
        g2v = Graph2Vec(dimensions=embedding_dim, wl_iterations=2, epochs=epochs, workers=os.cpu_count())
        g2v.fit(all_graphs)
        emb_all = sanitize_embeddings(g2v.get_embedding())

        X_tr = emb_all[:len(inner_tr_graphs)]
        X_val = emb_all[len(inner_tr_graphs):]

        clf = SVC(kernel="rbf", probability=True, C=C, gamma=gamma, random_state=42)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_val)
        y_prob = clf.predict_proba(X_val)

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average="weighted")
        try:
            if len(np.unique(y_val)) == 2:
                auc = roc_auc_score(y_val, y_prob[:, 1])
            else:
                auc = roc_auc_score(y_val, y_prob, multi_class="ovr")
        except ValueError:
            auc = np.nan

        score = (w_acc * acc) + (w_f1 * f1) + (w_auc * (0 if np.isnan(auc) else auc))
        return score

    if use_optuna:
        print("Running Optuna for Graph2Vec+SVM hyperparameter tuning...")
        start_opt = time.time()
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        best_params = study.best_params
        opt_time = time.time() - start_opt
        print(f"Best hyperparameters (Graph2Vec+SVM): {best_params}")
    else:
        best_params = {"C": 1.0, "gamma": "scale"}
        print(f"Using default SVM hyperparameters: {best_params}")

    # Final embedding + training using best hyperparameters
    print("Running final Graph2Vec embedding on train+test graphs...")
    all_graphs_final = train_graphs + test_graphs
    start_embed = time.time()
    g2v = Graph2Vec(dimensions=embedding_dim, wl_iterations=2, epochs=epochs, workers=os.cpu_count())
    g2v.fit(all_graphs_final)
    emb_all = sanitize_embeddings(g2v.get_embedding())
    embed_time = time.time() - start_embed

    X_train = emb_all[:len(train_graphs)]
    X_test = emb_all[len(train_graphs):]

    print("Training final SVM on Graph2Vec embeddings...")
    start_train = time.time()
    clf = SVC(kernel="rbf", probability=True, C=best_params["C"], gamma=best_params["gamma"], random_state=42)
    clf.fit(X_train, y_train)
    train_time = time.time() - start_train

    # Evaluation on held-out test graphs
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    try:
        if len(np.unique(y_test)) == 2:
            auc = roc_auc_score(y_test, y_prob[:, 1])
        else:
            auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except ValueError:
        auc = np.nan

    score = (w_acc * acc) + (w_f1 * f1) + (w_auc * (0 if np.isnan(auc) else auc))


    # Save Graph2Vec embeddings (for all graphs) + labels

    g2v_exp_dir = os.path.join(EMBEDDINGS_DIR, "Graph2Vec", experiment_num)
    os.makedirs(g2v_exp_dir, exist_ok=True)
    np.save(os.path.join(g2v_exp_dir, "embeddings.npy"), emb_all)
    np.save(os.path.join(g2v_exp_dir, "labels.npy"), np.array(labels))

    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / (1024 ** 2)  # in MB

    print(f"Graph2Vec Results on {dataset_name} -> Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}, Score: {score:.3f}")
    print(f"Embedding time: {embed_time:.2f}s | SVM training time: {train_time:.2f}s | Optuna time: {opt_time:.2f}s | Memory usage: {memory_usage:.2f} MB")

    # Log summary to CSV
    summary_path = f"{RESULTS_DIR}/graph2vec_log.csv"

    summary_data = {
        "experiment_num": experiment_num,

        "dataset": dataset_name,
        "embedding_type": "Graph2Vec",
        "embedding_dimension": embedding_dim,
        "optuna_enabled": "yes" if use_optuna else "no",
        "C": best_params["C"],
        "gamma": best_params["gamma"],
        "acc": round(float(acc), 4),
        "f1": round(float(f1), 4),
        "auc": round(float(auc) if not np.isnan(auc) else -1, 4),
        "score": round(float(score), 4),
        "embedding_time (s)": round(embed_time, 2),
        "svm_training_time (s)": round(train_time, 2),
        "optuna_time (s)": round(opt_time, 2),
        "memory_usage (MB)": round(memory_usage, 2),
    }

    df = pd.DataFrame([summary_data])
    if os.path.exists(summary_path):
        df.to_csv(summary_path, mode='a', index=False, header=False)
    else:
        df.to_csv(summary_path, index=False)

    print(f"Graph2Vec summary stored in: {summary_path}")


In [24]:
def run_netlsd_pipeline(
    dataset_name,
    w_acc=0.5,
    w_f1=0.3,
    w_auc=0.2,
    test_size=0.2,
    use_optuna=True,
    n_trials=20,
):
    """
    Pipeline for graph classification using NetLSD embeddings + SVM,
    with optional Optuna-based hyperparameter tuning and ENZYMES filtering.
    """
    # Experiment ID
    experiment_num = time.strftime("%d%m%Y_%H%M", time.localtime())
    # Load dataset
    dataset = TUDataset(root='data/TUDataset', name=dataset_name).shuffle()
    print(f"Loaded dataset {dataset_name} for NetLSD: {len(dataset)} graphs, {dataset.num_classes} classes")

    # Convert PyG graphs to NetworkX graphs
    graphs = []
    labels = []
    for data in dataset:
        g = to_networkx(data, to_undirected=True)
        graphs.append(g)
        labels.append(int(data.y.item()))

    labels = np.array(labels)

    # Special handling for ENZYMES
    if dataset_name.upper() == "ENZYMES":
        graphs, labels = filter_enzymes_graphs(graphs, labels, min_nodes=3)

    # Outer train/test split on graphs
    train_graphs, test_graphs, y_train, y_test = train_test_split(
        graphs,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels,
    )

    opt_time = 0.0

    def objective(trial):
        # Hyperparameters for the SVM classifier
        C = trial.suggest_loguniform("C", 1e-2, 1e2)
        gamma = trial.suggest_loguniform("gamma", 1e-4, 1e1)

        # Inner train/validation split on graphs
        inner_tr_graphs, inner_val_graphs, y_tr, y_val = train_test_split(
            train_graphs,
            y_train,
            test_size=0.2,
            random_state=42,
            stratify=y_train,
        )

        all_graphs = inner_tr_graphs + inner_val_graphs

        # Fit NetLSD on all and slice embeddings
        netlsd = NetLSD()
        netlsd.fit(all_graphs)
        emb_all = sanitize_embeddings(netlsd.get_embedding())

        X_tr = emb_all[:len(inner_tr_graphs)]
        X_val = emb_all[len(inner_tr_graphs):]

        clf = SVC(kernel="rbf", probability=True, C=C, gamma=gamma, random_state=42)
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_val)
        y_prob = clf.predict_proba(X_val)

        acc = accuracy_score(y_val, y_pred)
        f1 = f1_score(y_val, y_pred, average="weighted")
        try:
            if len(np.unique(y_val)) == 2:
                auc = roc_auc_score(y_val, y_prob[:, 1])
            else:
                auc = roc_auc_score(y_val, y_prob, multi_class="ovr")
        except ValueError:
            auc = np.nan

        score = (w_acc * acc) + (w_f1 * f1) + (w_auc * (0 if np.isnan(auc) else auc))
        return score

    if use_optuna:
        print("Running Optuna for NetLSD+SVM hyperparameter tuning...")
        start_opt = time.time()
        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=n_trials)
        best_params = study.best_params
        opt_time = time.time() - start_opt
        print(f"Best hyperparameters (NetLSD+SVM): {best_params}")
    else:
        best_params = {"C": 1.0, "gamma": "scale"}
        print(f"Using default SVM hyperparameters: {best_params}")

    # Final embedding + training using best hyperparameters
    print("Running final NetLSD embedding on train+test graphs...")
    all_graphs_final = train_graphs + test_graphs
    start_embed = time.time()
    netlsd = NetLSD()
    netlsd.fit(all_graphs_final)
    emb_all = sanitize_embeddings(netlsd.get_embedding())
    embed_time = time.time() - start_embed

    X_train = emb_all[:len(train_graphs)]
    X_test = emb_all[len(train_graphs):]

    print("Training final SVM on NetLSD embeddings...")
    start_train = time.time()
    clf = SVC(kernel="rbf", probability=True, C=best_params["C"], gamma=best_params["gamma"], random_state=42)
    clf.fit(X_train, y_train)
    train_time = time.time() - start_train

    # Evaluation on held-out test graphs
    y_pred = clf.predict(X_test)
    y_prob = clf.predict_proba(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")
    try:
        if len(np.unique(y_test)) == 2:
            auc = roc_auc_score(y_test, y_prob[:, 1])
        else:
            auc = roc_auc_score(y_test, y_prob, multi_class="ovr")
    except ValueError:
        auc = np.nan

    score = (w_acc * acc) + (w_f1 * f1) + (w_auc * (0 if np.isnan(auc) else auc))


    # Save NetLSD embeddings (all graphs) + labels

    netlsd_exp_dir = os.path.join(EMBEDDINGS_DIR, "NetLSD", experiment_num)
    os.makedirs(netlsd_exp_dir, exist_ok=True)
    np.save(os.path.join(netlsd_exp_dir, "embeddings.npy"), emb_all)
    np.save(os.path.join(netlsd_exp_dir, "labels.npy"), np.array(labels))

    process = psutil.Process(os.getpid())
    memory_usage = process.memory_info().rss / (1024 ** 2)  # in MB

    print(f"NetLSD Results on {dataset_name} -> Acc: {acc:.3f}, F1: {f1:.3f}, AUC: {auc:.3f}, Score: {score:.3f}")
    print(f"Embedding time: {embed_time:.2f}s | SVM training time: {train_time:.2f}s | Optuna time: {opt_time:.2f}s | Memory usage: {memory_usage:.2f} MB")

    # Log summary to CSV
    summary_path = f"{RESULTS_DIR}/netlsd_log.csv"

    summary_data = {
        "experiment_num": experiment_num,
        "dataset": dataset_name,
        "embedding_type": "NetLSD",
        "optuna_enabled": "yes" if use_optuna else "no",
        "C": best_params["C"],
        "gamma": best_params["gamma"],
        "acc": round(float(acc), 4),
        "f1": round(float(f1), 4),
        "auc": round(float(auc) if not np.isnan(auc) else -1, 4),
        "score": round(float(score), 4),
        "embedding_time (s)": round(embed_time, 2),
        "svm_training_time (s)": round(train_time, 2),
        "optuna_time (s)": round(opt_time, 2),
        "memory_usage (MB)": round(memory_usage, 2),
    }

    df = pd.DataFrame([summary_data])
    if os.path.exists(summary_path):
        df.to_csv(summary_path, mode='a', index=False, header=False)
    else:
        df.to_csv(summary_path, index=False)

    print(f"NetLSD summary stored in: {summary_path}")


In [25]:
run_gin_pipeline(
    dataset_name="MUTAG",
    use_optuna=True,
    w_acc=0.5,
    w_f1=0.3,
    w_auc=0.2,
    hidden_dim=64,
    epochs=10,
    batch_size=32,
    n_trials=3,
)

[I 2025-11-21 15:11:30,729] A new study created in memory with name: no-name-12706f1b-64ec-4aa5-b75b-3f6dc60a4d7b


Loaded dataset MUTAG: 188 graphs, 7 node features, 2 classes
Running Optuna for hyperparameter tuning...


[I 2025-11-21 15:11:31,080] Trial 0 finished with value: 0.8847282347282348 and parameters: {'num_layers': 3, 'dropout': 0.05841567693412568, 'lr': 0.006407982415764923, 'weight_decay': 8.217058221938571e-06}. Best is trial 0 with value: 0.8847282347282348.
[I 2025-11-21 15:11:31,400] Trial 1 finished with value: 0.2847372760906596 and parameters: {'num_layers': 3, 'dropout': 0.15278766282830905, 'lr': 0.0012825057091342817, 'weight_decay': 4.1179903887272965e-05}. Best is trial 0 with value: 0.8847282347282348.
[I 2025-11-21 15:11:31,906] Trial 2 finished with value: 0.3426497340031175 and parameters: {'num_layers': 6, 'dropout': 0.4172555685589557, 'lr': 0.0036455932756505483, 'weight_decay': 0.0008042309870004285}. Best is trial 0 with value: 0.8847282347282348.


Best hyperparameters: {'num_layers': 3, 'dropout': 0.05841567693412568, 'lr': 0.006407982415764923, 'weight_decay': 8.217058221938571e-06}

Running final training GIN...
{'num_layers': 3, 'dropout': 0.05841567693412568, 'lr': 0.006407982415764923, 'weight_decay': 8.217058221938571e-06}
Epoch 001 | Loss=0.6883 | TestAcc=0.711 | F1=0.590 | AUC=0.902 | Time=0.07s
Epoch 002 | Loss=0.3776 | TestAcc=0.842 | F1=0.837 | AUC=0.929 | Time=0.14s
Epoch 003 | Loss=0.3448 | TestAcc=0.868 | F1=0.861 | AUC=0.956 | Time=0.21s
Epoch 004 | Loss=0.3301 | TestAcc=0.763 | F1=0.721 | AUC=0.904 | Time=0.28s
Epoch 005 | Loss=0.2680 | TestAcc=0.789 | F1=0.761 | AUC=0.875 | Time=0.37s
Epoch 006 | Loss=0.3046 | TestAcc=0.763 | F1=0.721 | AUC=0.941 | Time=0.44s
Epoch 007 | Loss=0.5430 | TestAcc=0.789 | F1=0.761 | AUC=0.949 | Time=0.51s
Epoch 008 | Loss=0.3017 | TestAcc=0.842 | F1=0.842 | AUC=0.919 | Time=0.58s
Epoch 009 | Loss=0.2417 | TestAcc=0.868 | F1=0.866 | AUC=0.939 | Time=0.66s
Epoch 010 | Loss=0.1973 | Tes

In [26]:
run_graph2vec_pipeline(
    dataset_name="ENZYMES",
    w_acc=0.5, w_f1=0.3, w_auc=0.2,
    embedding_dim=128,
    epochs=10,
    test_size=0.2,
    use_optuna=True,
    n_trials=3,
)

Loaded dataset ENZYMES for Graph2Vec: 600 graphs, 6 classes


[I 2025-11-21 15:11:33,071] A new study created in memory with name: no-name-969b983e-1db5-4996-a612-1b2995dcb44c


ENZYMES filtering: removed 1 graphs with < 3 nodes, kept 599 graphs.
Running Optuna for Graph2Vec+SVM hyperparameter tuning...


[I 2025-11-21 15:11:33,867] Trial 0 finished with value: 0.21122012120801734 and parameters: {'C': 5.230738733974349, 'gamma': 1.2705242995137496}. Best is trial 0 with value: 0.21122012120801734.
[I 2025-11-21 15:11:34,681] Trial 1 finished with value: 0.23571003181974545 and parameters: {'C': 0.12616462870858278, 'gamma': 0.00013262503528969994}. Best is trial 1 with value: 0.23571003181974545.
[I 2025-11-21 15:11:35,433] Trial 2 finished with value: 0.2229684943626405 and parameters: {'C': 0.3051874850456814, 'gamma': 1.6141692229754927}. Best is trial 1 with value: 0.23571003181974545.


Best hyperparameters (Graph2Vec+SVM): {'C': 0.12616462870858278, 'gamma': 0.00013262503528969994}
Running final Graph2Vec embedding on train+test graphs...
Training final SVM on Graph2Vec embeddings...
Graph2Vec Results on ENZYMES -> Acc: 0.167, F1: 0.106, AUC: 0.447, Score: 0.205
Embedding time: 0.84s | SVM training time: 0.15s | Optuna time: 2.36s | Memory usage: 971.66 MB
Graph2Vec summary stored in: /content/drive/MyDrive/InformationSystems/Classification/results/graph2vec_log.csv


In [27]:
run_netlsd_pipeline(
    dataset_name="ENZYMES",
    w_acc=0.5, w_f1=0.3, w_auc=0.2,
    test_size=0.2,
    use_optuna=True,
    n_trials=3,
)

Loaded dataset ENZYMES for NetLSD: 600 graphs, 6 classes


[I 2025-11-21 15:11:36,803] A new study created in memory with name: no-name-e251be39-d5e6-4d2f-841f-4fd7807e2413


ENZYMES filtering: removed 1 graphs with < 3 nodes, kept 599 graphs.
Running Optuna for NetLSD+SVM hyperparameter tuning...


[I 2025-11-21 15:11:40,244] Trial 0 finished with value: 0.21250000000000002 and parameters: {'C': 0.04593304718208167, 'gamma': 0.00029408622785242013}. Best is trial 0 with value: 0.21250000000000002.
[I 2025-11-21 15:11:43,582] Trial 1 finished with value: 0.21278645833333337 and parameters: {'C': 0.020546034101263242, 'gamma': 0.0005022386467944024}. Best is trial 1 with value: 0.21278645833333337.
[I 2025-11-21 15:11:45,982] Trial 2 finished with value: 0.21523437500000003 and parameters: {'C': 0.030472976931070834, 'gamma': 0.0066796320792242144}. Best is trial 2 with value: 0.21523437500000003.


Best hyperparameters (NetLSD+SVM): {'C': 0.030472976931070834, 'gamma': 0.0066796320792242144}
Running final NetLSD embedding on train+test graphs...
Training final SVM on NetLSD embeddings...
NetLSD Results on ENZYMES -> Acc: 0.267, F1: 0.186, AUC: 0.388, Score: 0.267
Embedding time: 5.66s | SVM training time: 0.51s | Optuna time: 9.18s | Memory usage: 982.07 MB
NetLSD summary stored in: /content/drive/MyDrive/InformationSystems/Classification/results/netlsd_log.csv
