In [None]:
!pip install umap-learn



In [None]:
# Imports
import inspect
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import umap.umap_ as umap
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.manifold import TSNE
from sklearn.metrics import adjusted_rand_score, silhouette_score

In [None]:
# Mount Google Drive (Colab only) and define the directories used by this notebook
from google.colab import drive

drive.mount('/content/drive')

# Embeddings/labels are loaded from CLASS_DIR; results and plots go under BASE_DIR
CLASS_DIR = "/content/drive/MyDrive/InformationSystems/Classification/embeddings"
BASE_DIR = "/content/drive/MyDrive/InformationSystems/Clustering"
RESULTS_DIR = f"{BASE_DIR}/results"
PLOTS_DIR = f"{BASE_DIR}/plots"

# Create every directory up front; exist_ok makes re-running this cell harmless
for _directory in (PLOTS_DIR, CLASS_DIR, BASE_DIR, RESULTS_DIR):
    os.makedirs(_directory, exist_ok=True)

# Prefer the GPU when the runtime has one
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_embeddings_and_labels(method_name: str,
                               experiment_num: str,
                               base_dir: str = CLASS_DIR):
    """
    Read the saved embeddings and labels of one experiment.

    The files are expected at
    ``<base_dir>/<method_name>/<experiment_num>/embeddings.npy`` and
    ``.../labels.npy``.

    Returns
    -------
    (embeddings, labels) : the two arrays exactly as returned by ``np.load``.

    Raises
    ------
    FileNotFoundError
        If either file is missing. Both paths are checked before anything
        is loaded.
    """
    exp_dir = os.path.join(base_dir, method_name, str(experiment_num))

    # Human-readable kind -> file path; the kind is reused in the error message.
    required_files = {
        "Embeddings": os.path.join(exp_dir, "embeddings.npy"),
        "Labels": os.path.join(exp_dir, "labels.npy"),
    }

    for kind, path in required_files.items():
        if not os.path.exists(path):
            raise FileNotFoundError(f"{kind} file not found: {path}")

    embeddings = np.load(required_files["Embeddings"])
    labels = np.load(required_files["Labels"])

    print(f"Loaded embeddings {embeddings.shape} and labels {labels.shape} from {exp_dir}")
    return embeddings, labels

In [None]:
def run_single_clustering(embeddings: np.ndarray,
                          labels_true: np.ndarray,
                          method: str = "kmeans",
                          n_clusters: int | None = None,
                          random_state: int = 42):
    """
    Runs a clustering method ('kmeans' or 'spectral') on the embeddings and returns:
      - predicted labels
      - ARI
      - silhouette score
      - clustering time (seconds)
    """
    if n_clusters is None:
        n_clusters = len(np.unique(labels_true))

    start = time.time()

    if method == "kmeans":
        model = KMeans(
            n_clusters=n_clusters,
            random_state=random_state,
            n_init="auto"
        )
        labels_pred = model.fit_predict(embeddings)

    elif method == "spectral":
        model = SpectralClustering(
            n_clusters=n_clusters,
            affinity="nearest_neighbors",
            assign_labels="kmeans",
            random_state=random_state,
        )
        labels_pred = model.fit_predict(embeddings)

    else:
        raise ValueError(f"Unknown clustering method: {method}")

    clustering_time = time.time() - start

    # --- metrics ---
    ari = adjusted_rand_score(labels_true, labels_pred)

    # silhouette score requires >1 cluster
    sil_score = None
    if len(np.unique(labels_pred)) > 1:
        sil_score = silhouette_score(embeddings, labels_pred)

    result = {
        "clustering_method": method,
        "n_clusters": n_clusters,
        "ari": ari,
        "silhouette": sil_score,
        "clustering_time (s)": clustering_time,
    }

    return result, labels_pred

In [None]:
def compute_tsne_embedding(
    embeddings: np.ndarray,
    n_components: int = 2,
    perplexity: float = 30.0,
    learning_rate: str | float = "auto",
    n_iter: int = 1000,
    random_state: int = 42,
):
    """
    Computes 2D t-SNE embedding and returns:
      - coords_2d: np.ndarray (N, 2)
      - tsne_time: time in sec
      - params: dict with the tsne hyperparameters (for logging)
    """
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_iter=n_iter,
        random_state=random_state,
        init="pca",
    )

    start = time.time()
    coords_2d = tsne.fit_transform(embeddings)
    tsne_time = time.time() - start

    params = {
        "n_components": n_components,
        "perplexity": perplexity,
        "learning_rate": learning_rate,
        "n_iter": n_iter,
        "random_state": random_state,
    }

    print(f" t-SNE computed in {tsne_time:.2f} s with perplexity={perplexity}")
    return coords_2d, tsne_time, params

In [None]:
def compute_umap_embedding(
    embeddings: np.ndarray,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.1,
    metric: str = "euclidean",
    random_state: int = 42,
):
    """
    Project the embeddings with UMAP.

    Returns
    -------
    coords_2d : array returned by ``fit_transform``
        (``n_components`` columns, 2 by default).
    umap_time : wall-clock seconds spent in ``fit_transform``.
    params : dict with the UMAP hyperparameters (for logging/report).
    """
    # The logged hyperparameters are exactly the keyword arguments given to
    # UMAP, so build the dict once and reuse it for both purposes.
    params = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "metric": metric,
        "random_state": random_state,
    }
    reducer = umap.UMAP(**params)

    t0 = time.time()
    coords_2d = reducer.fit_transform(embeddings)
    umap_time = time.time() - t0

    print(f"UMAP computed in {umap_time:.2f} s (n_neighbors={n_neighbors})")
    return coords_2d, umap_time, params

In [None]:
def append_to_clustering_log(log_path: str, row: dict):
    """
    Write one result row to the CSV log at `log_path`.

    A missing file is created and gets a header line; an existing file is
    appended to without repeating the header.
    """
    log_exists = os.path.exists(log_path)

    pd.DataFrame([row]).to_csv(
        log_path,
        mode="a" if log_exists else "w",
        header=not log_exists,  # header only on first write
        index=False,
    )

    print(f"Appended results to {log_path}")

In [None]:
def plot_embedding_2d(coords_2d: np.ndarray,
                      labels: np.ndarray,
                      title: str,
                      save_path: str | None = None):
    """
    Simple 2D scatter plot of an embedding, colored by labels.
    """
    plt.figure(figsize=(6, 5))
    scatter = plt.scatter(
        coords_2d[:, 0],
        coords_2d[:, 1],
        c=labels,
        s=20,
        alpha=0.8,
    )
    plt.title(title)
    plt.xlabel("Dim 1")
    plt.ylabel("Dim 2")
    plt.tight_layout()

    if save_path is not None:
        plt.savefig(save_path, dpi=150)
        print(f"Saved plot to {save_path}")

    plt.close()

In [None]:
def run_clustering_pipeline(method_name: str,
                            experiment_num: str,
                            dataset_name: str,
                            run_kmeans: bool = True,
                            run_spectral: bool = True,
                            run_tsne: bool = False,
                            run_umap: bool = False,
                            tsne_kwargs: dict | None = None,
                            umap_kwargs: dict | None = None,
                            base_dir: str = CLASS_DIR,
                            random_state: int = 42):
    """
    High-level pipeline:
      1. Load embeddings & labels for the given method/experiment.
      2. Optionally compute t-SNE and/or UMAP projections and save scatter
         plots colored by the true labels under
         ``PLOTS_DIR/<method_name>/<experiment_num>/``.
      3. Run k-means and/or spectral clustering on the original embeddings
         (not on the projections) and compute ARI and silhouette score.
      4. Append one row per clustering method to
         ``RESULTS_DIR/<method_name>_clustering.csv``.

    Parameters
    ----------
    method_name : embedding type; selects the input sub-directory and names
        the log file and plots.
    experiment_num : experiment identifier (sub-directory of the method).
    dataset_name : used in plot file names/titles and in the log rows.
    run_kmeans, run_spectral : which clustering methods to run.
    run_tsne, run_umap : which projections to compute and plot.
    tsne_kwargs, umap_kwargs : extra keyword arguments for
        ``compute_tsne_embedding`` / ``compute_umap_embedding``. A
        ``random_state`` entry here overrides the pipeline-level seed.
    base_dir : root directory of the saved embeddings.
    random_state : seed used for clustering and, by default, projections.

    Returns
    -------
    (results, tsne_coords, umap_coords, labels_true) where ``results`` is the
    list of logged row dicts and the coords are None for projections that
    were not requested.
    """
    log_path = os.path.join(RESULTS_DIR, f"{method_name}_clustering.csv")
    embeddings, labels_true = load_embeddings_and_labels(
        method_name=method_name,
        experiment_num=experiment_num,
        base_dir=base_dir,
    )

    n_clusters = len(np.unique(labels_true))

    tsne_coords, tsne_time, tsne_params = None, None, None
    umap_coords, umap_time, umap_params = None, None, None

    # The seed is merged into the kwargs (caller's value wins) instead of
    # being passed next to them: otherwise a `random_state` key in the kwargs
    # dict raises "got multiple values for keyword argument".
    if run_tsne:
        tsne_coords, tsne_time, tsne_params = compute_tsne_embedding(
            embeddings,
            **{"random_state": random_state, **(tsne_kwargs or {})},
        )

    if run_umap:
        umap_coords, umap_time, umap_params = compute_umap_embedding(
            embeddings,
            **{"random_state": random_state, **(umap_kwargs or {})},
        )

    # Save a true-label scatter plot for every projection that was computed.
    # (file-name prefix, name shown in the title, coordinates)
    projections = [
        ("tsne", "t-SNE", tsne_coords),
        ("umap", "UMAP", umap_coords),
    ]
    computed = [entry for entry in projections if entry[2] is not None]

    if computed:
        # Only create the experiment's plot directory when there is a plot.
        plot_dir = os.path.join(PLOTS_DIR, method_name, str(experiment_num))
        os.makedirs(plot_dir, exist_ok=True)

        for file_prefix, display_name, coords in computed:
            plot_embedding_2d(
                coords,
                labels_true,
                title=f"{method_name} + {display_name} ({dataset_name}, true labels)",
                save_path=os.path.join(
                    plot_dir,
                    f"{file_prefix}_{dataset_name}_true_labels.png",
                ),
            )

    # Columns shared by every log row. Key order matters: rows are appended
    # to an existing CSV without a header, so it must stay stable.
    base_row = {
        "experiment_num": experiment_num,
        "dataset": dataset_name,
        "embedding_type": method_name,
        # t-SNE logging
        "tsne_used": run_tsne,
        "tsne_time (s)": tsne_time,
        "tsne_params": json.dumps(tsne_params) if tsne_params is not None else None,
        # UMAP logging
        "umap_used": run_umap,
        "umap_time (s)": umap_time,
        "umap_params": json.dumps(umap_params) if umap_params is not None else None,
    }

    enabled_methods = [
        name
        for name, enabled in (("kmeans", run_kmeans), ("spectral", run_spectral))
        if enabled
    ]

    results = []
    for clustering_method in enabled_methods:
        # Predicted labels are not needed here; only the metrics are logged.
        res, _ = run_single_clustering(
            embeddings,
            labels_true,
            method=clustering_method,
            n_clusters=n_clusters,
            random_state=random_state,
        )
        row = {**base_row, **res}
        append_to_clustering_log(log_path, row)
        results.append(row)

    return results, tsne_coords, umap_coords, labels_true

In [None]:
# Run the whole pipeline for one saved experiment: both clustering methods
# plus t-SNE and UMAP plots. `out` holds the returned tuple
# (results, tsne_coords, umap_coords, labels_true).
out = run_clustering_pipeline(
    method_name="Graph2Vec",  # embedding type: Graph2Vec, GIN or NetLSD
    experiment_num="21112025_1511",  # sub-directory of the method's embeddings folder
    dataset_name="ENZYMES",
    run_kmeans=True,
    run_spectral=True,
    run_tsne=True,
    run_umap=True,
    # overrides the compute_tsne_embedding defaults (perplexity=30.0, n_iter=1000)
    tsne_kwargs={"perplexity": 35.0, "n_iter": 1500},
)


Loaded embeddings (599, 128) and labels (599,) from /content/drive/MyDrive/InformationSystems/Classification/embeddings/Graph2Vec/21112025_1511




 t-SNE computed in 21.27 s with perplexity=35.0


  warn(


[✓] UMAP computed in 22.37 s (n_neighbors=15)
Saved plot to /content/drive/MyDrive/InformationSystems/Clustering/plots/Graph2Vec/21112025_1511/tsne_ENZYMES_true_labels.png
Saved plot to /content/drive/MyDrive/InformationSystems/Clustering/plots/Graph2Vec/21112025_1511/umap_ENZYMES_true_labels.png
Appended results to /content/drive/MyDrive/InformationSystems/Clustering/results/Graph2Vec_clustering.csv
Appended results to /content/drive/MyDrive/InformationSystems/Clustering/results/Graph2Vec_clustering.csv
