In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import normalized_mutual_info_score as NMI, adjusted_rand_score as ARI
import networkx as nx
from torch_geometric.utils import to_networkx
from community import community_louvain
from node2vec import Node2Vec
import pandas as pd

# Load the Planetoid "Cora" dataset (cached under /tmp/Cora) and take its
# first (index 0) graph object.
dataset = Planetoid(name='Cora', root='/tmp/Cora')
data = dataset[0]

# Number of clusters requested from every clustering method below.
# NOTE(review): presumably chosen to match the number of classes in Cora -- confirm.
num_clusters = 7

# Raw node feature matrix as a NumPy array.
# NOTE(review): not referenced again in the visible part of this file.
node_features = data.x.numpy()

# Undirected networkx view of the same graph, used for the graph-based
# methods and for the structural metrics.
G = to_networkx(data, to_undirected=True)


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network built from ``GCNConv`` layers.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node vector.
        hidden_channels: Width of the intermediate layer. Defaults to 16,
            the value that was previously hard-coded.
    """

    def __init__(self, in_channels, out_channels, hidden_channels=16):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1 -> ReLU -> conv2 and return the raw conv2 output.

        Args:
            x: Node features (presumably ``(num_nodes, in_channels)`` -- confirm
                against the caller).
            edge_index: Graph connectivity passed straight to both layers.

        Returns:
            The output of the second convolution; no activation or softmax is
            applied to it here.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

def compute_node_embeddings(data, num_clusters, epochs=200, lr=0.01, seed=None):
    """Train a GCN on self-generated pseudo-labels and return its node outputs.

    The targets are NOT ground-truth labels: a freshly initialised GCN is run
    once, its outputs are clustered with KMeans, and those cluster ids are then
    used as cross-entropy targets for training the same network.

    Args:
        data: Graph object exposing ``x``, ``edge_index`` and
            ``num_node_features``.
        num_clusters: Number of KMeans clusters; also the GCN output width.
        epochs: Number of full-graph optimisation steps (default 200).
        lr: Adam learning rate (default 0.01).
        seed: Optional integer. When given, it seeds torch's global RNG and the
            KMeans initialisation so that runs are repeatable. ``None`` keeps
            the previous unseeded behaviour.

    Returns:
        NumPy array holding the trained model's output for every node
        (presumably ``(num_nodes, num_clusters)`` -- confirm).
    """
    if seed is not None:
        torch.manual_seed(seed)

    model = GCN(data.num_node_features, num_clusters)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()

    # Pseudo-labels: cluster the outputs of the still-untrained network.
    # No gradients are needed here, so the result converts to NumPy directly.
    with torch.no_grad():
        initial_embeddings = model(data.x, data.edge_index).numpy()
    kmeans = KMeans(n_clusters=num_clusters, random_state=seed).fit(initial_embeddings)
    labels = torch.tensor(kmeans.labels_, dtype=torch.long)

    for _ in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        embeddings = model(data.x, data.edge_index).numpy()
    return embeddings

def conductance(G, clusters):
    """Return the mean of cut(cluster) / volume(cluster) over ``clusters``.

    For each cluster, the cut is the number of edges leaving it
    (``nx.algorithms.cuts.cut_size``) and the volume is the sum of the degrees
    of its nodes. Clusters whose volume is 0 contribute 0 to the sum but are
    still counted in the denominator.

    Args:
        G: networkx graph.
        clusters: Sequence of node collections.

    Returns:
        The averaged ratio, or ``0.0`` when ``clusters`` is empty (this case
        previously raised ``ZeroDivisionError``).
    """
    if len(clusters) == 0:
        return 0.0

    total_cut = 0
    for cluster in clusters:
        volume = sum(dict(G.degree(cluster)).values())
        if volume == 0:
            # Nothing to normalise by; skip before paying for the cut.
            continue
        total_cut += nx.algorithms.cuts.cut_size(G, cluster) / volume
    return total_cut / len(clusters)


# Node embeddings produced by the self-trained GCN.
gcn_embeddings = compute_node_embeddings(data, num_clusters)

# NOTE(review): all three scikit-learn clusterers below are fit on the GCN
# embeddings, so the 'KMeans', 'Hierarchical Clustering' and
# 'Spectral Clustering' results describe GCN + <clusterer>, not the raw
# node features.
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(gcn_embeddings)

hierarchical = AgglomerativeClustering(n_clusters=num_clusters)
hierarchical.fit(gcn_embeddings)

spectral = SpectralClustering(n_clusters=num_clusters, affinity='nearest_neighbors')
spectral.fit(gcn_embeddings)

# Louvain works on the graph structure alone and returns {node: community id}.
louvain = community_louvain.best_partition(G)


def deepwalk_embedding(G, dimensions=64, walk_length=30, num_walks=200, workers=4):
    """Embed every node of ``G`` via random walks plus a skip-gram fit.

    The walks come from ``Node2Vec`` called without ``p``/``q``, so the
    library's default walk bias applies. The model is then fitted with
    ``window=10`` and ``min_count=1``.

    Returns:
        NumPy array with one row per node, in ``G.nodes()`` order.
    """
    walker = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    skipgram = walker.fit(window=10, min_count=1)
    # The fitted vocabulary is keyed by the string form of each node id.
    vectors = [skipgram.wv[str(node_id)] for node_id in G.nodes()]
    return np.array(vectors)

# Reference class labels, used only for scoring (NMI / ARI).
ground_truth = data.y.numpy()

# Random-walk embeddings from the helper above (window=10, min_count=1).
deepwalk_embeddings = deepwalk_embedding(G)

# Second random-walk embedding, fitted with the library's default skip-gram
# settings.
# NOTE(review): no p/q is passed here either -- if the library defaults are
# unbiased walks, this differs from the DeepWalk run only in the fit
# parameters; confirm that is intended.
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4).fit()
node2vec_embeddings = np.array([node2vec.wv[str(n)] for n in G.nodes()])

# KMeans cluster ids for both embedding sets.
deepwalk_clusters = KMeans(n_clusters=num_clusters).fit(deepwalk_embeddings).labels_
node2vec_clusters = KMeans(n_clusters=num_clusters).fit(node2vec_embeddings).labels_


def compute_metrics(G, labels, ground_truth):
    """Score one clustering of the nodes of ``G``.

    Args:
        G: networkx graph.
        labels: One cluster id per node; position ``i`` belongs to the
            ``i``-th node in ``G.nodes()`` order. Any sequence is accepted.
        ground_truth: Reference labels in the same order, used for NMI and ARI.

    Returns:
        Tuple ``(modularity, nmi, conductance, ari)``.

    Raises:
        ValueError: If ``labels`` does not hold exactly one entry per node.
    """
    labels = np.asarray(labels)
    nodes = list(G.nodes())
    if labels.shape != (len(nodes),):
        raise ValueError(
            f"expected {len(nodes)} labels (one per node), got shape {labels.shape}"
        )

    # Build the partition once and translate positions into real node ids, so
    # graphs whose nodes are not exactly 0..N-1 are handled correctly.
    communities = [
        [nodes[i] for i in np.where(labels == c)[0]]
        for c in np.unique(labels)
    ]

    modularity = nx.algorithms.community.quality.modularity(G, communities)
    nmi = NMI(ground_truth, labels)
    ari = ARI(ground_truth, labels)
    conductance_value = conductance(G, communities)
    return modularity, nmi, conductance_value, ari

# Louvain returns {node: community id}. Index it by node in G.nodes() order so
# label i is guaranteed to belong to the i-th node, rather than relying on the
# dict's iteration order.
louvain_labels = np.array([louvain[node] for node in G.nodes()])

# Each entry is (modularity, NMI, conductance, ARI) for one method.
# NOTE(review): 'GCN' reuses kmeans.labels_, which were fit on the GCN
# embeddings, so that row is identical to 'KMeans'.
metrics = {
    'KMeans': compute_metrics(G, kmeans.labels_, ground_truth),
    'Hierarchical Clustering': compute_metrics(G, hierarchical.labels_, ground_truth),
    'Spectral Clustering': compute_metrics(G, spectral.labels_, ground_truth),
    'Louvain Method': compute_metrics(G, louvain_labels, ground_truth),
    'DeepWalk': compute_metrics(G, deepwalk_clusters, ground_truth),
    'Node2Vec': compute_metrics(G, node2vec_clusters, ground_truth),
    'GCN': compute_metrics(G, kmeans.labels_, ground_truth),
}

# One row per method, one column per metric.
df = pd.DataFrame(metrics, index=['Modularity', 'NMI', 'Conductance', 'ARI']).transpose()


print(df)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

                         Modularity       NMI  Conductance       ARI
KMeans                     0.338124  0.039522     0.503807  0.028320
Hierarchical Clustering    0.321444  0.035691     0.533756  0.023483
Spectral Clustering        0.320906  0.039848     0.514364  0.020454
Louvain Method             0.812417  0.453054     0.032918  0.228396
DeepWalk                   0.735419  0.459246     0.080744  0.397011
Node2Vec                   0.721063  0.386686     0.130607  0.313690
GCN                        0.338124  0.039522     0.503807  0.028320
