In [None]:
from datasets import load_dataset
import networkx as nx
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.metrics import normalized_mutual_info_score as NMI, adjusted_rand_score as ARI
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import from_networkx
from community import community_louvain
from node2vec import Node2Vec
import pandas as pd


# Load the Sentiment140 corpus and keep only the first 5000 training rows.
dataset = load_dataset('sentiment140')

# Slice rows first so that only 5000 examples are materialised, instead of
# building the complete 'text' / 'sentiment' columns and then truncating them.
# NOTE(review): the first rows of the split may all share one sentiment class;
# verify the class balance before trusting label-based metrics (NMI / ARI).
subset = dataset['train'][:5000]
tweets = subset['text']
sentiments = subset['sentiment']

# One node per tweet, identified by its position, carrying text and label.
G = nx.Graph()
for i, (tweet, sentiment) in enumerate(zip(tweets, sentiments)):
    G.add_node(i, text=tweet, sentiment=sentiment)

# Connect every unordered pair (i, j), i < j, independently with probability
# 0.05.  The random numbers for one row are drawn in a single vectorised call
# (same (i, j) draw order as a per-pair scalar loop) instead of ~12.5 million
# individual np.random.rand() calls.
# NOTE(review): the edges are purely random and do not depend on the tweets.
num_tweets = len(tweets)
for i in range(num_tweets - 1):
    draws = np.random.rand(num_tweets - i - 1)
    partners = np.flatnonzero(draws > 0.95) + i + 1
    for j in partners.tolist():  # .tolist() yields plain Python ints as node ids
        G.add_edge(i, j)

# Random 16-dimensional node features (not derived from the tweet text).
node_features = torch.randn(G.number_of_nodes(), 16)
data = from_networkx(G)
data.x = node_features


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    Applies ``GCNConv -> ReLU -> GCNConv`` and returns the raw output of the
    second layer (no activation or softmax is applied to it).

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node vector.
        hidden_channels: Width of the intermediate layer. Defaults to 16,
            the value that was previously hard-coded.
    """

    def __init__(self, in_channels, out_channels, hidden_channels=16):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the second-layer output for features ``x`` on ``edge_index``."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

def compute_node_embeddings(data, num_clusters, epochs=200, lr=0.01, random_state=None):
    """Train a GCN on KMeans pseudo-labels and return its output vectors.

    The outputs of the freshly initialised (untrained) model are clustered
    with KMeans; the resulting cluster ids are then used as fixed targets for
    a cross-entropy loss while the model is trained with Adam.

    Args:
        data: Graph object exposing ``x`` (node feature tensor) and
            ``edge_index``.
        num_clusters: Number of KMeans clusters; also the output width of the
            GCN.
        epochs: Number of optimisation steps. Defaults to 200.
        lr: Adam learning rate. Defaults to 0.01.
        random_state: Forwarded to ``KMeans`` for reproducible pseudo-labels.
            ``None`` (default) leaves KMeans unseeded.

    Returns:
        NumPy array with one row per node and ``num_clusters`` columns,
        produced by the trained model in eval mode.
    """
    device = data.x.device
    # Keep the model on the same device as the features it will consume.
    model = GCN(data.x.shape[1], num_clusters).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    model.train()

    # Pseudo-labels: cluster the outputs of the untrained network.
    with torch.no_grad():
        initial_embeddings = model(data.x, data.edge_index)
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    # .cpu() is a no-op for CPU tensors and makes .numpy() valid otherwise.
    kmeans.fit(initial_embeddings.cpu().numpy())
    labels = torch.tensor(kmeans.labels_, dtype=torch.long, device=device)

    for _ in range(epochs):
        optimizer.zero_grad()
        out = model(data.x, data.edge_index)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        embeddings = model(data.x, data.edge_index).cpu().numpy()
    return embeddings

def conductance(G, clusters):
    """Return the mean per-cluster ratio ``cut(S) / vol(S)``.

    For every cluster ``S`` the number of edges leaving ``S`` is divided by
    the sum of the degrees of its nodes. Clusters whose volume is zero
    contribute nothing to the sum but are still counted in the average.

    Note that the denominator is ``vol(S)``, not ``min(vol(S), vol(V - S))``
    as in some definitions of conductance.

    Args:
        G: networkx graph.
        clusters: Iterable of node collections.

    Returns:
        The average ratio as a float; ``0.0`` when ``clusters`` is empty
        (previously this raised ``ZeroDivisionError``).
    """
    clusters = list(clusters)
    if not clusters:
        return 0.0

    total_cut = 0.0
    for cluster in clusters:
        volume = sum(dict(G.degree(cluster)).values())
        if volume == 0:
            # No incident edges: the ratio is undefined, so skip this cluster.
            continue
        total_cut += nx.algorithms.cuts.cut_size(G, cluster) / volume
    return total_cut / len(clusters)

# Number of clusters requested from every fixed-k algorithm below.
num_clusters = 2

# Node vectors produced by the GCN trained on KMeans pseudo-labels.
gcn_embeddings = compute_node_embeddings(data, num_clusters)

# Three clusterings of the same GCN vectors.
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(gcn_embeddings)

hierarchical = AgglomerativeClustering(n_clusters=num_clusters)
hierarchical.fit(gcn_embeddings)

spectral = SpectralClustering(
    n_clusters=num_clusters,
    affinity='nearest_neighbors',
)
spectral.fit(gcn_embeddings)

# Louvain works on the graph itself and chooses its own number of communities.
louvain = community_louvain.best_partition(G)


def deepwalk_embedding(G, dimensions=64, walk_length=30, num_walks=200, workers=4):
    """Embed the nodes of ``G`` with random walks and a skip-gram model.

    Walks are generated by ``Node2Vec`` (no ``p``/``q`` are passed, so the
    library defaults apply) and the model is fitted with ``window=10`` and
    ``min_count=1``.

    Returns:
        NumPy array with one ``dimensions``-sized row per node, in
        ``G.nodes()`` order.
    """
    walker = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    skipgram = walker.fit(window=10, min_count=1)
    vectors = skipgram.wv
    # The fitted vocabulary is looked up by the string form of each node id.
    return np.array([vectors[str(node)] for node in G.nodes()])

# Random-walk embeddings via the helper (window=10, min_count=1).
deepwalk_embeddings = deepwalk_embedding(G)

# Second random-walk embedding, fitted with the library's default fit options.
# NOTE(review): no p/q are passed here either, so the walk parameters are the
# same library defaults used by deepwalk_embedding; confirm this baseline is
# meant to differ from DeepWalk.
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=30,
    num_walks=200,
    workers=4,
).fit()
node2vec_embeddings = np.array(
    [node2vec.wv[key] for key in map(str, G.nodes())]
)

# KMeans assignment for each embedding.
deepwalk_clusters = KMeans(n_clusters=num_clusters).fit(deepwalk_embeddings).labels_
node2vec_clusters = KMeans(n_clusters=num_clusters).fit(node2vec_embeddings).labels_


def compute_metrics(G, labels, true_labels=None):
    """Score a clustering of ``G`` by structure and against ground truth.

    Args:
        G: networkx graph whose nodes are being clustered.
        labels: Cluster id per node, either a sequence/array aligned with the
            iteration order of ``G.nodes`` or a dict mapping node -> id.
        true_labels: Optional ground-truth label per node, aligned with
            ``G.nodes``. When omitted, the ``'sentiment'`` node attribute is
            used.

    Returns:
        Tuple ``(modularity, nmi, conductance, ari)``. All four are ``0`` when
        the graph has no edges. ``nmi`` and ``ari`` are ``nan`` when no ground
        truth is available for every node.

    Raises:
        ValueError: If ``labels`` does not provide exactly one entry per node.
    """
    if G.number_of_edges() == 0:
        return 0, 0, 0, 0

    nodes = list(G.nodes)
    if isinstance(labels, dict):
        # Align a node -> cluster mapping with the graph's node order.
        labels = [labels[node] for node in nodes]
    labels = np.asarray(labels)
    if labels.ndim != 1 or labels.shape[0] != len(nodes):
        raise ValueError(
            f"expected one label per node ({len(nodes)}), got shape {labels.shape}"
        )

    # Translate label positions into actual node ids so the partition is
    # valid even when nodes are not numbered 0..n-1; built once, used twice.
    communities = [
        [nodes[i] for i in np.flatnonzero(labels == c)]
        for c in np.unique(labels)
    ]

    modularity = nx.algorithms.community.quality.modularity(G, communities)
    conductance_value = conductance(G, communities)

    # NMI / ARI must compare against ground-truth classes. Comparing against
    # the node ids (every id unique) says nothing about clustering quality.
    if true_labels is None:
        true_labels = [G.nodes[node].get('sentiment') for node in nodes]
    else:
        true_labels = list(true_labels)

    if len(true_labels) != len(nodes) or any(t is None for t in true_labels):
        nmi = ari = float('nan')
    else:
        nmi = NMI(true_labels, labels)
        ari = ARI(true_labels, labels)

    return modularity, nmi, conductance_value, ari

# Louvain returns a node -> community dict; index it by node so the label
# array is aligned with G.nodes regardless of the dict's own ordering.
louvain_labels = np.array([louvain[node] for node in G.nodes])

# The GCN has one output per cluster and is trained with cross-entropy, so its
# own assignment is the arg-max output. Reusing kmeans.labels_ here would just
# repeat the 'KMeans' row.
gcn_labels = gcn_embeddings.argmax(axis=1)

metrics = {
    'KMeans': compute_metrics(G, kmeans.labels_),
    'Hierarchical Clustering': compute_metrics(G, hierarchical.labels_),
    'Spectral Clustering': compute_metrics(G, spectral.labels_),
    'Louvain Method': compute_metrics(G, louvain_labels),
    'DeepWalk': compute_metrics(G, deepwalk_clusters),
    'Node2Vec': compute_metrics(G, node2vec_clusters),
    'GCN': compute_metrics(G, gcn_labels),
}

# One row per method; column order matches compute_metrics' return order.
df = pd.DataFrame(metrics, index=['Modularity', 'NMI', 'Conductance', 'ARI']).transpose()

print(df)


Found cached dataset sentiment140 (C:/Users/AI-BIO/.cache/huggingface/datasets/sentiment140/sentiment140/1.0.0/7fdc297b986cb1dad1197eae755ef4d204f77fb43ba2bb81cc2a51a7565de122)


  0%|          | 0/2 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/5000 [00:00<?, ?it/s]

Computing transition probabilities:   0%|          | 0/5000 [00:00<?, ?it/s]

                         Modularity       NMI  Conductance  ARI
KMeans                     0.004712  0.150436     0.495285  0.0
Hierarchical Clustering    0.003794  0.148293     0.496117  0.0
Spectral Clustering        0.004758  0.150422     0.495238  0.0
Louvain Method             0.053583  0.389125     0.821852  0.0
DeepWalk                   0.037148  0.149884     0.462617  0.0
Node2Vec                   0.037966  0.150345     0.461966  0.0
GCN                        0.004712  0.150436     0.495285  0.0
