In [17]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import scipy.io
import seaborn as sns
import warnings
from sklearn.cluster import KMeans

In [3]:
from google.colab import files
uploaded = files.upload()

Saving Wiki-Vote.txt to Wiki-Vote.txt


In [26]:
graph = nx.Graph()
with open('Wiki-Vote.txt' , "r") as file:
    f = file.readlines()
    data = []
    for line in f:
        data = line.strip().split()
        graph.add_edges_from([(data[0], data[1])])

In [27]:
m = graph.number_of_edges()
num_nodes = graph.number_of_nodes()
print("Number of nodes: ", num_nodes)
print("Number of edges: ", m)

Number of nodes:  7115
Number of edges:  100762


In [28]:
def modularity_mat(graph):
    m = graph.number_of_edges()
    num_nodes = graph.number_of_nodes()
    nodes = graph.nodes()
    A = np.zeros((num_nodes, num_nodes))
    for i,node1  in enumerate(nodes):
        for j,node2 in enumerate(nodes):
            b = graph.degree[node1]*graph.degree[node2]/(2*m)
            if graph.has_edge(node1, node2):
                a = 1
            else:
                a = 0
            A[i,j] = a-b
    return A

def dominant_eigenvector(A):
    eigenvalues, eigenvectors = np.linalg.eig(A)
    idx = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[idx]
    eigenvectors = eigenvectors[:,idx]
    return eigenvectors[:,-1]


In [None]:
modularity_matrix = modularity_mat(graph)
print("Modularity matrix: ", modularity_matrix)

In [None]:
dominant_eigenvector = dominant_eigenvector(modularity_matrix)
print("Dominant eigenvector: ", dominant_eigenvector)



In [22]:
def binary_clustering(graph, Y):
    partition1= (np.where(Y > 0)[0])
    partition2= (np.where(Y < 0)[0])
    print("Number of nodes in cluster 1: ", len(partition1))
    print("Number of nodes in cluster 2: ", len(partition2))
    nodes = list(graph.nodes())
    cluster1 = [nodes[i] for i in partition1]
    cluster2 = [nodes[i] for i in partition2]
    list_of_clusters = [cluster1, cluster2]
    sub1 = graph.subgraph(cluster1)
    sub2 = graph.subgraph(cluster2)
    return list_of_clusters, sub1, sub2
    

In [23]:
def modularity_fun(clusters,G):
    m = G.number_of_edges()
    sums = []
    for cluster in clusters:
        sum = 0
        
        sub_graph = G.subgraph(cluster)
        nodes = (sub_graph.nodes())
        for node1 in nodes:
            for node2 in nodes:
                t = sub_graph.degree[node1] * sub_graph.degree[node2]/(2*m)
                if sub_graph.has_edge(node1,node2):
                    a = 1
                else:
                    a = 0
                sum+= a - t
            sums.append(sum)
    return(np.sum(sums)/(2*m))

In [24]:
total_clusters =[]
total_modularity=[]
def clustering(graph, modularity,list_of_clusters):
    
    index = list_of_clusters.index(set(graph.nodes()))
    last_cluster = list_of_clusters.pop(index)
    print('----------------------------------------------------------------------------')
    print("\nlength of last cluster: ", len(last_cluster))

    M = modularity_mat(graph)
    D = dominant_eigenvector(M)
    print("dominant eigenvector: ", D)
    res, sub1, sub2 = binary_clustering(graph, D)

    list_of_clusters.append(set(res[0]))
    list_of_clusters.append(set(res[1]))

    print("length of cluster 1: ", len(set(res[0])))
    print("length of cluster 2: ", len(set(res[1])))

    total_clusters.append(list_of_clusters)

    modularity_new = modularity_fun(list_of_clusters, graph)
    total_modularity.append(modularity_new)
    print("new modularity: ", modularity_new)

    if modularity_new < modularity:
        list_of_clusters.pop()
        list_of_clusters.pop()
        list_of_clusters.append(last_cluster)
        return list_of_clusters, res, sub1, sub2


    clustering(sub1, modularity_new, list_of_clusters)
    clustering(sub2, modularity_new, list_of_clusters)
    return list_of_clusters, res, sub1, sub2

In [None]:
modularity =0
list_of_clusters = []
list_of_clusters.append(set(graph.nodes()))
clusters = clustering(graph, modularity, list_of_clusters)

----------------------------------------------------------------------------

length of last cluster:  7115
dominant eigenvector:  [-3.30323002e-03+0.j -4.70672371e-03+0.j -2.72284899e-02+0.j ...
  2.05998369e-04+0.j  3.79765150e-05+0.j  2.27061515e-05+0.j]
Number of nodes in cluster 1:  3347
Number of nodes in cluster 2:  3768
length of cluster 1:  3347
length of cluster 2:  3768


In [16]:
print(len(clusters))

2


In [19]:
for i in clusters:
  print(len(i))

3347
3773


In [24]:
nodes = list(graph.nodes())
nodes = [int(a) for a in nodes]
max_nodes = max(nodes)
print(max_nodes)

ValueError: ignored