In [3]:
import sys,os,time,pickle

#numerical stuff
import scipy as sp # is this necessary? 
import numpy as np # is this necessary?
import networkx as nx # is this necessary? 

#plotting stuff
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import random as ra # is this necessary?

#clustering
from sklearn.cluster import AgglomerativeClustering # is this necessary?
from sklearn.cluster import KMeans # is this necessary?
from sklearn.cluster import DBSCAN # is this necessary?

#metrics
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import pairwise_distances

#graspy and lgc
from graspy.embed import *
from graspy.simulations import sbm
from graspy.cluster import GaussianCluster
import localgraphclustering as lgc

This notebook investigates the performance of local graph clustering (LGC) and global spectral methods on a class of community detection tasks. In particular, assume $ A \sim \mathrm{SBM}(\pi, n, B) $, where

$$ B = \begin{bmatrix} p & q & \cdots & q \\ \vdots & \ddots & & \vdots \\ \vdots & & \ddots & \vdots \\ q & \cdots & q & p \end{bmatrix} $$

and $ \pi = \frac{1}{K} \begin{bmatrix} 1 & 1 & \cdots & 1 \end{bmatrix}^{T} $. Apart from the effects of $ p $ and $ q $ on the two clustering techniques, there are two important regimes we are interested in: $ K = O(1) $ and $ K = o(n) $. In the first part of the notebook we fix $ K $. In the second part we allow $ K $ to grow as $ \log(n) $.

In [23]:
# Experiment configuration: K fixed, n grows, q is fixed.

K = 2  # number of clusters, held fixed in this part of the notebook
h = 500  # step size for the ns we consider
ns = np.arange(500, 10000 + h, step=h)  # graph sizes to consider: 500, 1000, ..., 10000
mc_its = 10  # presumably Monte Carlo replicates per graph size; not used in this cell

p = 0.5  # diagonal entry of B
q = 0.1  # off-diagonal entry of B

# Build the K x K block matrix from K instead of hard-coding a 2 x 2 literal,
# so B and pi stay the same size if K is changed above.
B = np.array([[p if i == j else q for j in range(K)] for i in range(K)])

# Uniform block-membership probabilities.
pi = (1/K) * np.ones(K)

def _gmm_ari(X, Y, n_clusters):
    """Cluster the rows of X with a GaussianCluster of exactly n_clusters components; return ARI against Y."""
    gmm = GaussianCluster(min_components=n_clusters, max_components=n_clusters)
    return ari(gmm.fit_predict(X), Y)


def experiment(n, pi, B, n_clusters=2, acorn=None):
    """Sample one SBM graph and score three clustering pipelines against the true labels.

    The three pipelines are adjacency spectral embedding (ASE), Laplacian
    spectral embedding (LSE), and per-node approximate PageRank vectors from
    ``localgraphclustering``; each representation is clustered with
    ``GaussianCluster`` and compared to the ground truth with the adjusted
    Rand index.

    Parameters
    ----------
    n : int
        Total number of nodes; block sizes are drawn as multinomial(n, pi).
    pi : sequence of float
        Block-membership probabilities, one entry per block.
    B : array-like
        Block matrix passed to ``sbm`` together with the drawn block sizes.
    n_clusters : int, default 2
        Number of mixture components used by every ``GaussianCluster`` fit.
    acorn : int or None, default None
        If given, seeds NumPy's global RNG before any sampling.

    Returns
    -------
    tuple
        ``(ari_ase, ari_lse, ari_lgc)``.

    Notes
    -----
    Side effect: writes the sampled graph to ``sbm.edgelist`` in the current
    working directory (overwriting any existing file) so that
    ``lgc.GraphLocal`` can load it.
    """
    if acorn is not None:
        np.random.seed(acorn)

    # Draw block sizes, then the matching ground-truth label for every node.
    P = np.random.multinomial(n, pi)
    Y = np.repeat(np.arange(len(pi)), P)
    A = sbm(P, B)

    # Round-trip through an edge-list file to build the GraphLocal object.
    # NOTE(review): an edge list carries no isolated nodes, so a graph with a
    # degree-0 vertex would presumably load with fewer than n nodes -- confirm.
    G = nx.from_numpy_array(A)
    nx.write_edgelist(G, "sbm.edgelist", data=False)
    g = lgc.GraphLocal('sbm.edgelist', 'edgelist', ' ')

    # Global method 1: adjacency spectral embedding.
    ari_ase = _gmm_ari(AdjacencySpectralEmbed().fit_transform(A), Y, n_clusters)

    # Global method 2: Laplacian spectral embedding.  This branch previously
    # instantiated AdjacencySpectralEmbed, so ari_lse only repeated the ASE pipeline.
    ari_lse = _gmm_ari(LaplacianSpectralEmbed().fit_transform(A), Y, n_clusters)

    # now local stuff, will require for loop over all nodes in graph..
    # Row i holds the approximate PageRank output for seed node i, shifted by 1/n.
    # NOTE(review): `[1]` assumes approximate_PageRank returns a sequence whose
    # second element is a length-n vector aligned with node ids; if it returns
    # (ids, values) or a dense vector, the rows are misaligned or constant.
    # Confirm against the installed localgraphclustering version.
    offset = (1/n)*np.ones(n)
    ACLs = np.empty((n, n))
    for i in range(n):
        ACLs[i] = lgc.approximate_PageRank(g, [i], normalize=False)[1] + offset

    ari_lgc = _gmm_ari(ACLs, Y, n_clusters)

    return ari_ase, ari_lse, ari_lgc

In [24]:
# Quick unseeded run on a 100-node graph; displays (ari_ase, ari_lse, ari_lgc).
experiment(n=100, pi=pi, B=B)

(1.0, 1.0, -0.0013845001140177417)