In [16]:
import sys,os,time,pickle

#numerical stuff
import scipy as sp # is this necessary? 
import numpy as np # is this necessary?
import networkx as nx # is this necessary? 

#plotting stuff
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import random as ra # is this necessary?

#clustering
from sklearn.cluster import AgglomerativeClustering # is this necessary?
from sklearn.cluster import KMeans # is this necessary?
from sklearn.cluster import DBSCAN # is this necessary?

#metrics
from sklearn.metrics import adjusted_rand_score as ari
from sklearn.metrics import pairwise_distances

#graspy and lgc
from graspy.embed import *
from graspy.simulations import sbm
from graspy.cluster import GaussianCluster
import localgraphclustering as lgc

from tqdm import tqdm

This notebook investigates the performance of local graph clustering (LGC) and global spectral methods on a class of community detection tasks. In particular, assume $ A \sim SBM(\pi, n, B) $ where

$$ B = \begin{bmatrix} p & q & \cdots & q \\ q & p & \ddots & \vdots \\ \vdots & \ddots & \ddots & q \\ q & \cdots & q & p \end{bmatrix} $$

and $ \pi = \frac{1}{K} \begin{bmatrix} 1 & 1 & \cdots & 1 \end{bmatrix}^{T} $. Apart from the effects of $ p $ and $ q $ on the two clustering techniques, there are two important regimes we are interested in: $ K = O(1) $ and $ K = o(n) $. In the first part of the notebook we fix $ K $. In the second part we allow $ K $ to grow as $ \log(n) $.

In [56]:
# Regime 1: K fixed, n grows, q fixed.

K = 2  # number of clusters (fixed at 2 in this part of the notebook)
nh = 500  # step size between consecutive graph sizes
ns = np.arange(500, 10000 + nh, step=nh)  # graph sizes to consider: 500, 1000, ..., 10000
mc_its = 10  # number of Monte Carlo iterations per graph size

p = 0.5  # within-block edge probability
q = 0.1  # between-block edge probability

# K x K block-probability matrix: p on the diagonal, q everywhere else.
# Built from K so that B and pi stay consistent if K is changed.
B = np.full((K, K), q)
np.fill_diagonal(B, p)

pi = (1/K) * np.ones(K)  # class membership priors (uniform over the K clusters)

def experiment(n, pi, B, acorn=None):
    """Run one SBM trial and score three clustering pipelines by ARI.

    A graph is sampled from a stochastic block model, then clustered with
    (1) adjacency spectral embedding + Gaussian mixture,
    (2) Laplacian spectral embedding + Gaussian mixture, and
    (3) per-node ``lgc.approximate_PageRank`` vectors + Gaussian mixture.

    Parameters
    ----------
    n : int
        Total number of vertices.
    pi : array-like of length K
        Cluster membership probabilities; ``len(pi)`` is used as the number
        of clusters.
    B : array-like
        Block probability matrix handed to ``sbm``.
    acorn : int, optional
        If given, ``np.random.seed(acorn)`` is called before sampling.

    Returns
    -------
    (ari_ase, ari_lse, ari_lgc) : tuple
        Adjusted Rand index of each pipeline's labels against the sampled
        cluster labels.
    """
    import tempfile  # local import: not part of the notebook's import cell

    if acorn is not None:
        np.random.seed(acorn)

    n_clusters = len(pi)

    # Multinomial draw: number of vertices assigned to each cluster.
    P = np.random.multinomial(n, pi)
    # True labels: cluster index k repeated P[k] times, in block order.
    Y = np.repeat(np.arange(n_clusters), P)
    A = sbm(P, B)  # random adjacency matrix

    # The local method consumes a GraphLocal object, which is built here from
    # an edge-list file. Write it to a private temporary directory so runs do
    # not clobber each other or leave files in the working directory.
    G = nx.from_numpy_array(A)
    with tempfile.TemporaryDirectory() as tmp_dir:
        edgelist_path = os.path.join(tmp_dir, "sbm.edgelist")
        nx.write_edgelist(G, edgelist_path, data=False)
        g = lgc.GraphLocal(edgelist_path, 'edgelist', ' ')

    def _gmm_ari(features):
        # Fit a Gaussian mixture with exactly n_clusters components and score
        # the predicted labels against the true ones.
        gmm = GaussianCluster(min_components=n_clusters, max_components=n_clusters)
        return ari(gmm.fit_predict(features), Y)

    # Global method 1: adjacency spectral embedding (library defaults).
    ari_ase = _gmm_ari(AdjacencySpectralEmbed().fit_transform(A))

    # Global method 2: Laplacian spectral embedding (library defaults).
    # The original code instantiated AdjacencySpectralEmbed here, so the
    # "LSE" score was really a second ASE run.
    ari_lse = _gmm_ari(LaplacianSpectralEmbed().fit_transform(A))

    # Local method: one approximate_PageRank call per seed node; element [1]
    # of each result becomes that node's length-n feature row.
    # NOTE(review): assumes GraphLocal keeps all n vertices (an isolated
    # vertex would be absent from the edge list) -- confirm.
    ACLs = np.empty((n, n))
    for node in range(n):
        ACLs[node] = lgc.approximate_PageRank(g, [node], normalize=False)[1]

    # Naively cluster the rows assuming Gaussianity.
    ari_lgc = _gmm_ari(ACLs)

    return ari_ase, ari_lse, ari_lgc

def simulation(ns, k_function, p, q_function, mc_its, acorn=None, verbose=False):
    """Run ``experiment`` over a range of graph sizes and summarise the ARIs.

    Parameters
    ----------
    ns : sequence of int
        Graph sizes to simulate.
    k_function : int or callable
        Number of clusters, either a constant or a function of ``n`` (its
        result is truncated with ``int``).
    p : float
        Within-block edge probability (diagonal of B).
    q_function : float or callable
        Between-block edge probability, either a constant or a function of
        ``n``.
    mc_its : int
        Number of Monte Carlo replicates per graph size.
    acorn : int, optional
        Seed for NumPy's global RNG. It is applied once, before the first
        replicate, so the whole simulation is reproducible while replicates
        remain distinct.
    verbose : bool
        If True, show a tqdm progress bar over the replicates of each size.

    Returns
    -------
    mean_ARIs, stderr_ARIs : ndarray of shape (len(ns), 3)
        Mean and standard error over replicates; columns follow the order
        returned by ``experiment``. The standard error is NaN when
        ``mc_its == 1``.
    """
    # Seed once up front. Passing the same acorn into every experiment() call
    # would reseed identically each time and make all replicates identical.
    if acorn is not None:
        np.random.seed(acorn)

    ARIs = np.zeros((len(ns), 3, mc_its))

    for i, n in enumerate(ns):
        # Number of clusters: constant (any integer-like scalar) or function of n.
        n_clusters = int(k_function(n)) if callable(k_function) else int(k_function)

        # assume equal probabilities for each cluster.. may want to change in future
        pi = 1 / n_clusters * np.ones(n_clusters)

        # Between-block probability: constant (any real scalar) or function of n.
        q = q_function(n) if callable(q_function) else float(q_function)

        # B has p on the diagonal and q everywhere else.
        B = np.full((n_clusters, n_clusters), q, dtype=float)
        np.fill_diagonal(B, p)

        replicates = range(mc_its)
        if verbose:
            replicates = tqdm(replicates)

        for j in replicates:
            ARIs[i, :, j] = experiment(n, pi, B)

    mean_ARIs = np.mean(ARIs, axis=2)
    if mc_its > 1:
        # Standard error of the mean, using the unbiased sample variance.
        stderr_ARIs = np.sqrt(np.var(ARIs, axis=2, ddof=1) / mc_its)
    else:
        # A single replicate has no sample variance; report NaN explicitly.
        stderr_ARIs = np.full(mean_ARIs.shape, np.nan)

    return mean_ARIs, stderr_ARIs

In [57]:
# Quick run: two small graph sizes (n = 50, 100), K = 2, two Monte Carlo
# iterations per size, with a progress bar.
mean, stderr = simulation(
    ns=[50, 100],
    k_function=2,
    p=0.5,
    q_function=0.2,
    mc_its=2,
    verbose=True,
)


  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:02<00:02,  2.00s/it][A
100%|██████████| 2/2 [00:03<00:00,  1.85s/it][A
  0%|          | 0/2 [00:00<?, ?it/s][A
 50%|█████     | 1/2 [00:06<00:06,  6.02s/it][A
100%|██████████| 2/2 [00:11<00:00,  5.93s/it][A