In [1]:
import pandas as pd
from scipy.sparse import csr_matrix, diags
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score as nmi
# NOTE(review): the module binding created by `import time` is shadowed a few
# lines below by `from time import time`; after this cell `time` is the
# function, and the rest of the notebook calls it as `time()`.
import time
import matplotlib.pyplot as plt
from os import listdir
from time import time

# Add the local `utils/` folder to the import search path.
# NOTE(review): presumably the project imports below resolve from there - confirm.
import sys
sys.path += ['utils/']  

from node2vec.model import Node2Vec

# NOTE(review): NodeEmbedding, Node2VecNS and computeScore used later are not
# defined in this notebook; presumably they come from these star imports - confirm.
from node_embedding import *
from dcsbm import *

# Silence every warning category for the whole session.
import warnings
warnings.filterwarnings("ignore")

# Folder holding the `<name>.csv` edge lists and `<name>_label.csv` label files
# read by RunSimulation (relative to the working directory).
directory = 'dataset/'

In [2]:
def _next_run_index(folder):
    """Return the index to use for the next result file saved in ``folder``.

    Result files are named ``v_<N>_.csv``. The returned value is one more than
    the largest integer found in the second ``_``-separated field of the
    entries in ``folder``, or 1 when no entry carries such an integer. Entries
    that do not follow the pattern (no ``_`` in the name, non-numeric field)
    are skipped instead of aborting the scan.
    """
    indices = []
    for fname in listdir(folder):
        fields = fname.split('_')
        if len(fields) < 2:
            continue
        try:
            indices.append(int(fields[1]))
        except ValueError:
            continue
    return max(indices, default = 0) + 1


def RunSimulation(dim):
    """Run NodeEmbedding and Node2VecNS once on each real graph and save the results.

    For every dataset the edge list ``<name>.csv`` and the labels
    ``<name>_label.csv`` are read from ``directory``. Both methods are timed
    with wall-clock time and evaluated with ``computeScore`` against the labels.

    Two CSV files with the same name ``v_<N>_.csv`` are written, one in
    ``saved_files/real_graphs/perf/`` (scores) and one in
    ``saved_files/real_graphs/time/`` (seconds). Each has one column per
    dataset; row 0 is NodeEmbedding and row 1 is Node2VecNS. ``<N>`` is one
    more than the largest index already present in the ``perf`` folder.

    Parameters
    ----------
    dim : passed unchanged as the second positional argument of both
        ``NodeEmbedding`` and ``Node2VecNS`` (presumably the embedding
        dimension - confirm against those functions).

    Returns
    -------
    None
    """
    # Local import: only `listdir` is imported from `os` at the top of the notebook.
    from os import makedirs

    datasets = ['amazon', 'dblp', 'livejournal', 'youtube']
    perf_dir = 'saved_files/real_graphs/perf/'
    time_dir = 'saved_files/real_graphs/time/'

    # Make sure the output folders exist before starting the expensive runs, so
    # a missing folder cannot cause the results to be lost at save time.
    makedirs(perf_dir, exist_ok = True)
    makedirs(time_dir, exist_ok = True)

    tv = []    # NodeEmbedding run times
    tnsv = []  # Node2VecNS run times
    sv = []    # NodeEmbedding scores
    snsv = []  # Node2VecNS scores

    for name in datasets:

        EL = pd.read_csv(directory + name + '.csv')[['id1', 'id2']]
        n = len(pd.concat([EL.id1, EL.id2]).unique())

        # Node ids must be the integers 0..n-1: they index the n x n matrix
        # below and the labels are looked up with np.arange(n).
        ℓtrue = pd.read_csv(directory + name + '_label.csv').set_index('node')
        ℓtrue = ℓtrue.loc[np.arange(n)].label.values

        # Sparse n x n matrix with a one at every (id1, id2) pair of the edge list.
        A = csr_matrix((np.ones(len(EL)), (EL.id1, EL.id2)), shape = (n,n))

        t0 = time()
        res = NodeEmbedding(A, dim, n_epochs = 50, walk_length = 3, k = 1, verbose = True, η = 1., sym = True)
        t = time() - t0
        s = computeScore(res.X, ℓtrue, n_trials = 1, norm_bool = True)

        t0 = time()
        X = Node2VecNS(A, dim, verbose = True)
        tns = time() - t0
        sns = computeScore(X, ℓtrue, n_trials = 1)

        tv.append(t)
        tnsv.append(tns)
        sv.append(s)
        snsv.append(sns)

    # Build each table in one go (DataFrame.append no longer exists in pandas >= 2.0).
    df = pd.DataFrame([dict(zip(datasets, sv)), dict(zip(datasets, snsv))], columns = datasets)
    df_time = pd.DataFrame([dict(zip(datasets, tv)), dict(zip(datasets, tnsv))], columns = datasets)

    # Save the result under the next free index. Errors are left to propagate:
    # falling back to index 1 on any failure could overwrite an earlier run.
    run = _next_run_index(perf_dir)
    df.to_csv(perf_dir + 'v_' + str(run) + '_.csv', index = False)
    df_time.to_csv(time_dir + 'v_' + str(run) + '_.csv', index = False)

    return

In [8]:
# n_sim = 10
# dim = 32

# for i in range(n_sim):
#     RunSimulation(dim)

In [6]:
# Collect every saved run. RunSimulation writes a perf file and a time file
# under the same name, so the names listed in the perf folder are used for both.
perf_dir = 'saved_files/real_graphs/perf/'
time_dir = 'saved_files/real_graphs/time/'
files = listdir(perf_dir)

perf_frames = []
time_frames = []
for f in files:
    perf_frames.append(pd.read_csv(perf_dir + f))
    time_frames.append(pd.read_csv(time_dir + f))

# After reset_index the 'index' column holds each row's position inside its
# file: 0 is the NodeEmbedding row and 1 the Node2VecNS row (the order in which
# RunSimulation writes them).
df_list_perf = pd.concat(perf_frames).reset_index()
df_list_time = pd.concat(time_frames).reset_index()

perf_by_method = df_list_perf.groupby('index')
time_by_method = df_list_time.groupby('index')

# Mean over the runs, then standard deviation over the runs, per method and dataset.
print(perf_by_method.mean())
print(time_by_method.mean())
print('\n')
print(perf_by_method.std())
print(time_by_method.std())

         amazon      dblp  livejournal   youtube
index                                           
0      0.936627  0.554096     0.907121  0.607016
1      0.934032  0.508152     0.909789  0.646905
          amazon        dblp  livejournal    youtube
index                                               
0       3.004778   20.845956    30.225652   9.312429
1      16.604569  145.069742   166.920541  64.386668


         amazon      dblp  livejournal   youtube
index                                           
0      0.001966  0.002051     0.000818  0.000960
1      0.003293  0.002359     0.000409  0.001013
         amazon      dblp  livejournal   youtube
index                                           
0      0.426105  2.264605     2.136620  0.696179
1      0.803745  6.656247    12.593651  2.748380
