## Benchmarks
This demo shows the performance of other SOTA graph embedding methods and their limitations:
- they do not take node attributes into account (at best they can handle discrete attributes)
- they depend heavily on the re-initialization of the random walks and on minor graph changes

In [5]:
import os,sys
import random
import numpy as np
import networkx as nx

sys.path.append(os.path.realpath('lib'))
from lib.data_loader import load_local_data
from benchmarks.sub2vec import Sub2vec

## sub2vec demo

In [6]:
# Load the 'aids' dataset from the local data directory with attributes and
# node-degree features switched off.
dataset_n='aids'
path='data/'
X,y=load_local_data(path,dataset_n, attributes=False, use_node_deg=False)
# Configure sub2vec with 128-dimensional embeddings (d=128).
# NOTE(review): the exact meaning of property='s', p=0.5 and model='dm' is
# defined in benchmarks.sub2vec and is not visible here -- confirm before tuning.
sub2vec = Sub2vec(property='s', walkLength=100, output='aids_walk', d=128, iter=100, windowSize=2, p=0.5, model='dm')
# First round: collect the random walks for X, then compute the embeddings.
sub2vec.obtainRandomWalks(X)
embeddings = sub2vec.calculateEmbeddings()
# Recorded output for this run: (2000, 128).
print(embeddings.shape)

Total vects  2000
(2000, 128)


In [None]:
# Re-calculate the embeddings and demonstrate that cosine similarity is not meaningful between separate rounds

In [8]:
# Second, independent round on the same graphs with the same sub2vec object.
# The result is stored separately as `embeddings2` so the first-round
# `embeddings` stays available.
sub2vec.obtainRandomWalks(X)
embeddings2 = sub2vec.calculateEmbeddings()


Total vects  2000


In [10]:
# Display the cosine similarity between the two rounds for up to 10 randomly
# chosen graphs; row i of `embeddings` and of `embeddings2` is compared.
from sklearn.metrics.pairwise import cosine_similarity
num_graphs, d = embeddings.shape
# Sample WITHOUT replacement so the same graph is never reported twice, and
# cap the sample size so the cell also works with fewer than 10 graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs within rounds is {cosine_similarity(embeddings[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")


Similarity of two graphs within rounds is [[-0.02972129]]
Similarity of two graphs within rounds is [[-0.00695749]]
Similarity of two graphs within rounds is [[0.18097776]]
Similarity of two graphs within rounds is [[-0.0468096]]
Similarity of two graphs within rounds is [[-0.0196318]]
Similarity of two graphs within rounds is [[0.0975268]]
Similarity of two graphs within rounds is [[-0.0731004]]
Similarity of two graphs within rounds is [[0.06960186]]
Similarity of two graphs within rounds is [[0.00319865]]
Similarity of two graphs within rounds is [[-0.16390699]]


In [None]:
# Now embed two copies of every graph in a single run and check whether identical graphs get similar embeddings

In [None]:
# Stack the dataset with itself so every graph appears twice in one run.
# assumes X is a flat sequence of graphs, so entry i and entry i + len(X)
# are the same graph -- TODO confirm against load_local_data
X_double = np.hstack((X,X))
sub2vec.obtainRandomWalks(X_double)
embeddings1_2 = sub2vec.calculateEmbeddings()


In [21]:
# Split the joint embedding matrix into its two halves: the first half holds
# the first copy of the dataset, the second half the second copy.
# The split point is derived from the data instead of a hard-coded 2000.
half = embeddings1_2.shape[0] // 2
embeddings1 = embeddings1_2[:half,:]
embeddings2 = embeddings1_2[half:,:]
num_graphs, d = embeddings1.shape
# Sample without replacement so no graph is reported twice; cap at the number
# of available graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs from the same round is {cosine_similarity(embeddings1[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")
# much better but still very small at times, not consistent and these are the same graphs!

Similarity of two graphs from the same round is [[0.9879497]]
Similarity of two graphs from the same round is [[0.97967386]]
Similarity of two graphs from the same round is [[0.9444059]]
Similarity of two graphs from the same round is [[0.9886148]]
Similarity of two graphs from the same round is [[0.3091755]]
Similarity of two graphs from the same round is [[0.97971815]]
Similarity of two graphs from the same round is [[0.981849]]
Similarity of two graphs from the same round is [[0.5953318]]
Similarity of two graphs from the same round is [[0.9656513]]
Similarity of two graphs from the same round is [[0.98958343]]


In [22]:
# Modify every graph by removing one single node that has only one neighbor (or, if there is none, two neighbors)

In [23]:
from copy import deepcopy
def remove_single_node(graphs_list, rng=None):
    '''Return the graphs with one randomly chosen low-degree node removed from each.

    For every graph a node is picked at random among the nodes of degree 1;
    if the graph has none, among the nodes of degree 2. The node is removed
    from a deep copy, so the input graphs are not modified.

    Parameters:
        graphs_list: iterable of graph wrappers exposing the underlying graph
            as ``.nx_graph`` (which must provide ``degree`` and ``remove_node``).
        rng: optional ``random.Random`` instance used to pick the node.
            Defaults to the global ``random`` module; pass
            ``random.Random(seed)`` to get reproducible results.

    Returns:
        list with one entry per input graph, in the same order. A graph that
        has no node of degree 1 or 2 is returned as the same object (not a
        copy) and a message is printed.
    '''
    chooser = random if rng is None else rng
    modified_graphs = []
    for graph in graphs_list:
        degree_by_node = dict(graph.nx_graph.degree)
        # Prefer nodes with a single neighbor; fall back to nodes with two
        # neighbors only when there are none.
        candidates = [node for node, deg in degree_by_node.items() if deg == 1]
        if not candidates:
            candidates = [node for node, deg in degree_by_node.items() if deg == 2]
        if not candidates:
            modified_graphs.append(graph)  # leave the graph as it was
            print('A graph which has no nodes with degrees 1 & 2 detected')
            continue
        copy_graph = deepcopy(graph)  # never mutate the caller's graph
        copy_graph.nx_graph.remove_node(chooser.choice(candidates))
        modified_graphs.append(copy_graph)
    return modified_graphs


In [24]:
# A small smoke test of remove_single_node on the first (up to) 10 graphs.
subset_X = X[:10]
modified_subset = remove_single_node(subset_X)
print('done')
# Pair each original with its modified version instead of indexing with a
# hard-coded range(10), which raised IndexError on fewer than 10 graphs.
for original, modified in zip(subset_X, modified_subset):
    nodes_before = len(original.nx_graph)
    nodes_after = len(modified.nx_graph)
    # At most one node may be removed; graphs without a node of degree 1 or 2
    # are returned unchanged.
    assert nodes_after in (nodes_before - 1, nodes_before), (nodes_before, nodes_after)
    print(f"graph before modif has {nodes_before} and after modif {nodes_after}")


done
graph before modif has 47 and after modif 46
graph before modif has 11 and after modif 10
graph before modif has 9 and after modif 8
graph before modif has 10 and after modif 9
graph before modif has 16 and after modif 15
graph before modif has 9 and after modif 8
graph before modif has 11 and after modif 10
graph before modif has 10 and after modif 9
graph before modif has 17 and after modif 16
graph before modif has 9 and after modif 8


And now check if the resulting embeddings are different

In [34]:
# Build a modified version of every graph with remove_single_node, then embed
# the originals and the modified graphs together in a single sub2vec run.
X_modified =  remove_single_node(X)
# assumes X is a flat sequence, so entry i + len(X) is the modified version of
# entry i -- TODO confirm against load_local_data
X_original_and_modidied = np.hstack((X,X_modified))
sub2vec.obtainRandomWalks(X_original_and_modidied)
embeddings1_2 = sub2vec.calculateEmbeddings()

Total vects  4000


In [35]:
# Split the joint embedding matrix into its two halves: the first half holds
# the original graphs, the second half the graphs with one node removed.
# The split point is derived from the data instead of a hard-coded 2000.
half = embeddings1_2.shape[0] // 2
embeddings1 = embeddings1_2[:half,:]
embeddings2 = embeddings1_2[half:,:]
num_graphs, d = embeddings1.shape
# Sample without replacement so no graph is reported twice; cap at the number
# of available graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs with a single node removed is {cosine_similarity(embeddings1[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")
# sometimes the change is significant.

Similarity of two graphs with a single node removed is [[0.86806303]]
Similarity of two graphs with a single node removed is [[0.94509435]]
Similarity of two graphs with a single node removed is [[0.73967075]]
Similarity of two graphs with a single node removed is [[0.9920117]]
Similarity of two graphs with a single node removed is [[0.91278666]]
Similarity of two graphs with a single node removed is [[0.99589455]]
Similarity of two graphs with a single node removed is [[0.66772443]]
Similarity of two graphs with a single node removed is [[0.65350485]]
Similarity of two graphs with a single node removed is [[0.69574]]
Similarity of two graphs with a single node removed is [[0.83128214]]


### Conclusion Sub2Vec
Within a single run the results are not too bad; however, in some cases the similarity still drops a lot, which is a problem when we need to compare the vectors. Between runs it is a complete disaster - the embeddings are not coherent at all. The method is affected quite a lot by the random walks. 
And the main problem with this method - we cannot really use the node attributes, which are [potentially] extremely important.

# Graph2Vec demo

In [38]:

import hashlib
import tqdm
from joblib import Parallel, delayed
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from benchmarks.graph2vec import feature_extractor # I modified the original code by the authors to directly use it on our data



In [39]:
# Load the 'aids' dataset again (attributes and node degrees switched off)
# and rebind X to a plain Python list.
dataset_n = 'aids'
path = 'data/'
X, y = load_local_data(path, dataset_n, attributes=False, use_node_deg=False)
X = list(X)
print("\nFeature extraction started.\n")
# Call feature_extractor once per graph, passing the graph's index as a string.
# NOTE(review): the second argument (2) is presumably the number of WL
# iterations -- confirm in benchmarks.graph2vec.
document_collections = Parallel(n_jobs=1)(delayed(feature_extractor)(g,2, str(i)) for i, g in enumerate(X))
print("\nOptimization started.\n")

# Train Doc2Vec on the extracted documents (dm = 0, 128-dimensional vectors).
model = Doc2Vec(document_collections, vector_size = 128, window = 2, min_count = 5, dm = 0,
                sample = 0.0001, workers = 10, epochs = 20, alpha =0.025)
# NOTE(review): `docvecs.vectors_docs` looks like the gensim 3.x accessor;
# gensim >= 4 exposes the document vectors as `model.dv.vectors` -- confirm
# against the installed gensim version.
embeddings = model.docvecs.vectors_docs
print(f"Returned embeddings shape is {embeddings.shape}")


Feature extraction started.


Optimization started.

Returned embeddings shape is (2000, 128)


In [40]:
# Second, independent graph2vec run on the same graphs with identical
# hyper-parameters. `model` is rebound here; the result is stored separately
# as `embeddings2`.
document_collections2 = Parallel(n_jobs=1)(delayed(feature_extractor)(g,2, str(i)) for i, g in enumerate(X))
model = Doc2Vec(document_collections2, vector_size = 128, window = 2, min_count = 5, dm = 0,
                sample = 0.0001, workers = 10, epochs = 20, alpha =0.025)
embeddings2 = model.docvecs.vectors_docs

Check the similarity between the two runs

In [41]:
# Compare row i of the first graph2vec run with row i of the second run for
# up to 10 randomly chosen graphs.
num_graphs, d = embeddings.shape
# Sample without replacement so no graph is reported twice; cap at the number
# of available graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs within rounds is {cosine_similarity(embeddings[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")


Similarity of two graphs within rounds is [[0.7764359]]
Similarity of two graphs within rounds is [[0.84474015]]
Similarity of two graphs within rounds is [[0.8015995]]
Similarity of two graphs within rounds is [[0.8488554]]
Similarity of two graphs within rounds is [[0.49738064]]
Similarity of two graphs within rounds is [[0.8559578]]
Similarity of two graphs within rounds is [[0.7632938]]
Similarity of two graphs within rounds is [[0.8361054]]
Similarity of two graphs within rounds is [[0.78976715]]
Similarity of two graphs within rounds is [[0.8290339]]


In [None]:
# Similarity between identical graphs embedded within a single run

In [42]:
# Stack the dataset with itself so every graph appears twice in one run.
# assumes np.hstack yields a flat sequence in which entry i and entry
# i + len(X) are the same graph -- TODO confirm for these graph objects
X_double = np.hstack((X,X))
document_collections = Parallel(n_jobs=1)(delayed(feature_extractor)(g,2, str(i)) for i, g in enumerate(X_double))
model = Doc2Vec(document_collections, vector_size = 128, window = 2, min_count = 5, dm = 0,
                sample = 0.0001, workers = 10, epochs = 20, alpha =0.025)
embeddings1_2 = model.docvecs.vectors_docs


In [43]:
# Split the joint embedding matrix into its two halves: the first half holds
# the first copy of the dataset, the second half the second copy.
# The split point is derived from the data instead of a hard-coded 2000.
half = embeddings1_2.shape[0] // 2
embeddings1 = embeddings1_2[:half,:]
embeddings2 = embeddings1_2[half:,:]
num_graphs, d = embeddings1.shape
# Sample without replacement so no graph is reported twice; cap at the number
# of available graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs from the same round is {cosine_similarity(embeddings1[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")
# this actually looks pretty good! So the WL labeling gives pretty good results in comparison with random walks 

Similarity of two graphs from the same round is [[0.9953762]]
Similarity of two graphs from the same round is [[0.99253607]]
Similarity of two graphs from the same round is [[0.9720104]]
Similarity of two graphs from the same round is [[0.98403585]]
Similarity of two graphs from the same round is [[0.9915637]]
Similarity of two graphs from the same round is [[0.9915637]]
Similarity of two graphs from the same round is [[0.9949548]]
Similarity of two graphs from the same round is [[0.98219454]]
Similarity of two graphs from the same round is [[0.9917908]]
Similarity of two graphs from the same round is [[0.984567]]


Finally, check whether the algorithm is robust to node removal

In [44]:
# Build a modified version of every graph with remove_single_node, then embed
# the originals and the modified graphs together in a single graph2vec run.
X_modified =  remove_single_node(X)
# assumes np.hstack yields a flat sequence in which entry i + len(X) is the
# modified version of entry i -- TODO confirm for these graph objects
X_original_and_modidied = np.hstack((X,X_modified))
document_collections = Parallel(n_jobs=1)(delayed(feature_extractor)(g,2, str(i)) for i, g in enumerate(X_original_and_modidied))
model = Doc2Vec(document_collections, vector_size = 128, window = 2, min_count = 5, dm = 0,
                sample = 0.0001, workers = 10, epochs = 20, alpha =0.025)
embeddings1_2 = model.docvecs.vectors_docs


In [45]:
# Split the joint embedding matrix into its two halves: the first half holds
# the original graphs, the second half the graphs with one node removed.
# The split point is derived from the data instead of a hard-coded 2000.
half = embeddings1_2.shape[0] // 2
embeddings1 = embeddings1_2[:half,:]
embeddings2 = embeddings1_2[half:,:]
num_graphs, d = embeddings1.shape
# Sample without replacement so no graph is reported twice; cap at the number
# of available graphs.
random_graphs = np.random.choice(num_graphs, size=min(10, num_graphs), replace=False)
for i in random_graphs:
    print(f"Similarity of two graphs from the same round is {cosine_similarity(embeddings1[i,:].reshape(1, -1), embeddings2[i,:].reshape(1, -1))}")
# Here each pair is an original graph and its version with one node removed.
# In the recorded run the similarity drops noticeably for several pairs, so
# graph2vec is not robust to this small change.

Similarity of two graphs from the same round is [[0.83382845]]
Similarity of two graphs from the same round is [[0.9170121]]
Similarity of two graphs from the same round is [[0.60226]]
Similarity of two graphs from the same round is [[0.6322068]]
Similarity of two graphs from the same round is [[0.7120235]]
Similarity of two graphs from the same round is [[0.16714764]]
Similarity of two graphs from the same round is [[0.5319908]]
Similarity of two graphs from the same round is [[0.80314404]]
Similarity of two graphs from the same round is [[0.8657018]]
Similarity of two graphs from the same round is [[0.83312845]]


not really robust to this one, so this is a limitation of graph2vec

# Conclusion on two embedding methods

In [None]:
It would be awesome to be able to use the node attributes of graphs in the resulting embeddings.
It would also be nice to have embeddings that stay consistent across algorithm runs and, at the same time, are robust to minor graph 
changes. There are methods which allow for that (for example, the subgraph matching kernel) - but they take way too long to compute in a real-case scenario.
If we could have had a 