In [1]:

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.model_selection import KFold
from copy import deepcopy
import sys

# node2vec paper
#https://arxiv.org/pdf/1607.00653.pdf

### Load and preprocess data

In [2]:
from load_data import *
dataset_name = "Actor"
data_dir = "../Data/" + dataset_name

total_graph = load_geometric_dataset(dataset_name)
#total_graph = load_reddit(data_dir)
#total_graph = load_youtube(data_dir)
#total_graph = load_flickr(data_dir)
#total_graph = load_blogcatalog(data_dir)
#total_graph = load_cora(data_dir)
#total_graph = load_pubmed(data_dir)
print(total_graph['Multioutput'])
print(total_graph['N_nodes'], total_graph['N_edges'])

  Referenced from: '/opt/anaconda3/envs/venv/lib/python3.7/site-packages/libpyg.so'
  Reason: tried: '/usr/local/opt/python@3.10/Frameworks/Python.framework/Versions/3.10/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.10/Python' (no such file), '/System/Library/Frameworks/Python.framework/Versions/3.10/Python' (no such file)
  Referenced from: '/opt/anaconda3/envs/venv/lib/python3.7/site-packages/libpyg.so'
  Reason: tried: '/usr/local/opt/python@3.10/Frameworks/Python.framework/Versions/3.10/Python' (no such file), '/Library/Frameworks/Python.framework/Versions/3.10/Python' (no such file), '/System/Library/Frameworks/Python.framework/Versions/3.10/Python' (no such file)


nr of self-loop edges: 93
False
7600 26659


### Functions for training  

In [3]:
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0
       
    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        self.epoch += 1

def compute_pi(total_graph, p, q):
    pi_dict = {i:[] for i in range(total_graph['N_nodes'])}
    for i in range(total_graph['N_nodes']):
        neighbors = total_graph['edges'].get(i, [])
        n_neighbors = len(neighbors)
        probability_dist = np.ones((n_neighbors+2))
        probability_dist[0:n_neighbors] *= 1/q
        #probability_dist[n_neighbors] = 1     # staying at current node
        probability_dist[n_neighbors+1] = 1/p   # returning to the same node we came from
        norm = 1/p + 1 + n_neighbors/q
        p_normed = probability_dist/norm
        pi_dict[i] = p_normed
    return pi_dict 


def alias_sample(prev, current, neighbors, pi_dict):
    n_neighbors = len(neighbors)
    p_normed = pi_dict[current]
    sampled_indx = np.random.choice(n_neighbors+2,  p=p_normed)
    if sampled_indx==n_neighbors:
        return current
    elif sampled_indx==n_neighbors+1:
        return prev
    else:
        return neighbors[sampled_indx]

    

def learn_features(G, dim, walks_per_node, walk_length, context_size, p, q, SGD_epochs):
    pi = compute_pi(G, p, q)
    walks = [[]]*walks_per_node*G['N_nodes']
    c = 0
    for i in range(walks_per_node):
        print(i)
        for node in G["nodes"]:
            walk = node2vec_walk(G, node, walk_length, pi)
            walks[c] = walk
            c += 1
            if node%int(G["N_nodes"]/10)==0:
                print(node/G['N_nodes'])
 
    f = SDG(walks, context_size, dim, SGD_epochs)
    return f


def node2vec_walk(G, start_node, walk_length, pi):
    walk = [0]*(walk_length+1)
    walk[0] = start_node
    for i in range(walk_length):
        curr = walk[i]
        if i==0:
            prev = start_node
        else:
            prev = walk[i-1]

        neighbors = G['edges'][curr]
        sample = alias_sample(prev, curr, neighbors, pi)
        walk[i+1] = sample
    return walk
    

def SDG(walks, context_size=10, dim=128, n_epochs=5):
    """Use Word2Vec with SGD to learn embedding based on walks"""
    #sg=1 tells it to use skip-gram algorithm, min_count=0 tells it to not skip "word" that occur only 1 time   
    model = Word2Vec(sentences=walks, vector_size=dim, window=context_size, min_count=0, sg=1, workers=8, epochs=n_epochs, compute_loss=True, callbacks=[EpochLogger()])
    return model

### Train embedding model

In [4]:

# Parameters taken from original node2vec paper:
dim = 128    # should be 128
walks_per_node = 10  #should be 10
walk_length = 80    # should be 80
context_size = 10
# From Khosla et al. these piwere the best performing settings in most cases:
p = 0.25
q = 4
SGD_epochs = 1

USE_PRETRAINED = True
if USE_PRETRAINED:
    embedding_model = Word2Vec.load("../Results/node2vec/{}.model".format(dataset_name))
   #embedding_model = Word2Vec.load("../Results/node2vec/blogcatalog.model")
else:
    embedding_model = learn_features(total_graph, dim, walks_per_node, walk_length, context_size, p, q, SGD_epochs)
    embedding_model.save("../Results/node2vec/{}.model".format(dataset_name))

### Evaluate

In [5]:
## Create 5-fold validation set for NC

import utils

NC_5folds = {}
kf = KFold(n_splits=5, shuffle=True)
nodes = np.array([i for i in range(total_graph['N_nodes'])])
for i, (train_index, test_index) in enumerate(kf.split(nodes)):  
    NC_5folds[i] = {"train":nodes[train_index], "test":nodes[test_index]}


reverse_fraction = 0
LP_test_X_unb, LP_test_Y_unb, training_graph_unbalanced, test_graph_unbalanced = utils.split_graphs(total_graph, directed=True)
LP_test_X, LP_test_Y = utils.balance_test_graph(total_graph, LP_test_X_unb, LP_test_Y_unb, test_graph_unbalanced, directed=True, reverse_fraction=reverse_fraction)
LP_train_X, LP_train_Y = utils.balance_training_graph(training_graph_unbalanced, total_graph, directed=True)


splitting graphs
0.099932478055368
0.199864956110736
0.299797434166104
0.399729912221472
0.49966239027683995
0.599594868332208
0.699527346387576
0.799459824442944
0.8993923024983119
0.9993247805536799
balancing test graph
0.19993998049366044
0.39987996098732087
0.5998199414809813
0.7997599219746417
0.9996999024683022
balancing training graph
0.099932478055368
0.199864956110736
0.299797434166104
0.399729912221472
0.49966239027683995
0.599594868332208
0.699527346387576
0.799459824442944
0.8993923024983119
0.9993247805536799


### Evaluate NC 

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from  sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

mb = MultiLabelBinarizer(classes=[i for i in range(total_graph['N_classes'])])
scaler = StandardScaler()


f1_macro_list = []
f1_micro_list = []
# 5-fold cross validation
for i in range(5):
    print(i)
    training_nodes = NC_5folds[i]['train']
    test_nodes = NC_5folds[i]['test']
    X_train = scaler.fit_transform(np.array([embedding_model.wv[node] for node in training_nodes], dtype=object))
    X_test = scaler.fit_transform(np.array([embedding_model.wv[node] for node in test_nodes], dtype=object))
    # For the datasets that only have one one label per node, it gives better results to not use multioutputclassifier
    if not total_graph['Multioutput']:
        Y_train_sequence = np.array([total_graph['groups'][node][0]  for node in training_nodes],dtype=int)
        Y_test_sequence = np.array([total_graph['groups'][node][0] for node in test_nodes], dtype=int)
        log_reg = LogisticRegression(multi_class="ovr", max_iter=200)
        Y_train = Y_train_sequence
        Y_test = Y_test_sequence
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)
        Y_pred = utils.onehot(Y_pred, total_graph['N_classes'])
        Y_test = utils.onehot(Y_test, total_graph['N_classes'])
    else:
      
        Y_train_sequence = np.array([total_graph['groups'][node]  for node in training_nodes], dtype=object)
        Y_test_sequence = np.array([total_graph['groups'][node] for node in test_nodes], dtype=object)
        Y_train = mb.fit_transform(Y_train_sequence)
        Y_test = mb.fit_transform(Y_test_sequence)
        log_reg = MultiOutputClassifier(LogisticRegression(multi_class="ovr"))
        log_reg.fit(X_train, Y_train)
        Y_pred = log_reg.predict(X_test)
        
    f1_macro = utils.compute_f1_macro(Y_test, Y_pred, total_graph['N_classes'])
    f1_micro = utils.compute_f1_micro(Y_test, Y_pred,total_graph['N_classes'])
    f1_macro_list.append(f1_macro)
    f1_micro_list.append(f1_micro)
    print(f1_macro, f1_micro)
    
print(np.mean(f1_micro_list))
print(np.mean(f1_macro_list))

0
0.18332972717212054 0.23223684210526316
1
0.19528937541194544 0.25263157894736843
2
0.17675910029959113 0.22894736842105262
3
0.1708518712697923 0.23026315789473684
4
0.16841645376564962 0.22236842105263158
0.23328947368421055
0.1789293055838198


### Evaluate LP

In [8]:
Y_train = LP_train_Y
Y_test = LP_test_Y

from sklearn.metrics import roc_auc_score

# build representation of edge datasets using inner product of the representation of the two nodes
X_train = np.zeros((len(LP_train_X), 1))
for i, edge in enumerate(LP_train_X):
    u = edge[0]
    v = edge[1]
    X_train[i] = utils.get_edge_representation(embedding_model.wv[u], embedding_model.wv[v])
X_test = np.zeros((len(LP_test_X), 1))
for i, edge in enumerate(LP_test_X):
    u = edge[0]
    v = edge[1]
    X_test[i] = utils.get_edge_representation(embedding_model.wv[u], embedding_model.wv[v])
    
print("fit model")
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
Y_probs = classifier.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(Y_test, Y_probs)
print(roc_auc)
  

fit model
0.9601912175587157


### Save results

In [10]:


with open("../Results/node2vec/{}_metrics{}.csv".format(dataset_name, reverse_fraction), "w") as file:
    settings_str = "Results for Node2vec embedding generated with p={}, q={}, walk length={}, walks per node={}, sgd_epochs={}\n".format(p,q,
    walk_length, walks_per_node, SGD_epochs)
    file.write(settings_str)
    #header = "Dataset; F1 macro; F1 micro; ROC-AUC \n"
    header = "Dataset; F1 macro; F1 micro; ROC-AUC_{} \n".format(reverse_fraction)
    file.write(header)
    data_row = "{dataset};{f1mac};{f1mic};{roc}".format(dataset=dataset_name, f1mac=np.mean(f1_macro_list), f1mic=np.mean(f1_micro_list), roc=roc_auc)
    file.write(data_row)

