In [None]:

import numpy as np
from gensim.models import keyedvectors
import sys
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from sklearn.model_selection import KFold
from copy import deepcopy


### Load and preprocess data

In [None]:
dataset_name = "BlogCatalog"
data_dir = "./Data/BlogCatalog-dataset/data"


def _count_rows(path):
    """Return the number of non-blank lines in the text file at `path`."""
    with open(path, "r") as file:
        return sum(1 for line in file if line.strip())


def _read_int_pairs(path):
    """Yield the first two comma-separated fields of every non-blank line as ints."""
    with open(path, "r") as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                continue
            first, second = line.split(",")[:2]
            yield int(first), int(second)


# The row count of groups.csv is used both as the group count and as the number
# of classes, so the file only needs to be read once.
N_groups = _count_rows(data_dir+"/groups.csv")
N_nodes = _count_rows(data_dir+"/nodes.csv")
N_classes = N_groups
print(N_classes)

# Nodes are addressed by the 1-based ids 1..N_nodes.
total_graph = {"edges": {node: [] for node in range(1, N_nodes+1)},
               "nodes": list(range(1, N_nodes+1)),
               "groups": {node: [] for node in range(1, N_nodes+1)},
               "N_nodes": N_nodes}

# Every row of edges.csv is stored in the adjacency lists of both endpoints.
edge_list = []
for node1, node2 in _read_int_pairs(data_dir+"/edges.csv"):
    total_graph['edges'][node1].append(node2)
    total_graph['edges'][node2].append(node1)
    edge_list.append([node1, node2])
N_edges = len(edge_list)

# Every row of group-edges.csv assigns one group label to one node.
for node, group in _read_int_pairs(data_dir+"/group-edges.csv"):
    total_graph['groups'][node].append(group)

# Nodes with more than one neighbour; used later as start points when edges are held out.
good_nodes = [node for node in total_graph["nodes"]
              if len(total_graph['edges'][node]) > 1]

print(len(good_nodes))



In [None]:
## Create 5-fold validation set

# Fixed seed so the shuffled folds (and therefore the reported node-classification
# scores) are the same on every run of the notebook.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
nodes = np.arange(1, N_nodes+1)

# fold index -> {"train": array of node ids, "test": array of node ids}
NC_5folds = {
    fold: {"train": nodes[train_index], "test": nodes[test_index]}
    for fold, (train_index, test_index) in enumerate(kf.split(nodes))
}



In [None]:
# Hold out 50% of the edges as positive link-prediction (LP) test examples and
# pair each of them with one randomly drawn non-edge as a negative example.
# An edge is only removed when both endpoints keep at least one neighbour in the
# training graph, so no node becomes isolated (this alone does not guarantee that
# the training graph stays connected).
# NOTE(review): sampling uses NumPy's global RNG without a seed, so the split
# differs between runs.
n_test_samples = int(N_edges*0.5)
training_graph = deepcopy(total_graph["edges"])
LP_test_X = []
LP_test_Y = []
counter = 0

# Current degree of every node in the (shrinking) training graph.
n_neighbors = {node: len(neighbors) for node, neighbors in training_graph.items()}

all_nodes = [i+1 for i in range(N_nodes)]
edges_indices = [i for i in range(N_edges)]   # not used further in this cell
removed_edge_indices = set()                  # not used further in this cell

# Convert once; np.random.choice would otherwise rebuild an array from the list on every draw.
good_node_array = np.asarray(good_nodes)
all_node_array = np.asarray(all_nodes)

# Report progress roughly every 10%. The step is at least 1 so the modulo below
# cannot divide by zero when fewer than 10 edges are held out.
progress_step = max(1, n_test_samples // 10)

# NOTE(review): the loop ends only after n_test_samples removable edges were found;
# if the graph runs out of removable edges it will not terminate.
while counter < n_test_samples:
    node1 = np.random.choice(good_node_array)
    if n_neighbors[node1] <= 1:
        continue
    node2 = np.random.choice(training_graph[node1])
    if n_neighbors[node2] <= 1:
        continue

    n_neighbors[node1] -= 1
    n_neighbors[node2] -= 1

    # Add to test data
    LP_test_X.append([node1, node2])
    LP_test_Y.append(1)

    # remove edge from training graph
    training_graph[node1].remove(node2)
    training_graph[node2].remove(node1)

    # Draw one negative example: a pair of distinct nodes that is not an edge of
    # the full graph. np.random.choice samples with replacement, so the same node
    # can be drawn twice; such self-pairs are rejected explicitly.
    while True:
        neg_u, neg_v = np.random.choice(all_node_array, 2)
        if neg_u != neg_v and neg_u not in total_graph['edges'][neg_v]:
            LP_test_X.append([neg_u, neg_v])
            LP_test_Y.append(0)
            break

    counter += 1

    if counter % progress_step == 0:
        print(counter/n_test_samples)

In [None]:
# Build the LP training set. Every edge left in the training graph becomes one
# positive example (added once, although it is stored in the adjacency lists of
# both endpoints), and n_test_samples randomly drawn non-edges become negatives.
# NOTE(review): sampling uses NumPy's global RNG without a seed.
LP_train_X = []
LP_train_Y = []
all_nodes = [i+1 for i in range(N_nodes)]
added_edges = {i+1:{} for i in range(N_nodes)}
for node, neighbors in training_graph.items():
    for nb in neighbors:
        if nb in added_edges[node]:
            continue  # already added when visiting the other endpoint
        added_edges[node][nb] = True
        added_edges[nb][node] = True
        LP_train_X.append([node, nb])
        LP_train_Y.append(1)

# Convert once; np.random.choice would otherwise rebuild an array from the list on every draw.
all_node_array = np.asarray(all_nodes)

# Report progress roughly every 10%. The step is at least 1 so the modulo below
# cannot divide by zero when n_test_samples is smaller than 10.
progress_step = max(1, n_test_samples // 10)

for i in range(n_test_samples):
    # A negative is a pair of distinct nodes that is not an edge of the FULL graph,
    # so held-out test edges are never used as negatives. np.random.choice samples
    # with replacement, hence the explicit check against self-pairs.
    while True:
        neg_u, neg_v = np.random.choice(all_node_array, 2)
        if neg_u != neg_v and neg_u not in total_graph['edges'][neg_v]:
            LP_train_X.append([neg_u, neg_v])
            LP_train_Y.append(0)
            break

    if i % progress_step == 0:
        print(i/n_test_samples)


### Functions for training  

In [None]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that announces the start of each epoch."""

    def __init__(self):
        # Number of epochs that have finished so far.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the index of the epoch that is about to start."""
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        """Advance the epoch counter after an epoch has finished."""
        self.epoch = self.epoch + 1


def alias_sample(prev, current, neighbors, p, q):
    """Draw the next node of a biased random walk.

    The candidates are `prev` (unnormalised weight 1/p), `current` (weight 1)
    and every entry of `neighbors` (weight 1/q each). The weights are
    normalised to a probability distribution and one candidate is drawn with
    np.random.choice.

    NOTE(review): despite the name, no alias table is built. The weights also
    appear to differ from the node2vec search bias (which depends on each
    neighbour's distance to `prev`) -- confirm this simplification is intended.

    Parameters:
        prev: node visited before `current`.
        current: node the walk is at now.
        neighbors: sequence of nodes adjacent to `current`.
        p: return parameter; `prev` gets weight 1/p.
        q: in-out parameter; each neighbour gets weight 1/q.

    Returns:
        The sampled candidate node.
    """
    candidates = [prev, current]
    candidates.extend(neighbors)

    # Same order as `candidates`: return weight, stay weight, then one 1/q per neighbour.
    weights = np.concatenate(([1/p, 1], np.ones_like(neighbors)*1/q))
    probabilities = weights/np.sum(weights)

    return np.random.choice(candidates, p=probabilities)
    

def learn_features(G, dim, walks_per_node, walk_length, context_size, p, q, SGD_epochs):
    """Generate random walks from every node and train an embedding on them.

    Parameters:
        G: graph dict; G["nodes"] lists the start nodes and the dict is passed
            unchanged to node2vec_walk.
        dim: embedding dimension passed to SDG.
        walks_per_node: number of passes over all nodes (one walk per node per pass).
        walk_length: number of steps of each walk.
        context_size: context window passed to SDG.
        p, q: walk bias parameters passed to node2vec_walk.
        SGD_epochs: number of training epochs passed to SDG.

    Returns:
        Whatever SDG returns for the collected walks.
    """
    # Start from an empty list: pre-filling it with empty walks would hand
    # empty "sentences" to the embedding training.
    walks = []
    for i in range(walks_per_node):
        print(i)
        for node in G["nodes"]:
            walks.append(node2vec_walk(G, node, walk_length, p, q))
    return SDG(walks, context_size, dim, SGD_epochs)


def node2vec_walk(G, start_node, walk_length, p, q):
    """Simulate one biased random walk.

    Parameters:
        G: graph dict; G['edges'][node] is the neighbour list of `node`.
        start_node: first node of the walk.
        walk_length: number of steps to take.
        p, q: bias parameters forwarded to alias_sample.

    Returns:
        A list of walk_length + 1 nodes that begins with start_node.
    """
    path = [start_node]
    # On the first step there is no earlier node, so the start node doubles as "previous".
    previous = current = start_node
    for _ in range(walk_length):
        following = alias_sample(previous, current, G['edges'][current], p, q)
        path.append(following)
        previous, current = current, following
    return path
    

def SDG(walks, context_size=10, dim=128, n_epochs=5):
    """Train a Word2Vec model on the walks and return it.

    Parameters:
        walks: iterable of walks, passed to Word2Vec as `sentences`.
        context_size: passed as the Word2Vec `window`.
        dim: passed as the Word2Vec `vector_size`.
        n_epochs: passed as the Word2Vec `epochs`.
    """
    word2vec_options = dict(
        sentences=walks,
        vector_size=dim,
        window=context_size,
        min_count=0,      # do not skip "words" that occur only once
        sg=1,             # use the skip-gram algorithm
        workers=8,
        epochs=n_epochs,
        compute_loss=True,
        callbacks=[EpochLogger()],
    )
    return Word2Vec(**word2vec_options)

### Train embedding model

In [None]:

# Parameters taken from original node2vec paper:
dim = 128    # should be 128
walks_per_node = 10
walk_length = 80    # should be 80
context_size = 10
# From Khosla et al. these were the best performing settings in most cases:
p = 0.25
q = 4
SGD_epochs = 1

model_path = "./Results/BlogCatalog/blogcatalog.model"

USE_PRETRAINED = True
# SAVE_BOOL is also read after this cell (when the metrics file is written), so
# it must be defined even when the pretrained model is loaded.
SAVE_BOOL = True

if USE_PRETRAINED:
    embedding_model = Word2Vec.load(model_path)
else:
    embedding_model = learn_features(total_graph, dim, walks_per_node, walk_length, context_size, p, q, SGD_epochs)
    if SAVE_BOOL:
        embedding_model.save(model_path)

### Functions for evaluation tasks

In [139]:
def precision_and_recall(Y_true, Y_pred):
    """Count true positives, false positives and false negatives per class.

    Despite its name, the function returns raw counts, not precision/recall.

    Parameters:
        Y_true: sequence of rows of 0/1 indicators; the class count is taken
            from the length of the first row.
        Y_pred: sequence of predicted rows, indexed like Y_true.

    Returns:
        (TP_list, FP_list, FN_list), each a list with one count per class.
    """
    n_labels = len(Y_true[0])
    true_pos = [0]*n_labels
    false_pos = [0]*n_labels
    false_neg = [0]*n_labels

    for row, predicted in enumerate(Y_pred):
        for label in range(n_labels):
            guess = predicted[label]
            if guess == 1:
                truth = Y_true[row][label]
                if truth == 1:
                    true_pos[label] += 1
                elif truth == 0:
                    false_pos[label] += 1
            elif guess == 0:
                if Y_true[row][label] == 1:
                    false_neg[label] += 1

    return true_pos, false_pos, false_neg

def compute_f1_macro(Y_true, Y_pred):
    """Return the macro-averaged F1 score: the mean of the per-class F1 scores.

    A class without any true positive contributes an F1 of 0.
    """
    label_count = len(Y_true[0])
    tp_counts, fp_counts, fn_counts = precision_and_recall(Y_true, Y_pred)

    # Per-class F1 = TP / (TP + (FP + FN) / 2)
    per_label_f1 = [
        0 if tp_counts[k] == 0
        else tp_counts[k]/(tp_counts[k]+0.5*(fp_counts[k]+fn_counts[k]))
        for k in range(label_count)
    ]
    return np.sum(per_label_f1)/label_count


def compute_f1_micro(Y_true, Y_pred):
    """Return the micro-averaged F1 score.

    The true-positive, false-positive and false-negative counts are summed
    over all classes before F1 = TP / (TP + (FP + FN) / 2) is evaluated.

    Returns 0.0 when there is no positive label and no positive prediction
    (TP + FP + FN == 0) instead of dividing zero by zero; this matches
    compute_f1_macro, which scores classes without true positives as 0.
    """
    TP_list, FP_list, FN_list = precision_and_recall(Y_true, Y_pred)
    TP = np.sum(TP_list)
    FP = np.sum(FP_list)
    FN = np.sum(FN_list)
    denominator = TP + 0.5*(FN+FP)
    if denominator == 0:
        return 0.0
    return TP/denominator


def compute_accuracy(Y_true, Y_pred):
    """Return the fraction of individual label entries that were predicted correctly.

    Every (sample, class) cell counts as one decision; the class count is
    taken from the length of the first row of Y_true.
    """
    label_count = len(Y_true[0])
    matches = [
        predicted[col] == Y_true[row][col]
        for row, predicted in enumerate(Y_pred)
        for col in range(label_count)
    ]
    hits = sum(1 for matched in matches if matched)
    return hits/len(matches)

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)) of `z`."""
    decay = np.exp(-z)
    return 1/(1 + decay)

def get_edge_representation(fu,fv):
    """Return the edge feature for two node embeddings: the sigmoid of their dot product."""
    similarity = np.dot(fu, fv)
    return sigmoid(similarity)



### Evaluate node classification (NC)

In [118]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

# Group ids 1..N_classes are fixed up front, so the indicator columns are the
# same for every fold regardless of which labels occur in it.
mb = MultiLabelBinarizer(classes=[i+1 for i in range(N_classes)])

f1_macro_list = []
f1_micro_list = []
accuracy_scores_list = []
# Cross validation over the folds prepared in NC_5folds
for i, fold in NC_5folds.items():
    print(i)
    training_nodes = fold['train']
    test_nodes = fold['test']

    # One embedding vector per node, stacked into a plain float matrix.
    X_train = np.array([embedding_model.wv[node] for node in training_nodes], dtype=np.float64)
    X_test = np.array([embedding_model.wv[node] for node in test_nodes], dtype=np.float64)

    # Each node has a (possibly empty) list of group ids -> multi-label indicator matrix.
    Y_train_sequence = [total_graph['groups'][node] for node in training_nodes]
    Y_test_sequence = [total_graph['groups'][node] for node in test_nodes]
    Y_train = mb.fit_transform(Y_train_sequence)
    # Only transform the test labels; the binarizer must not be re-fitted on test data.
    Y_test = mb.transform(Y_test_sequence)

    # One independent binary logistic regression per group. The deprecated
    # multi_class argument is not passed: every per-group target is binary.
    log_reg = MultiOutputClassifier(LogisticRegression())
    log_reg.fit(X_train, Y_train)
    Y_pred = log_reg.predict(X_test)

    accuracy_scores_list.append(compute_accuracy(Y_test, Y_pred))
    f1_macro_list.append(compute_f1_macro(Y_test, Y_pred))
    f1_micro_list.append(compute_f1_micro(Y_test, Y_pred))

print(np.mean(f1_micro_list))
print(np.mean(f1_macro_list))

0
(2063, 39)
1
(2063, 39)
2
(2062, 39)
3
(2062, 39)
4
(2062, 39)
0.2933387375465004
0.16219503330439097


In [None]:
# Write the node-classification metrics: a free-text settings line, a header
# row and one data row with the scores averaged over the folds.
if SAVE_BOOL:
    with open("./Results/BlogCatalog/metrics.csv", "w") as file:
        settings_str = "Node2vec embedding generated with p={}, q={}, walk length={}, walks per node={}, sgd_epochs={}\n".format(
            p, q, walk_length, walks_per_node, SGD_epochs)
        header = "Dataset; Total Accuracy; F1 macro; F1 micro\n"
        data_row = "{dataset};{acc};{f1mac};{f1mic}".format(
            dataset=dataset_name,
            acc=np.mean(accuracy_scores_list),
            f1mac=np.mean(f1_macro_list),
            f1mic=np.mean(f1_micro_list),
        )
        file.write("".join((settings_str, header, data_row)))



### Evaluate link prediction (LP)

### Save results

In [None]:
from sklearn.metrics import roc_auc_score


def _edge_feature_matrix(edges, vectors):
    """Return an (n_edges, 1) float array holding one feature per edge.

    `edges` is a sequence of [u, v] node pairs and `vectors` maps a node id to
    its embedding; the feature is get_edge_representation of the two embeddings.
    """
    features = np.zeros((len(edges), 1))
    for row, (u, v) in enumerate(edges):
        features[row] = get_edge_representation(vectors[u], vectors[v])
    return features


# Link prediction: classify node pairs as edge (1) / non-edge (0) from the edge feature.
Y_train = LP_train_Y
Y_test = LP_test_Y
X_train = _edge_feature_matrix(LP_train_X, embedding_model.wv)
X_test = _edge_feature_matrix(LP_test_X, embedding_model.wv)

# LogisticRegression is imported in the node-classification evaluation cell.
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
# Column 1 of predict_proba is the probability of the positive class (label 1).
Y_probs = classifier.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(Y_test, Y_probs)
print(roc_auc)
  