# Pip Install

In [None]:
!pip install nodevectors

# Import Module

In [19]:
# Graphs
import networkx as nx
from nodevectors import Node2Vec

# Utils
import os
import random
from tqdm.notebook import tqdm
import gzip
import pickle

# Data
import numpy as np
import pandas as pd

# Utils

In [18]:
def get_training_graph(graph, edges_to_remove):
  residual_g = graph.copy()

  for edge in edges_to_remove:
    residual_g.remove_edge(edge[0], edge[1])
  
  return residual_g

def save(object, filename, protocol = 0):
  """Saves a compressed object to disk
  """
  file = gzip.GzipFile(filename, 'wb')
  file.write(pickle.dumps(object, protocol))
  file.close()

# Load Data

In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/MLNS/data/training_set.txt', sep=' ', header=None)
df_test = pd.read_csv('/content/drive/MyDrive/MLNS/data/testing_set.txt', sep=' ', header=None)
node_info = pd.read_csv('/content/drive/MyDrive/MLNS/data/node_information.csv', header=None, names=['node_id', 'date', 'title', 'authors', 'journal', 'abstract'])

# Creating the graph

In [5]:
G = nx.DiGraph()
non_edges = []
for i in tqdm(range(len(df_train))):
  if df_train.iloc[i,2] == 1:
    G.add_edge(df_train.iloc[i,0], df_train.iloc[i,1])
  else:
    non_edges.append((df_train.iloc[i,0], df_train.iloc[i,1]))

for node in tqdm(node_info.node_id):
  if not (node in G.nodes()):
    G.add_node(node)

HBox(children=(FloatProgress(value=0.0, max=615512.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=27770.0), HTML(value='')))




# Split in Train/Dev three times

In [6]:
all_edges = list(G.edges())
non_edges_copy = non_edges.copy()

In [7]:
seed = 42
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
np.random.shuffle(all_edges)
np.random.shuffle(non_edges_copy)

In [8]:
print(f"Number of positive examples : {len(all_edges)}")
print(f"Number of negative examples : {len(non_edges_copy)}")

Number of positive examples : 335130
Number of negative examples : 280382


In [9]:
number_pos_dev = int(0.06 * len(all_edges))
number_neg_dev = int(0.06 * len(non_edges_copy))
print(f"The number of pos examples : Dev {number_pos_dev} / Training {len(all_edges) - number_pos_dev}.")
print(f"The number of neg examples : Dev {number_neg_dev} / Training {len(non_edges) - number_pos_dev}.")

The number of pos examples : Dev 20107 / Training 315023.
The number of neg examples : Dev 16822 / Training 260275.


In [10]:
print(f"Proportion of positive example in data : {len(all_edges)/(len(all_edges) + len(non_edges))}")
print(f"Proportion of positive example in training : {(len(all_edges) - number_pos_dev) / (len(all_edges) - number_pos_dev + len(non_edges) - number_pos_dev)}")
print(f"Proportion of positive example in dev : {number_pos_dev/(number_pos_dev+number_neg_dev)}")

Proportion of positive example in data : 0.5444735439763969
Proportion of positive example in training : 0.5475822964793898
Proportion of positive example in dev : 0.5444772401093991


We are good to do as we have the same proportion in all our data :)

In [11]:
pos_samples_dev_1 = all_edges[:number_pos_dev]
pos_samples_dev_2 = all_edges[number_pos_dev: 2*number_pos_dev]
pos_samples_dev_3 = all_edges[2*number_pos_dev: 3*number_pos_dev]
neg_samples_dev_1 = non_edges_copy[:number_neg_dev]
neg_samples_dev_2 = non_edges_copy[number_neg_dev: 2*number_neg_dev]
neg_samples_dev_3 = non_edges_copy[2*number_neg_dev: 3*number_neg_dev]

In [12]:
assert len(pos_samples_dev_3) == len(pos_samples_dev_2) == len(pos_samples_dev_1)
assert len(neg_samples_dev_3) == len(neg_samples_dev_2) == len(neg_samples_dev_1)

In [13]:
pos_samples_dev_1[:10]

[(9601170, 9406020),
 (9801159, 9704043),
 (5127, 5028),
 (112170, 9611077),
 (9811221, 9711011),
 (9508156, 9506052),
 (11272, 9901029),
 (9812158, 9605184),
 (109077, 3188),
 (2213, 9208011)]

In [14]:
neg_samples_dev_1[:10]

[(9206018, 9301129),
 (9805029, 9903043),
 (9311009, 9911068),
 (112083, 211278),
 (9601015, 3241),
 (12249, 107050),
 (111044, 9502047),
 (9906103, 106124),
 (9603012, 9306090),
 (9210152, 9903227)]

# Get 3 different graph

In [15]:
graph_1 = get_training_graph(G, pos_samples_dev_1)
graph_2 = get_training_graph(G, pos_samples_dev_2)
graph_3 = get_training_graph(G, pos_samples_dev_3)

# Node Embedding

In [16]:
### Global Parameters ###
n_components=128
walklen=80
epochs=30
return_weight=1
neighbor_weight=1
keep_walks=False
window=10
negative=4
iter=20
batch_words=128

## First Graph

In [None]:
g2v = Node2Vec(n_components=n_components, 
               walklen=walklen, 
               epochs=epochs,
               return_weight=return_weight,
               neighbor_weight=neighbor_weight,
               keep_walks=keep_walks,
               verbose=True,               
               threads=os.cpu_count(), 
               w2vparams={'window': window, 'negative':negative, 'iter': iter, 'batch_words':batch_words})
g2v.fit(graph_1)

Making walks... 



Done, T=3.23
Mapping Walk Names... Done, T=87.89
Training W2V... 

In [None]:
emb_per_nodes = {}
for node in tqdm(list(graph_1.nodes())):
    emb_per_nodes[node] = g2v.predict(node)
save(emb_per_nodes, '/content/drive/MyDrive/MLNS/data_generated/n2v_graph_1.files') # Save the node embedding

## Second Graph

In [17]:
g2v = Node2Vec(n_components=n_components, 
               walklen=walklen, 
               epochs=epochs,
               return_weight=return_weight,
               neighbor_weight=neighbor_weight,
               keep_walks=keep_walks,
               verbose=True,               
               threads=os.cpu_count(), 
               w2vparams={'window': window, 'negative':negative, 'iter': iter, 'batch_words':batch_words})
g2v.fit(graph_2)

Making walks... 



Done, T=4.17
Mapping Walk Names... Done, T=116.90
Training W2V... Done, T=4750.53


In [20]:
emb_per_nodes = {}
for node in tqdm(list(graph_2.nodes())):
    emb_per_nodes[node] = g2v.predict(node)
save(emb_per_nodes, '/content/drive/MyDrive/MLNS/data_generated/n2v_graph_2.files') # Save the node embedding

HBox(children=(FloatProgress(value=0.0, max=27770.0), HTML(value='')))




## Third graph

In [21]:
g2v = Node2Vec(n_components=n_components, 
               walklen=walklen, 
               epochs=epochs,
               return_weight=return_weight,
               neighbor_weight=neighbor_weight,
               keep_walks=keep_walks,
               verbose=True,               
               threads=os.cpu_count(), 
               w2vparams={'window': window, 'negative':negative, 'iter': iter, 'batch_words':batch_words})
g2v.fit(graph_3)

Making walks... Done, T=0.63
Mapping Walk Names... Done, T=117.65
Training W2V... Done, T=4747.62


In [22]:
emb_per_nodes = {}
for node in tqdm(list(graph_3.nodes())):
    emb_per_nodes[node] = g2v.predict(node)
save(emb_per_nodes, '/content/drive/MyDrive/MLNS/data_generated/n2v_graph_3.files') # Save the node embedding

HBox(children=(FloatProgress(value=0.0, max=27770.0), HTML(value='')))


