Link for the data and embeddings: https://drive.google.com/drive/folders/1PKT2xfyovvTEYlCdyBIr-a1HXWNzjfKV?usp=sharing

Link to have a full vision of our project: https://drive.google.com/drive/folders/1124Fy4Vh3syHdZImZlseOGwuMK4XEUVK?usp=sharing

We have provided the code for this challenge, but we additionaly include this notebook which uses pre-computed embeddings available on the drive folder mentioned below, to reproduce the results in an easier way.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#link to the data in the drive
%cd "/content/drive/MyDrive/data_challenge_2021/ALTEGRAD-challenge---Link-prediction/"

/content/drive/.shortcut-targets-by-id/1UTpO-qa1QNYNkfp66OwNAuhMDSV6K3ZX/data_challenge_2021/ALTEGRAD-challenge---Link-prediction


In [48]:
!pip install -q transformers nodevectors

[K     |████████████████████████████████| 3.5 MB 14.6 MB/s 
[K     |████████████████████████████████| 895 kB 10.5 MB/s 
[K     |████████████████████████████████| 6.8 MB 32.3 MB/s 
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[K     |████████████████████████████████| 596 kB 84.5 MB/s 
[?25h  Building wheel for nodevectors (setup.py) ... [?25l[?25hdone
  Building wheel for csrgraph (setup.py) ... [?25l[?25hdone


## Import packages and graph

In [49]:
import pickle
import gzip
import os
from tqdm import tqdm
import networkx as nx
import torch
from transformers import AutoTokenizer, AutoModel
from nodevectors import Node2Vec
import numpy as np

In [None]:
# Create a graph
G = nx.read_edgelist('edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int)
nodes = list(G.nodes())
n = G.number_of_nodes()
m = G.number_of_edges()

In [None]:
#load authors and abstracts
authors = dict()
with open('../authors.txt', 'r',  encoding="utf8") as f:
    for line in f:
        node, author = line.split('|--|')
        authors[int(node)] = author

abstracts = dict()
with open('../abstracts.txt', 'r',  encoding="utf8") as f:
    for line in f:
        node, abstract = line.split('|--|')
        abstracts[int(node)] = abstract

## Useful functions

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import log_loss
import scipy
from scipy import spatial 

def get_training_graph(graph, edges_to_remove):
    res_graph = graph.copy()
    for edge in edges_to_remove:
        res_graph.remove_edge(edge[0], edge[1])
    return res_graph

def load_features(filename):
    with gzip.open(filename, "rb") as f:
        loaded_object = pickle.load(f)
        return loaded_object

def load_adjacency_author(authors, path_adj = 'adjacencyfinal.npz'):
  A = scipy.sparse.load_npz(path_adj)
  list_all_authors = []
  aut = list(authors.values())
  for x in aut:
    x_list = x.split(",")
    x_list[-1] = x_list[-1][:-1]
    for auth in x_list:
      list_all_authors.append(auth)

  unique_authors = np.unique(list_all_authors)

  aut_to_index = {}
  i=0
  for auth in unique_authors:
    aut_to_index[auth] = i
    i+=1
  return A, aut_to_index
  
def cosine_sim(arr1, arr2, eps=0.001):
    norm1 = np.linalg.norm(arr1)
    norm2 = np.linalg.norm(arr2)
    if norm1 < eps or norm2 < eps:
      return 0
    return np.dot(arr1, arr2)/(np.linalg.norm(arr1)*np.linalg.norm(arr2))

def return_metrics(true, preds, thres=0.5):
    preds_label = np.where(preds > thres, 1, 0)
    f1 = f1_score(true, preds_label)
    auc = roc_auc_score(true, preds)
    logloss = log_loss(true, preds.astype(np.float64))
    return f1, auc, logloss

def extract_features2(graph, authors, n2v, t2v,a2v, samples, pagerank, gd, partition):

    features = list()
    A, aut_to_index = load_adjacency_author(authors, path_adj = 'adjacencyfinal.npz')

    for edge in tqdm(samples):
         ## Graph features

        sum_dg = graph.degree(edge[0]) + graph.degree(edge[1])
        diff_dg = abs(graph.degree(edge[0]) - graph.degree(edge[1]))

        AAI = list(nx.adamic_adar_index(graph, [(edge[0], edge[1])]))[0][2]
        JC = list(nx.jaccard_coefficient(graph, [(edge[0], edge[1])]))[0][2]
        # Preferential Attachment
        PA = list(nx.preferential_attachment(graph, [(edge[0], edge[1])]))[0][2]
        # Common Neighbors
        CN = len(list(nx.common_neighbors(graph, u=edge[0], v=edge[1])))
        # Page Rank
        PR = np.log(pagerank[edge[0]] * pagerank[edge[1]])
        # Katz
        #KZ = np.log(katz[node_left] * katz[node_right])
        
        #other graph features
        if partition[edge[0]] == partition[edge[1]]:
            com_partition = 1
        else:
            com_partition = 0
        cluster_coeff = gd["clustering_coeff"][edge[0]] * gd["clustering_coeff"][edge[1]]
        eigenvector = gd["eigenvector"][edge[0]] * gd["eigenvector"][edge[1]]

        cosine_node = cosine_sim(n2v[edge[0]], n2v[edge[1]])
        cosine_author = cosine_sim(a2v[edge[0]], a2v[edge[1]])
        #cosine_abstract = cosine_sim(t2v[edge[0]], t2v[edge[1]])
        #distance between sentence embeddings using word2vec
        dist_abstract = np.linalg.norm(t2v[edge[0]] - t2v[edge[1]])
        
        features_final = np.concatenate([n2v[edge[0]], n2v[edge[1]], a2v[edge[0]], a2v[edge[1]]])
        
        # Common Authors
        authors_left = authors[edge[0]]
        authors_right = authors[edge[1]]
        L1 = list(set(authors_left.strip().split(',')))
        L2 = list(set(authors_right.strip().split(',')))
        colab = 0
        for author in L1:
          for author2 in L2:
            colab += A[aut_to_index[author], aut_to_index[author2]] 

        colab_mean = colab/(len(authors_left)*len(authors_right))

        if authors_left is None or authors_right is None:
            common_authors = float('nan')
        else:
            common_authors = len(list(set(authors_left.strip().split(',')).intersection(authors_right.strip().split(','))))

        total_features = list(features_final) + [JC, AAI, PR, PA, CN, com_partition, cluster_coeff, eigenvector, cosine_node, dist_abstract, cosine_author, 
                                                 sum_dg, diff_dg, common_authors, colab, colab_mean]

        features.append(total_features)

    return np.stack(features)

## Word2vec for abstract

In [None]:
#@title
def abstract_embedding(abstract, model):
    num_features = model.trainables.layer1_size
    result = np.zeros(num_features)
    words = abstract.split()
    not_in_vocab = 0
    for word in words:
        if word in model.wv.vocab:
            result += model.wv[word]
        else:
            not_in_vocab += 1
    if len(words) - not_in_vocab != 0:
        result /= (len(words) - not_in_vocab)
    else:
        result = 0
    return result

In [None]:
#@title
from gensim.models import word2vec
import nltk

# Tokenization
print("Loading stemmer and stop words...")
nltk.download('punkt')
stemmer = nltk.stem.PorterStemmer()
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))

In [None]:
#@title
#Word training corpus
training_words = []
for node in nodes:
  training_words += [word for word in abstracts[node].split() if word not in stpwds]
training_words = list(set(training_words))

In [None]:
#@title
model = word2vec.Word2Vec(training_words, workers=4, size=64, min_count=5, window=20)

In [None]:
#@title
!pwd

/content/drive/.shortcut-targets-by-id/1UTpO-qa1QNYNkfp66OwNAuhMDSV6K3ZX/data_challenge_2021/ALTEGRAD-challenge---Link-prediction


In [None]:
#@title
model.save("wv_model.model")

In [None]:
#@title
#wv_model = word2vec.Word2Vec.load(path_wv_model)
print("Creating word embeddings")
abstract_emb = dict()
for node in tqdm(nodes):
  abstract_emb[node] = abstract_embedding(abstracts[node], model)

Creating word embeddings
Making feature vectors for the abstract...


100%|██████████| 138499/138499 [00:07<00:00, 19134.10it/s]


In [None]:
#@title
print('saving embeddings')
file = gzip.GzipFile('../abstract_word2vec_64.emb', 'wb')
file.write(pickle.dumps(abstract_emb))
file.close()

saving embeddings


Using pre-computed embeddings

In [None]:
ABSTRACT_EMBEDDING_FILENAME = '../text_emb.emb'
WORD2VEC = '../abstract_word2vec_64.emb'
EMBEDDING_FILENAME = '../NETMF42_emb.emb'
AUTHORS_EMBEDDING = '../authors_emb.emb'
# Features related to the texts
text2vec = load_features(WORD2VEC)
aut2vec = load_features(AUTHORS_EMBEDDING)
nodes2vec = load_features(EMBEDDING_FILENAME)

Split data into training and validationsets

In [None]:
from random import randint
import community.community_louvain as com

#create m (number edges) negative samples (i.e pair of nodes with no edge)
non_edges = []
#generate only negative samples 70%
for i in tqdm(range(int(m))):
  n1 = nodes[randint(0, n-1)]
  n2 = nodes[randint(0, n-1)]
  while (n1, n2) in G.edges() or n1==n2:
      n1 = nodes[randint(0, n-1)]
      n2 = nodes[randint(0, n-1)]
  non_edges.append((n1, n2))

In [None]:
all_edges = list(G.edges())
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7f058afe8410>

In [None]:
np.random.shuffle(all_edges)
np.random.shuffle(non_edges)

print(f"Number of positive examples : {len(all_edges)}")
print(f"Number of negative examples : {len(non_edges)}")

Number of positive examples : 1091955
Number of negative examples : 1091955


In [None]:
number_pos_dev = int(0.1 * len(all_edges))
number_neg_dev = int(0.1* len(non_edges))

print(f"The number of pos examples : Dev {number_pos_dev} / Training {len(all_edges) - number_pos_dev}.")
print(f"The number of neg examples : Dev {number_neg_dev} / Training {len(non_edges) - number_pos_dev}.")

The number of pos examples : Dev 109195 / Training 982760.
The number of neg examples : Dev 109195 / Training 982760.


In [None]:
pos_samples_dev = all_edges[:number_pos_dev]
neg_samples_dev = non_edges[:number_neg_dev]
pos_samples_train = list(set(all_edges) - set(pos_samples_dev))
neg_samples_train = list(set(non_edges) - set(neg_samples_dev))

In [None]:
graph = get_training_graph(G, pos_samples_dev)
pr = nx.pagerank(graph,alpha=0.85, max_iter=200)

In [None]:
graph_dicts = dict()
graph_dicts["clustering_coeff"] = nx.algorithms.cluster.clustering(graph)
graph_dicts["eigenvector"] = nx.algorithms.centrality.eigenvector_centrality(graph)
partition = com.best_partition(graph)

In [None]:
#split data
train_samples = pos_samples_train + neg_samples_train
train_labels = [1 for x in pos_samples_train] + [0 for x in neg_samples_train]
dev_samples = pos_samples_dev + neg_samples_dev
dev_labels = [1 for x in pos_samples_dev] + [0 for x in neg_samples_dev]

In [None]:
X_train = extract_features2(graph, authors, nodes2vec, text2vec,aut2vec, train_samples, pr, graph_dicts, partition)

100%|██████████| 1965492/1965492 [27:04<00:00, 1210.12it/s]


In [None]:
X_dev = extract_features2(graph, authors, nodes2vec, text2vec,aut2vec, dev_samples, pr, graph_dicts, partition)

100%|██████████| 218390/218390 [03:00<00:00, 1209.39it/s]


In [None]:
# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))
        
X_test =  extract_features2(graph, authors, nodes2vec, text2vec, aut2vec, node_pairs, pr, graph_dicts, partition)

100%|██████████| 106692/106692 [01:31<00:00, 1169.89it/s]


In [None]:
np.save('X_train_wv.npy', X_train)
np.save('X_dev_wv.npy', X_dev)
np.save('X_test_wv.npy', X_test)
np.save('train_labels_wv.npy', np.array(train_labels))
np.save('dev_labels_wv.npy', np.array(dev_labels))

## To read pre-computed training and validation matrices

In [None]:
X_train = np.load('X_train_all.npy')
X_dev = np.load('X_dev_all.npy')
X_test = np.load('X_test_all.npy')
train_labels = np.load('train_labels.npy')
dev_labels = np.load('dev_labels.npy')

## MLP

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import tensorflow as tf

def neural_net(X_train, y_train, X_validation, y_validation):
    print("NEURAL NET")
    early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=2, mode='auto')

    model = Sequential()
    model.add(Dense(160, input_dim=X_train.shape[1], activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(160, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  metrics=['accuracy']
                  )

    model.fit(X_train, y_train,
              verbose=1,
              epochs=200,
              batch_size=100,
              callbacks=[early_stop],
              validation_data=(X_validation, y_validation))

    y_pred = model.predict(X_validation)
    print(return_metrics(y_validation, y_pred))
    return model

In [50]:
model = neural_net(X_train, list(train_labels), X_dev, list(dev_labels))

### XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, accuracy_score, plot_roc_curve

In [None]:
clf = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=2000, n_jobs=4, tree_method='gpu_hist', predictor="gpu_predictor", random_state=42, seed=42)
clf.fit(X_train, train_labels, eval_metric="logloss", early_stopping_rounds=300, eval_set=[(X_dev, dev_labels)], verbose=1)

[0]	validation_0-logloss:0.613173
Will train until validation_0-logloss hasn't improved in 300 rounds.
[1]	validation_0-logloss:0.547619
[2]	validation_0-logloss:0.493281
[3]	validation_0-logloss:0.447015
[4]	validation_0-logloss:0.407836
[5]	validation_0-logloss:0.373888
[6]	validation_0-logloss:0.3446
[7]	validation_0-logloss:0.319157
[8]	validation_0-logloss:0.296564
[9]	validation_0-logloss:0.277044
[10]	validation_0-logloss:0.259598
[11]	validation_0-logloss:0.24444
[12]	validation_0-logloss:0.230825
[13]	validation_0-logloss:0.218558
[14]	validation_0-logloss:0.207303
[15]	validation_0-logloss:0.197545
[16]	validation_0-logloss:0.188846
[17]	validation_0-logloss:0.180731
[18]	validation_0-logloss:0.173659
[19]	validation_0-logloss:0.167402
[20]	validation_0-logloss:0.161871
[21]	validation_0-logloss:0.156519
[22]	validation_0-logloss:0.152122
[23]	validation_0-logloss:0.147919
[24]	validation_0-logloss:0.144201
[25]	validation_0-logloss:0.140646
[26]	validation_0-logloss:0.137792

XGBClassifier(n_estimators=2000, n_jobs=4, predictor='gpu_predictor',
              random_state=42, seed=42, tree_method='gpu_hist')

In [None]:
y_proba = clf.predict_proba(X_dev)
return_metrics(dev_labels, y_proba[:,1])

(0.9648527117555291, 0.9949885577687302, 0.09107243316769008)

### Submission

In [None]:
y_pred = clf.predict_proba(X_test)[:,1]

In [None]:
# Write predictions to a file
import csv
predictions = zip(range(len(y_pred)), y_pred)
with open(f"submission_new_trial.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row)