<a href="https://colab.research.google.com/github/nstsj/SN_final_project/blob/master/node2vec_neat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# all imports are here

import networkx as nx
# !pip3 install node2vec # un-comment if necessary
from node2vec import Node2Vec
import json
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import choice
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from keras.models import Sequential
from keras.layers import Dense
from keras import backend as K

## Let's create all the necessary functions:

```recall``` & ```precision``` for evaluation

```authors_to_emb``` concatenates the embeddings of a pair of authors into a single feature vector

```all_coauthors_pairs``` and ```random_not_coauthors``` to experiment on graphs



In [0]:
def recall(y_true, y_pred):
    """Batch-wise recall metric for Keras.

    Both the targets and the element-wise product ``y_true * y_pred`` are
    clipped to [0, 1] and rounded, so every entry counts as either 0 or 1.
    The result is (true positives) / (actual positives) for the current
    batch only; ``K.epsilon()`` keeps the division defined when the batch
    has no positive targets.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(relevant_mask) + K.epsilon())


def precision(y_true, y_pred):
    """Batch-wise precision metric for Keras.

    Both the predictions and the element-wise product ``y_true * y_pred``
    are clipped to [0, 1] and rounded, so every entry counts as either 0
    or 1. The result is (true positives) / (predicted positives) for the
    current batch only; ``K.epsilon()`` keeps the division defined when
    nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    selected_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(selected_mask) + K.epsilon())


def authors_to_emb(authors, embs):
    """Concatenate the embeddings of a pair of authors into one feature list.

    Parameters
    ----------
    authors : sequence
        Author keys; only ``authors[0]`` and ``authors[1]`` are used.
    embs : object
        Either something indexable by author key, or a model that exposes
        such an object as its ``wv`` attribute (the way the rest of this
        notebook accesses the embeddings).

    Returns
    -------
    list or None
        The first author's vector followed by the second author's vector,
        or ``None`` when either vector cannot be looked up.
    """
    # Prefer the keyed-vector store when the model has one; plain mappings
    # have no `wv` attribute and are used directly.
    vectors = getattr(embs, "wv", embs)
    try:
        return list(vectors[authors[0]]) + list(vectors[authors[1]])
    except (KeyError, IndexError, TypeError):
        # KeyError: unknown author; IndexError: fewer than two authors;
        # TypeError: `authors`/`embs` cannot be indexed. Anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return None


def all_coauthors_pairs(coauthors, exception_graph=None):
    """List every unordered pair of authors of one paper.

    Parameters
    ----------
    coauthors : sequence
        Author names of a single paper, in order.
    exception_graph : graph, optional
        When given, pairs of distinct authors that are already directly
        connected in this graph are left out, so only "new" collaborations
        are returned. The object only needs a ``has_edge(u, v)`` method.

    Returns
    -------
    list of [str, str] or None
        Pairs ``[coauthors[i], coauthors[j]]`` for ``i < j`` in input order,
        or ``None`` when there are fewer than two authors.
    """
    if len(coauthors) < 2:
        return None
    pairs = []
    for i, first in enumerate(coauthors):
        for second in coauthors[i + 1:]:
            # Use the graph that was passed in, not a notebook global.
            # "Distance exactly 1" is the same as "distinct and adjacent",
            # so an edge lookup replaces the shortest-path search; has_edge
            # returns False for nodes the graph has never seen.
            if (exception_graph is not None
                    and first != second
                    and exception_graph.has_edge(first, second)):
                continue
            pairs.append([first, second])
    return pairs


def random_not_coauthors(graph, need_num, authors):
    """Sample random pairs of authors that are not directly connected.

    Parameters
    ----------
    graph : graph
        Source of candidate nodes (``graph.nodes()``) and of the adjacency
        test (``graph.has_edge``).
    need_num : int
        Number of pairs wanted. At most ``need_num * 4`` draws are made, so
        fewer pairs may be returned.
    authors : container
        Both members of a pair must be in this collection.

    Returns
    -------
    list of [node, node]
        Pairs of distinct, non-adjacent nodes. The same pair can appear
        more than once because draws are independent.
    """
    pairs = []
    # The node list does not change during sampling; build it once.
    nodes = list(graph.nodes())
    if not nodes:
        return pairs
    for _ in range(need_num * 4):
        a1 = choice(nodes)
        a2 = choice(nodes)
        # For two distinct nodes, "shortest path longer than 1 (or no path)"
        # is exactly "no edge between them", so an edge lookup is enough.
        if (a1 != a2 and a1 in authors and a2 in authors
                and not graph.has_edge(a1, a2)):
            pairs.append([a1, a2])
        if len(pairs) >= need_num:
            break
    return pairs

## uncomment and use these cells if you're running code from Google Colab

In [8]:
#from google.colab import files
#files.upload()

Saving arxivData.json to arxivData.json


In [9]:
#! ls #let's check that it's really here

arxivData.json	sample_data


## main part

In [0]:
# Load the local arXiv dump; the result is iterated below as a sequence of
# paper records with 'year' and 'author' fields.
# The `with` block closes the file on exit, so no explicit close() is needed.
# The encoding is pinned so the load does not depend on the platform default.

with open('arxivData.json', encoding='utf-8') as json_data:
    dataset = json.load(json_data)

In [0]:
# Build the training co-authorship graph from papers published in 2014-2016.
#   graph       - one edge per pair of authors that share a paper
#   authors     - everyone who appears on a paper with at least two authors
#   X_train_val - author pairs: co-author pairs first, sampled negatives after
#   y_train_val - one-hot labels: [0, 1] = co-authors, [1, 0] = not co-authors

graph = nx.Graph()
authors = set()
X_train_val = []
for paper in dataset:
    if not 2014 <= paper['year'] <= 2016:  # filtering by year
        continue
    try:
        # The author field is stored as a string; swapping single quotes for
        # double quotes lets it be parsed as JSON. Records that still fail
        # to parse are skipped below.
        paper_authors = [author['name']
                         for author in json.loads(re.sub("'", '"', paper['author']))]
        coauthors = all_coauthors_pairs(paper_authors)
        if not coauthors:  # fewer than two authors: nothing to add
            continue
        authors.update(paper_authors)
        for pair in coauthors:
            graph.add_edge(pair[0], pair[1])
        X_train_val.extend(coauthors)
    except (ValueError, KeyError, TypeError):
        # Malformed record (invalid JSON, missing key, unexpected type): skip
        # it, but let anything else - e.g. KeyboardInterrupt - propagate.
        continue


# Positives collected so far; add the same number of random non-co-author pairs.
# NOTE(review): sampling is unseeded, so the negative set differs between runs.
data_size = len(X_train_val)
X_train_val.extend(random_not_coauthors(graph, data_size * 1, authors))

y_train_val = [[0.0, 1.0]] * data_size
y_train_val.extend([[1.0, 0.0]] * (len(X_train_val) - data_size))

In [31]:
data_size # number of positive pairs; NOTE(review): this name is reassigned by the test-set cell below, so the value shown depends on execution order

13887

In [0]:
# Build the test set from papers published in 2017 or later, restricted to
# authors already seen in the training period.
#   graph_test - every co-author pair among known authors (used for negatives)
#   X_test     - only pairs NOT already adjacent in the training `graph`,
#                followed by sampled non-co-author pairs
#   y_test     - one-hot labels: [0, 1] = co-authors, [1, 0] = not co-authors
graph_test = nx.Graph()
X_test = []
for paper in dataset:
    if not paper['year'] >= 2017:
        continue
    try:
        paper_authors = [author['name']
                         for author in json.loads(re.sub("'", '"', paper['author']))
                         if author['name'] in authors]
        coauthors = all_coauthors_pairs(paper_authors)
        if not coauthors:  # fewer than two known authors: nothing to add
            continue
        for pair in coauthors:
            graph_test.add_edge(pair[0], pair[1])
        # Keep only collaborations that are new relative to the training graph.
        X_test.extend(all_coauthors_pairs(paper_authors, graph))
    except (ValueError, KeyError, TypeError):
        # Malformed record (invalid JSON, missing key, unexpected type): skip
        # it, but let anything else - e.g. KeyboardInterrupt - propagate.
        continue


# Five sampled negatives are requested per positive pair.
data_size = len(X_test)
X_test.extend(random_not_coauthors(graph_test, data_size*5, authors))

y_test = [[0.0, 1.0]] * data_size
y_test.extend([[1.0, 0.0]] * (len(X_test) - data_size))

# Later cells rebind X_test / y_test; these aliases keep the original pairs and labels.
X_test_raw = X_test
y_test_raw = y_test

In [0]:
# True for every test pair whose two members are both known training-period authors.
mask_valid = [{pair[0], pair[1]} <= authors for pair in X_test_raw]

In [0]:
# Keep only the rows selected by the boolean mask (pairs and their labels).
X_test_valid = [row for row in np.array(X_test_raw)[mask_valid]]
y_test_valid = [row for row in np.array(y_test_raw)[mask_valid]]

In [15]:
# Baseline: score each test pair with the Jaccard coefficient computed on the
# training graph, and call it a co-author pair when the score reaches 0.5.
preds_nx = [triple[2] for triple in nx.jaccard_coefficient(graph, list(X_test_raw))]

# One-hot labels -> class index (1 = co-authors, 0 = not).
y_test = [np.argmax(onehot) for onehot in np.array(y_test_raw)]

y_pred = [int(score >= 0.5) for score in preds_nx]
print(classification_report(y_test, y_pred))
print('\n')
print(confusion_matrix(y_test, y_pred))
print(roc_auc_score(y_test, preds_nx))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91     69435
           1       0.92      0.00      0.01     13887

    accuracy                           0.83     83322
   macro avg       0.88      0.50      0.46     83322
weighted avg       0.85      0.83      0.76     83322



[[69430     5]
 [13832    55]]
0.5989623392410661


### node2vec 
let's sample our graph by generating random walks from each node of the graph

we'll use [node2vec](https://github.com/eliorc/node2vec) library

In [26]:
# Set up node2vec on the training graph: 20-dimensional embeddings,
# 10 walks of length 16, 10 workers.
node2vec = Node2Vec(
    graph,
    dimensions=20,
    walk_length=16,
    num_walks=10,
    workers=10,
)

Computing transition probabilities:   0%|          | 31/24475 [00:00<01:19, 308.37it/s]

Generate walks


Computing transition probabilities: 100%|██████████| 24475/24475 [00:08<00:00, 2749.85it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-26-6f1d20845d69>", line 2, in <module>
    node2vec = Node2Vec(graph, dimensions=20, walk_length=16, num_walks=10, workers=10)
  File "/usr/local/lib/python3.6/dist-packages/node2vec/node2vec.py", line 67, in __init__
    self.walks = self._generate_walks()
  File "/usr/local/lib/python3.6/dist-packages/node2vec/node2vec.py", line 154, in _generate_walks
    in enumerate(num_walks_lists, 1))
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 1017, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.6/dist-packages/joblib/parallel.py", line 909, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py", line 562, in wrap_future_result
    return future.resu

KeyboardInterrupt: ignored

In [17]:

# Train the embedding model on the generated walks. Later cells read the
# vectors through `embs.wv`.
# NOTE(review): window/min_count are presumably forwarded to the underlying
# word2vec trainer - confirm against the node2vec library.
print('Learn embeddings')
embs = node2vec.fit(window=10, min_count=1)

Learn embeddings


### let's create our NN

In [18]:
# Feed-forward classifier over concatenated pair embeddings (2 x 20 = 40 inputs).
# The two softmax outputs match the one-hot labels: index 1 = co-authors.
model = Sequential([
    Dense(40),
    Dense(30, activation='tanh'),
    Dense(20, activation='relu'),
    # Dense(10, activation='exponential') was tried here; the layer is
    # redundant and the activation is probably a poor choice.
    Dense(2, activation='softmax'),
])
model.compile(loss="categorical_crossentropy", optimizer='rmsprop', metrics=["accuracy"])





In [0]:
# Hold out 20% of the labelled pairs for validation, replace every pair with
# its concatenated embedding vector, then train the network.

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)
y_train, y_val = np.array(y_train), np.array(y_val)
X_train = np.array([authors_to_emb(duo, embs) for duo in X_train])
X_val = np.array([authors_to_emb(duo, embs) for duo in X_val])
model.fit(X_train, y_train, batch_size=128, epochs=40, validation_data=(X_val, y_val))

### Let's test it

In [20]:
# 1 - co-authors, 0 - not co-authors
#               precision    recall  f1-score   support

#            0       0.99      0.99      0.99     15866
#            1       0.99      0.99      0.99     15712

#    micro avg       0.99      0.99      0.99     31578
#    macro avg       0.99      0.99      0.99     31578
# weighted avg       0.99      0.99      0.99     31578
# [[15652   214]
#  [  155 15557]]
# 0.9989353879890981


# Evaluate the network on the validation split.
# Run the forward pass once and reuse the probabilities for both the class
# predictions and the ROC AUC.
val_probs = model.predict(X_val)
y_pred = [np.argmax(y) for y in val_probs]
y_true = [np.argmax(y) for y in y_val]
print(classification_report(y_true, y_pred))
print('\n')
print(confusion_matrix(y_true, y_pred))
# Column 1 is the probability of the "co-authors" class.
print(roc_auc_score(y_true, [proba[1] for proba in val_probs]))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98     15866
           1       0.99      0.98      0.98     15712

    accuracy                           0.98     31578
   macro avg       0.98      0.98      0.98     31578
weighted avg       0.98      0.98      0.98     31578



[[15640   226]
 [  261 15451]]
0.9979556622122701


In [21]:
# nx.resource_allocation_index
#               precision    recall  f1-score   support
#            0       0.83      1.00      0.91     69435
#            1       0.76      0.00      0.00     13887

#    micro avg       0.83      0.83      0.83     83322
#    macro avg       0.80      0.50      0.46     83322
# weighted avg       0.82      0.83      0.76     83322
# [[69429     6]
#  [13868    19]]
# 0.5990594303456509

# nx.jaccard_coefficient 
#               precision    recall  f1-score   support

#            0       0.83      1.00      0.91     69435
#            1       1.00      0.00      0.01     13887

#    micro avg       0.83      0.83      0.83     83322
#    macro avg       0.92      0.50      0.46     83322
# weighted avg       0.86      0.83      0.76     83322
# [[69435     0]
#  [13832    55]]
# 0.5990585638635837


# 1*1 Logreg
#               precision    recall  f1-score   support

#            0       0.86      0.99      0.92     69353
#            1       0.83      0.19      0.31     13887

#    micro avg       0.86      0.86      0.86     83240
#    macro avg       0.84      0.59      0.62     83240
# weighted avg       0.85      0.86      0.82     83240
# [[68812   541]
#  [11241  2646]]
# 0.6395078283412827

# 5*1 All_pairs
#               precision    recall  f1-score   support

#            0       0.92      0.98      0.95    135084
#            1       0.84      0.59      0.69     27216

#    micro avg       0.91      0.91      0.91    162300
#    macro avg       0.88      0.78      0.82    162300
# weighted avg       0.91      0.91      0.91    162300
# [[132044   3040]
#  [ 11130  16086]]

# 5*1 New pairs
#               precision    recall  f1-score   support

#            0       0.87      0.98      0.92     69367
#            1       0.68      0.24      0.35     13887

#    micro avg       0.85      0.85      0.85     83254
#    macro avg       0.77      0.61      0.64     83254
# weighted avg       0.83      0.85      0.82     83254
# [[67779  1588]
#  [10569  3318]]
# 0.6553634735001194

# Random
#              precision    recall  f1-score   support

#            0       1.00      0.79      0.88     83254
#            1       0.00      0.00      0.00         0

#    micro avg       0.79      0.79      0.79     83254
#    macro avg       0.50      0.39      0.44     83254
# weighted avg       1.00      0.79      0.88     83254
# [[65546 17708]
#  [    0     0]]

# 1*1 New pairs
#               precision    recall  f1-score   support

#            0       0.87      0.95      0.91     69353
#            1       0.55      0.29      0.38     13887

#    micro avg       0.84      0.84      0.84     83240
#    macro avg       0.71      0.62      0.65     83240
# weighted avg       0.82      0.84      0.82     83240
# [[66055  3298]
#  [ 9823  4064]]
# 0.6657878721401573

# 1*1 New pairs Stack more layers 40e
#               precision    recall  f1-score   support

#            0       0.87      0.96      0.91     69341
#            1       0.59      0.30      0.39     13887

#    micro avg       0.85      0.85      0.85     83228
#    macro avg       0.73      0.63      0.65     83228
# weighted avg       0.82      0.85      0.83     83228
# [[66511  2830]
#  [ 9789  4098]]
# 0.7027983388267736

# 1*1 New pairs Stack MORE layers 40e
#               precision    recall  f1-score   support

#            0       0.87      0.97      0.92     69341
#            1       0.64      0.27      0.38     13887

#    micro avg       0.85      0.85      0.85     83228
#    macro avg       0.75      0.62      0.65     83228
# weighted avg       0.83      0.85      0.83     83228
# [[67246  2095]
#  [10151  3736]]
# 0.6888330186522706

# Evaluate the network on the held-out test pairs.
# Embed every pair once; pairs without a full 40-value vector
# (authors_to_emb returned None) are masked out of both X and y.
test_embs = [authors_to_emb(pair, embs) for pair in X_test_raw]
mask_valid = [np.array(emb).shape == (40,) for emb in test_embs]
X_test = np.array([emb for emb, ok in zip(test_embs, mask_valid) if ok])
y_test = np.array(y_test_raw)[mask_valid]
# Run the forward pass once and reuse it for predictions and ROC AUC.
test_probs = model.predict(X_test)
y_pred = [np.argmax(y) for y in test_probs]
y_true = [np.argmax(y) for y in y_test]
print(classification_report(y_true, y_pred))
print('\n')
print(confusion_matrix(y_true, y_pred))
# Column 1 is the probability of the "co-authors" class.
print(roc_auc_score(y_true, [proba[1] for proba in test_probs]))



              precision    recall  f1-score   support

           0       0.87      0.97      0.92     69435
           1       0.62      0.28      0.38     13887

    accuracy                           0.85     83322
   macro avg       0.74      0.62      0.65     83322
weighted avg       0.83      0.85      0.83     83322



[[67057  2378]
 [10011  3876]]
0.7030813657929027


In [0]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression

# Same 80/20 split as the network (same random_state), but each pair is
# reduced to a single feature: the cosine similarity of its two embeddings.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=42
)
X_train = np.array([
    cosine_similarity(embs.wv[pair[0]].reshape(1, -1),
                      embs.wv[pair[1]].reshape(1, -1))[0]
    for pair in X_train
])
# One-hot labels -> class index (1 = co-authors, 0 = not).
y_train = [np.argmax(onehot) for onehot in y_train]

In [23]:
# Logistic regression on the single cosine-similarity feature, default settings.
log_reg = LogisticRegression()
# Left as the cell's last expression so the fitted estimator is displayed.
log_reg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Validation: cosine-similarity feature per pair, then class-index labels.
# NOTE(review): X_val / y_val are overwritten in place, so this cell cannot
# be re-run without first re-running the split above.
X_val = np.array([
    cosine_similarity(embs.wv[pair[0]].reshape(1, -1),
                      embs.wv[pair[1]].reshape(1, -1))[0]
    for pair in X_val
])
y_val = [np.argmax(onehot) for onehot in y_val]

y_pred = log_reg.predict(X_val)
print(classification_report(y_val, y_pred))
print('\n')
print(confusion_matrix(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     15866
           1       0.99      0.99      0.99     15712

    accuracy                           0.99     31578
   macro avg       0.99      0.99      0.99     31578
weighted avg       0.99      0.99      0.99     31578



[[15770    96]
 [   97 15615]]


In [25]:
# Test: cosine-similarity feature for every pair kept by `mask_valid`,
# with labels filtered by the same mask.
X_test = np.array([
    cosine_similarity(embs.wv[pair[0]].reshape(1, -1),
                      embs.wv[pair[1]].reshape(1, -1))[0]
    for pair in np.array(X_test_raw)[mask_valid]
])
y_test = [np.argmax(onehot) for onehot in np.array(y_test_raw)[mask_valid]]

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))
print('\n')
print(confusion_matrix(y_test, y_pred))
# Column 1 of predict_proba is the probability of the "co-authors" class.
print(roc_auc_score(y_test, [row[1] for row in log_reg.predict_proba(X_test)]))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92     69435
           1       0.82      0.19      0.30     13887

    accuracy                           0.86     83322
   macro avg       0.84      0.59      0.61     83322
weighted avg       0.85      0.86      0.82     83322



[[68888   547]
 [11316  2571]]
0.648030013611339
