In [1]:
import networkx as nx
import pandas as pd

from gensim.models import Word2Vec
import numpy as np
from scipy import spatial

from random import randint

import csv

In [2]:
# Load the Directed Graph from the comma-separated Edgelist (node ids are parsed as int)
G = nx.read_edgelist(
    'data/edgelist.txt',
    delimiter=',',
    create_using=nx.DiGraph(),
    nodetype=int,
)
nodes = list(G.nodes())
num_of_nodes = G.number_of_nodes()
num_of_edges = G.number_of_edges()

# Abstracts: records are separated by "#", the words inside a record by ","
with open('data/abstracts_preprocessed.txt', 'r', encoding="UTF-8") as f:
    abstracts_list = [record.split(",") for record in f.read().split("#")]

# Author Lists: one record per line; the comma-separated names follow the first '|--|' marker
with open('data/authors.txt', 'r', encoding="UTF-8") as f:
    authors_list = [
        set(record.split('|--|')[1].replace("\n", "").split(","))
        for record in f
    ]

# One row per record, addressed by its position in the two files;
# both lists must have the same length or the constructor raises
df = pd.DataFrame({'abstracts': abstracts_list, 'authors': authors_list})

In [3]:
# Build Word2Vec Model on the tokenised abstracts; min_count=1 keeps every word in the vocabulary
w2v = Word2Vec(sentences=df["abstracts"], min_count=1)

def similarity(id1, id2): # Calculate Semantic Similarity of two Abstracts
    """Return the cosine similarity between the mean word vectors of two abstracts.

    Each abstract is reduced to its set of distinct words, every word is looked
    up in the module-level ``w2v`` model, and the word vectors are averaged.
    The result is ``1 - cosine_distance`` of the two averages.

    Parameters
    ----------
    id1, id2 : row labels of ``df`` identifying the two papers.

    Returns
    -------
    float
        Cosine similarity of the two mean vectors.
    """
    # Distinct words of each abstract (duplicates do not weigh into the mean)
    words_a = set(df.loc[id1, "abstracts"])
    words_b = set(df.loc[id2, "abstracts"])

    # Mean word vector of paper id1
    centroid_a = np.array([w2v.wv[token] for token in words_a]).mean(axis=0)

    # Mean word vector of paper id2
    centroid_b = np.array([w2v.wv[token] for token in words_b]).mean(axis=0)

    # scipy returns a distance, so subtract from 1 to get a similarity
    return 1 - spatial.distance.cosine(centroid_a, centroid_b)

In [4]:
def jaccard_similarity(A, B): # Calculate Jaccard Index of two Sets
    """Return the Jaccard index ``|A ∩ B| / |A ∪ B|`` of two sets.

    Parameters
    ----------
    A : set
    B : set (any iterable is accepted, as with ``set.union``)

    Returns
    -------
    float
        Ratio of the intersection size to the union size. When both inputs
        are empty the ratio is undefined; 0.0 is returned instead of raising
        ``ZeroDivisionError``.
    """
    union_size = len(A.union(B)) # Size of the union of the two sets

    if union_size == 0: # Both sets are empty: nothing is shared
        return 0.0

    return len(A.intersection(B)) / union_size # Return ratio of sizes

In [7]:
pagerank = nx.pagerank(G) # PageRank

hits_vals = nx.hits(G) # HITS Algorithm

hubness = hits_vals[0] # Hubness
authority = hits_vals[1] # Authority

cores = nx.core_number(G) # Core Number

und_G = G.to_undirected() # Convert Graph to Undirected

# Features:
# (0) Semantic Similarity of Abstracts
# (1) Jaccard Index of Abstracts
# (2) Jaccard Index of Author Lists
# (3) PageRank of First Node
# (4) PageRank of Second Node
# (5) Hubness of First Node
# (6) Authority of Second Node
# (7) Out-Degree of First Node
# (8) In-Degree of Second Node
# (9) Max k-Core of First Node
# (10) Max k-Core of Second Node
# (11) Number of Common Neighbors

# (12) Class

FEATURE_NAMES = ['Semantic Similarity of Abstracts','Jaccard Index of Abstracts', 'Jaccard Index of Author Lists', 'PageRank of First Node', 'PageRank of Second Node', 'Hubness of First Node', 'Authority of Second Node', 'Out-Degree of First Node', 'In-Degree of Second Node', 'Max k-Core of First Node', 'Max k-Core of Second Node', 'Number of Common Neighbors']

NEGATIVE_SAMPLING_SEED = 42 # Fixed seed so the same negative pairs are drawn on every run
MAX_SAMPLING_ATTEMPTS = 1000 # Upper bound on redraws for a single negative pair


def _pair_features(src, dst):
    """Return the 12 feature values (0)-(11) listed above for the ordered pair (src, dst).

    Relies on ``similarity``, ``jaccard_similarity`` and ``df`` from the earlier
    cells and on the graph statistics computed at the top of this cell.
    """
    return [
        similarity(src, dst),
        jaccard_similarity(set(df.loc[src, "abstracts"]), set(df.loc[dst, "abstracts"])),
        jaccard_similarity(set(df.loc[src, "authors"]), set(df.loc[dst, "authors"])),
        pagerank[src],
        pagerank[dst],
        hubness[src],
        authority[dst],
        G.out_degree[src],
        G.in_degree[dst],
        cores[src],
        cores[dst],
        len(list(nx.common_neighbors(und_G, src, dst))),
    ]


def _sample_non_edge(rng):
    """Draw an ordered pair of distinct graph nodes that is not an edge of G.

    Nodes are drawn from ``nodes`` (the ids actually present in G) rather than
    from ``range(num_of_nodes)``, so the ids do not have to be contiguous.
    Pairs that are real edges or self-pairs are rejected, otherwise they would
    be written to the training set with the wrong class label.
    """
    for _ in range(MAX_SAMPLING_ATTEMPTS):
        src = nodes[int(rng.integers(len(nodes)))]
        dst = nodes[int(rng.integers(len(nodes)))]
        if src != dst and not G.has_edge(src, dst):
            return src, dst
    raise RuntimeError(
        "could not sample a non-edge after %d attempts" % MAX_SAMPLING_ATTEMPTS
    )


rng = np.random.default_rng(NEGATIVE_SAMPLING_SEED)

# First num_of_edges rows: real edges (class 1); last num_of_edges rows: sampled non-edges (class 0)
x_train = np.zeros((2*num_of_edges, 13))

for i, (src, dst) in enumerate(G.edges()):
    # an edge
    x_train[i, :12] = _pair_features(src, dst)
    x_train[i, 12] = 1

    # a randomly generated pair of nodes that is not an edge
    neg_src, neg_dst = _sample_non_edge(rng)
    x_train[num_of_edges+i, :12] = _pair_features(neg_src, neg_dst)
    x_train[num_of_edges+i, 12] = 0

# newline="" is what the csv module asks for; without it an extra "\r" is
# written on platforms that use "\r\n" line endings
with open("data/train.csv", "w", newline="") as f:
    csv_out = csv.writer(f)
    csv_out.writerow(FEATURE_NAMES + ['Class'])
    csv_out.writerows(x_train)

In [11]:
node_pairs = []
with open('data/test.txt', 'r') as f: # Read Test.txt
    for line in f:
        if not line.strip(): # Skip blank lines (e.g. a trailing empty line) instead of crashing in int()
            continue
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))

x_test = np.zeros((len(node_pairs), 12))

# Create a Test matrix using the same 12 Features as the training matrix.
# pagerank, hubness, authority, cores and und_G come from the training cell.
for i, (src, dst) in enumerate(node_pairs):
    x_test[i,0] = similarity(src, dst)

    x_test[i,1] = jaccard_similarity(set(df.loc[src,"abstracts"]), set(df.loc[dst,"abstracts"]))
    x_test[i,2] = jaccard_similarity(set(df.loc[src,"authors"]), set(df.loc[dst,"authors"]))

    x_test[i,3] = pagerank[src]
    x_test[i,4] = pagerank[dst]

    x_test[i,5] = hubness[src]
    x_test[i,6] = authority[dst]

    x_test[i,7] = G.out_degree[src]
    x_test[i,8] = G.in_degree[dst]

    x_test[i,9] = cores[src]
    x_test[i,10] = cores[dst]

    x_test[i,11] = len(list(nx.common_neighbors(und_G, src, dst)))

print('x_test shape:', x_test.shape)

# newline="" is what the csv module asks for; without it an extra "\r" is
# written on platforms that use "\r\n" line endings
with open("data/test.csv", "w", newline="") as f: # Writing to test.csv
    csv_out = csv.writer(f)
    csv_out.writerow(['Semantic Similarity of Abstracts','Jaccard Index of Abstracts', 'Jaccard Index of Author Lists', 'PageRank of First Node', 'PageRank of Second Node', 'Hubness of First Node', 'Authority of Second Node', 'Out-Degree of First Node', 'In-Degree of Second Node', 'Max k-Core of First Node', 'Max k-Core of Second Node', 'Number of Common Neighbors'])
    csv_out.writerows(x_test)

x_test shape: (106692, 12)
