In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from random import randint

from sklearn.model_selection import KFold

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostClassifier


from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import csv

from scipy import spatial
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int) # read graph
nodes = list(G.nodes())
num_of_nodes = G.number_of_nodes()
num_of_edges = G.number_of_edges()

adjacency_matrix = nx.adjacency_matrix(G)


abstracts_list = [] #Init List Abstracts
with open('data/abstracts.txt', 'r', encoding = "UTF-8") as f:
    for line in f:
        abstracts_list.append(line.split('|--|')[1].replace("\n", ""))  
        
authors_list = [] #Init List Authors
with open('data/authors.txt', 'r', encoding = "UTF-8") as f:
    for line in f:
        authors_list.append(set(line.split('|--|')[1].replace("\n", "").split(","))) 

df = pd.DataFrame(data = {'abstracts': abstracts_list, 'authors': authors_list})

In [56]:
import nltk
import contractions
import inflect
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
import re, string, unicodedata
from collections import Counter

corpus = df.loc[:,"abstracts"]


def remove_round_brackets(data):
  return re.sub('\(.*?\)','',data)

def remove_punc(data):
  trans = str.maketrans('','', string.punctuation)
  return data.translate(trans)

def white_space(data):
  return ' '.join(data.split())

def complete_noise(data):
  new_data = remove_round_brackets(data)
  new_data = remove_punc(new_data)
  new_data = white_space(new_data)
  return new_data

def text_lower(data):
  return data.lower()

def contraction_replace(data):
  return contractions.fix(data)

def number_to_text(data):
  temp_str = data.split()
  string = []
  for i in temp_str:
    if i.isdigit():
      temp = inflect.engine().number_to_words(i)
      string.append(temp)
    else:
      string.append(i)
  return temp_str

def normalization(data):
  text = text_lower(data)
  text = number_to_text(text)

  text = " ".join(text)
  
  text = contraction_replace(text)
  tokens = nltk.word_tokenize(text)
  return tokens

abstracts_fixed = []
for i, txt in enumerate(corpus):
  print("Current Text:", i, "from", len(corpus))
  denoised_txt = complete_noise(txt)
  tokens = normalization(denoised_txt)

  tags = nltk.pos_tag(tokens, tagset='universal')

  unique_tags = list(Counter(tags))

  all_tags = []
  for tag in unique_tags:
    if(tag[1]=="DET" or tag[1]=="ADP" or tag[1]=="PRT"):
      all_tags.append(tag[0])

  abstracts_fixed.append([item for item in tokens if item not in all_tags])

Current Text: 0 from 138499
Current Text: 1 from 138499
Current Text: 2 from 138499
Current Text: 3 from 138499
Current Text: 4 from 138499
Current Text: 5 from 138499
Current Text: 6 from 138499
Current Text: 7 from 138499
Current Text: 8 from 138499
Current Text: 9 from 138499
Current Text: 10 from 138499
Current Text: 11 from 138499
Current Text: 12 from 138499
Current Text: 13 from 138499
Current Text: 14 from 138499
Current Text: 15 from 138499
Current Text: 16 from 138499
Current Text: 17 from 138499
Current Text: 18 from 138499
Current Text: 19 from 138499
Current Text: 20 from 138499
Current Text: 21 from 138499
Current Text: 22 from 138499
Current Text: 23 from 138499
Current Text: 24 from 138499
Current Text: 25 from 138499
Current Text: 26 from 138499
Current Text: 27 from 138499
Current Text: 28 from 138499
Current Text: 29 from 138499
Current Text: 30 from 138499
Current Text: 31 from 138499
Current Text: 32 from 138499
Current Text: 33 from 138499
Current Text: 34 from 13

In [66]:
w2v = Word2Vec(abstracts_fixed, min_count=1)

def similarity(id1, id2):
    text1 = abstracts_fixed[id1]
    text2 = abstracts_fixed[id2]

    vector1 = []
    vector2 = []

    for word in text1:
        vector1.append(w2v.wv[word])

    mean1 = np.array(vector1).mean(axis=0)

    for word in text2:
        vector2.append(w2v.wv[word])

    mean2 = np.array(vector2).mean(axis=0)

    return 1 - spatial. distance. cosine(mean1, mean2)

In [71]:
authors = df.loc[:,"authors"]
H = G.to_directed()

pagerank = nx.pagerank(H)

# Features:
# (1) sum of number of unique terms of the two nodes' abstracts
# (2) absolute value of difference of number of unique terms of the two nodes' abstracts
# (3) number of common terms between the abstracts of the two nodes
# (4) number of common authors between the authorlists of the two nodes
# (5) pagerank of first node
# (6) pagerank of second node

x = np.zeros((2*num_of_edges, 7))
y = np.zeros(2*num_of_edges)

for i, edge in enumerate(G.edges()):
    # an edge
    x[i,0] = len(abstracts_fixed[edge[0]]) + len(abstracts_fixed[edge[1]])
    x[i,1] = abs(len(abstracts_fixed[edge[0]]) - len(abstracts_fixed[edge[1]]))
    x[i,2] = len(set(abstracts_fixed[edge[0]]).intersection(abstracts_fixed[edge[1]]))
    x[i,3] = len(authors[edge[0]].intersection(authors[edge[1]]))
    x[i,4] = pagerank[edge[0]]
    x[i,5] = pagerank[edge[1]]
    x[i,6] = similarity(edge[0], edge[1])

    y[i] = 1

    # a randomly generated pair of nodes
    n1 = randint(0, num_of_nodes-1)
    n2 = randint(0, num_of_nodes-1)
    x[num_of_edges+i,0] = len(abstracts_fixed[n1]) + len(abstracts_fixed[n2])
    x[num_of_edges+i,1] = abs(len(abstracts_fixed[n1]) - len(abstracts_fixed[n2]))
    x[num_of_edges+i,2] = len(set(abstracts_fixed[n1]).intersection(abstracts_fixed[n2]))
    x[num_of_edges+i,3] = len(authors[n1].intersection(authors[n2]))
    x[num_of_edges+i,4] = pagerank[n1]
    x[num_of_edges+i,5] = pagerank[n2]
    x[num_of_edges+i,6] = similarity(n1, n2)
    
    y[num_of_edges+i] = 0

  mean1 = np.array(vector1).mean(axis=0)
  ret = ret.dtype.type(ret / rcount)
  mean2 = np.array(vector2).mean(axis=0)


In [74]:
x, y = shuffle(x, y)
x_train, x_test, y_train, y_test = train_test_split(x, y)

clf = LogisticRegression(solver='liblinear',random_state=34)
clf.fit(x_train, y_train)
y_pred = clf.predict_proba(x_test)
y_pred = y_pred[:,1]


y_predictions = []
for i in y_pred:
    if i >= 0.5:
        y_predictions.append(1)
    else:
        y_predictions.append(0)

print("Accuracy: " + str(accuracy_score(y_predictions, y_test)) + "%")

Accuracy: 0.7986365751000956%


In [73]:
# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))

# Create the test matrix. Use the same 6 features as above
X_test = np.zeros((len(node_pairs), 7))
for i,node_pair in enumerate(node_pairs):
    X_test[i,0] = len(abstracts_fixed[node_pair[0]]) + len(abstracts_fixed[node_pair[1]])
    X_test[i,1] = abs(len(abstracts_fixed[node_pair[0]]) - len(abstracts_fixed[node_pair[1]]))
    X_test[i,2] = len(set(abstracts_fixed[node_pair[0]]).intersection(abstracts_fixed[node_pair[1]]))
    X_test[i,3] = len(authors[node_pair[0]].intersection(authors[node_pair[1]]))
    X_test[i,4] = pagerank[node_pair[0]]
    X_test[i,5] = pagerank[node_pair[1]]
    X_test[i,6] = similarity(node_pair[0], node_pair[1])

print('Size of test matrix:', X_test.shape)

X_train, y_train = shuffle(x, y)

# START OF MODEL
clf = LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# END OF MODEL

print("Number of Predictions: ", len(y_pred))

# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("submission_better.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row)

print("Predictions written to file")

  mean1 = np.array(vector1).mean(axis=0)
  ret = ret.dtype.type(ret / rcount)
  mean2 = np.array(vector2).mean(axis=0)


Size of test matrix: (106692, 7)
Number of Predictions:  106692
Predictions written to file
