## Import Packages

In [1]:
import networkx as nx
import pandas as pd
import numpy as np
from random import randint

from sklearn.model_selection import KFold

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostClassifier


from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt

import csv

from scipy import spatial
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

## Functions

In [24]:
def similarity(text_a, text_b):
    model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
    return 1 - spatial.distance.cosine(get_vector(model, text_a), get_vector(model, text_b))

def preprocess(s):
    return [i.lower() for i in s]

def get_vector(model, s):
    print(model.wv.key_to_index())
    return np.sum(np.array([model.wv[i] for i in preprocess(s)]), axis=0)

## Read Data

In [2]:
G = nx.read_edgelist('data/edgelist.txt', delimiter=',', create_using=nx.Graph(), nodetype=int) # read graph
nodes = list(G.nodes())
num_of_nodes = G.number_of_nodes()
num_of_edges = G.number_of_edges()

adjacency_matrix = nx.adjacency_matrix(G)


abstracts_list = [] #Init List Abstracts
with open('data/abstracts.txt', 'r', encoding = "UTF-8") as f:
    for line in f:
        abstracts_list.append(set(line.split('|--|')[1].replace("\n", "").split()))  
        
authors_list = [] #Init List Authors
with open('data/authors.txt', 'r', encoding = "UTF-8") as f:
    for line in f:
        authors_list.append(set(line.split('|--|')[1].replace("\n", "").split(","))) 

df = pd.DataFrame(data = {'abstracts': abstracts_list, 'authors': authors_list})

  adjacency_matrix = nx.adjacency_matrix(G)


## Prepare Data
#### Baseline Data, Common Authors

In [3]:
abstracts = df.to_dict()['abstracts']
authors = df.to_dict()['authors']

# Features:
# (1) sum of number of unique terms of the two nodes' abstracts
# (2) absolute value of difference of number of unique terms of the two nodes' abstracts
# (3) number of common terms between the abstracts of the two nodes
# (4) number of common authors between the authorlists of the two nodes

x = np.zeros((2*num_of_edges, 4))
y = np.zeros(2*num_of_edges)

for i, edge in enumerate(G.edges()):
    # an edge
    x[i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    x[i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    x[i,2] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    x[i,3] = len(authors[edge[0]].intersection(authors[edge[1]]))

    y[i] = 1

    # a randomly generated pair of nodes
    n1 = randint(0, num_of_nodes-1)
    n2 = randint(0, num_of_nodes-1)
    x[num_of_edges+i,0] = len(abstracts[n1]) + len(abstracts[n2])
    x[num_of_edges+i,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    x[num_of_edges+i,2] = len(abstracts[n1].intersection(abstracts[n2]))
    x[num_of_edges+i,3] = len(authors[n1].intersection(authors[n2]))
    
    y[num_of_edges+i] = 0

#### Baseline Data, Common Authors, Pagerank

In [11]:
abstracts = df.to_dict()['abstracts']
authors = df.to_dict()['authors']
H = G.to_directed()

pagerank = nx.pagerank(H)

# Features:
# (1) sum of number of unique terms of the two nodes' abstracts
# (2) absolute value of difference of number of unique terms of the two nodes' abstracts
# (3) number of common terms between the abstracts of the two nodes
# (4) number of common authors between the authorlists of the two nodes
# (5) pagerank of first node
# (6) pagerank of second node

x = np.zeros((2*num_of_edges, 6))
y = np.zeros(2*num_of_edges)

for i, edge in enumerate(G.edges()):
    # an edge
    x[i,0] = len(abstracts[edge[0]]) + len(abstracts[edge[1]])
    x[i,1] = abs(len(abstracts[edge[0]]) - len(abstracts[edge[1]]))
    x[i,2] = len(abstracts[edge[0]].intersection(abstracts[edge[1]]))
    x[i,3] = len(authors[edge[0]].intersection(authors[edge[1]]))
    x[i,4] = pagerank[edge[0]]
    x[i,5] = pagerank[edge[1]]

    y[i] = 1

    # a randomly generated pair of nodes
    n1 = randint(0, num_of_nodes-1)
    n2 = randint(0, num_of_nodes-1)
    x[num_of_edges+i,0] = len(abstracts[n1]) + len(abstracts[n2])
    x[num_of_edges+i,1] = abs(len(abstracts[n1]) - len(abstracts[n2]))
    x[num_of_edges+i,2] = len(abstracts[n1].intersection(abstracts[n2]))
    x[num_of_edges+i,3] = len(authors[n1].intersection(authors[n2]))
    x[num_of_edges+i,4] = pagerank[n1]
    x[num_of_edges+i,5] = pagerank[n2]
    
    y[num_of_edges+i] = 0

## Cross Validate Attempt

In [None]:
indexes = []
train_accuracy = []
cv_accuracy = []
test_accuracy = []

x_train, x_test, y_train, y_test = train_test_split(x, y)

train_data = pd.DataFrame(data = {'sum_of_unique_terms': x_train[:,0], 'diff_of_unique_terms': x_train[:,1], 'common_terms': x_train[:,2], 'common_authors': x_train[:,3], 'target': y_train})

kf = KFold(n_splits=2, shuffle=False)
model = LogisticRegression(solver='liblinear',random_state=34)

feature_df = train_data.drop('target' ,axis= 1)
target_df = train_data[['target']]

fold_index = 1
for train_index, test_index in kf.split(train_data):
    X_train_cv = feature_df.iloc[train_index]
    X_test_cv = feature_df.iloc[test_index]
    Y_train_cv = target_df.iloc[train_index]
    Y_test_cv = target_df.loc[test_index]
        
    #Train the model
    model.fit(X_train_cv, Y_train_cv.values.ravel()) #Training the model


    train_pred = model.predict_proba(X_train_cv)
    train_pred = train_pred[:,1]

    train_pred_fixed = []
    for item in train_pred: 
        if item >= 0.5: 
            train_pred_fixed.append(1)
        else:
            train_pred_fixed.append(0)

    cv_pred = model.predict_proba(X_test_cv)
    cv_pred = cv_pred[:,1]

    cv_pred_fixed = []
    for item in cv_pred: 
        if item >= 0.5: 
            cv_pred_fixed.append(1)
        else:
            cv_pred_fixed.append(0)

    test_pred = model.predict_proba(x_test)
    test_pred = test_pred[:,1]

    test_pred_fixed = []
    for item in test_pred: 
        if item >= 0.5: 
            test_pred_fixed.append(1)
        else:
            test_pred_fixed.append(0)

    indexes.append(fold_index)
    train_accuracy.append(accuracy_score(Y_train_cv, train_pred_fixed))
    cv_accuracy.append(accuracy_score(Y_test_cv, cv_pred_fixed))
    test_accuracy.append(accuracy_score(y_test, test_pred_fixed))

    print("Current Index:", fold_index)
    fold_index += 1

plt.plot(indexes, train_accuracy, label="Train Data")
plt.plot(indexes, cv_accuracy, label="Validate Data")
plt.plot(indexes, test_accuracy, label="Test Data")

plt.legend(loc="upper left")
plt.grid()

plt.show()

## Predict and Write to CSV

#### Baseline Data, Common Authors 

In [None]:
# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))

# Create the test matrix. Use the same 4 features as above
X_test = np.zeros((len(node_pairs), 4))
for i,node_pair in enumerate(node_pairs):
    X_test[i,0] = len(abstracts[node_pair[0]]) + len(abstracts[node_pair[1]])
    X_test[i,1] = abs(len(abstracts[node_pair[0]]) - len(abstracts[node_pair[1]]))
    X_test[i,2] = len(abstracts[node_pair[0]].intersection(abstracts[node_pair[1]]))
    X_test[i,3] = len(authors[node_pair[0]].intersection(authors[node_pair[1]]))

print('Size of test matrix:', X_test.shape)

X_train, y_train = shuffle(x, y)


# START OF MODEL

# END OF MODEL

print("Number of Predictions: ", len(y_pred))

# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("submission_common_authors_abc.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row)

#### Baseline Data, Common Authors, Pagerank

In [21]:
# Read test data. Each sample is a pair of nodes
node_pairs = list()
with open('data/test.txt', 'r') as f:
    for line in f:
        t = line.split(',')
        node_pairs.append((int(t[0]), int(t[1])))

# Create the test matrix. Use the same 6 features as above
X_test = np.zeros((len(node_pairs), 6))
for i,node_pair in enumerate(node_pairs):
    X_test[i,0] = len(abstracts[node_pair[0]]) + len(abstracts[node_pair[1]])
    X_test[i,1] = abs(len(abstracts[node_pair[0]]) - len(abstracts[node_pair[1]]))
    X_test[i,2] = len(abstracts[node_pair[0]].intersection(abstracts[node_pair[1]]))
    X_test[i,3] = len(authors[node_pair[0]].intersection(authors[node_pair[1]]))
    X_test[i,4] = pagerank[node_pair[0]]
    X_test[i,5] = pagerank[node_pair[1]]

print('Size of test matrix:', X_test.shape)

X_train, y_train = shuffle(x, y)


# START OF MODEL
clf = LinearRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# END OF MODEL

print("Number of Predictions: ", len(y_pred))

# Write predictions to a file
predictions = zip(range(len(y_pred)), y_pred)
with open("linear_baseline_common_authors_pagerank.csv","w") as pred:
    csv_out = csv.writer(pred)
    csv_out.writerow(['id','predicted'])
    for row in predictions:
        csv_out.writerow(row)

print("Predictions written to file")

Size of test matrix: (106692, 6)
Number of Predictions:  106692
Predictions written to file


## Logistic Regression

In [23]:
x, y = shuffle(x, y)
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Use logistic regression to predict if two nodes are linked by an edge
clf = LogisticRegression(solver='liblinear',random_state=34)
clf.fit(x_train, y_train)
y_pred = clf.predict_proba(x_test)
y_pred = y_pred[:,1]

y_predictions = []
for i in y_pred: 
    if i >= 0.5: 
        y_predictions.append(1)
    else:
        y_predictions.append(0)

print("Accuracy: " + str(accuracy_score(y_predictions, y_test)) + "%")

Accuracy: 0.6991362289323013%


##### Baseline Data, Common Authors: 0.7265512529808893%
##### Baseline Data, Common Authors, Pagerank: 0.727549461699922%

## Linear Regression

In [24]:
x, y = shuffle(x, y)
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Use logistic regression to predict if two nodes are linked by an edge
clf = LinearRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

y_predictions = []
for i in y_pred: 
    if i >= 0.5: 
        y_predictions.append(1)
    else:
        y_predictions.append(0)

print("Accuracy: " + str(accuracy_score(y_predictions, y_test)) + "%")

Accuracy: 0.7559260629549176%


##### Baseline Data, Common Authors: 0.7235657847019477%
##### Baseline Data, Common Authors, Pagerank: 0.776417731117371%

## AdaBoost

In [17]:
x, y = shuffle(x, y)
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Use logistic regression to predict if two nodes are linked by an edge
abc = AdaBoostClassifier(n_estimators=100, learning_rate=1)
model = abc.fit(x_train, y_train)
y_pred = model.predict(x_test)

y_predictions = []
for i in y_pred: 
    if i >= 0.5: 
        y_predictions.append(1)
    else:
        y_predictions.append(0)

print("Accuracy: " + str(accuracy_score(y_predictions, y_test)) + "%")

Accuracy: 0.7850297997355205%


##### Baseline Data, Common Authors: 0.7058306378645294%
##### Baseline Data, Common Authors, Pagerank: 0.7721501598965526% 0.7850297997355205%

## Voting Regressor

## Word2Vec test

In [3]:
abstracts_list_2 = []
with open('data/abstracts.txt', 'r', encoding = "UTF-8") as f:
    for line in f:
        abstracts_list_2.append(line.split('|--|')[1].replace("\n", "").split())

In [40]:
# test_abstracts = [[word.lower() for word in abstract] for abstract in abstracts[:5]]

tmp = abstracts_list_2[:4]

fixed = []

for abstract in tmp:
    fixed.append([word.lower() for word in abstract])

model = Word2Vec(fixed, min_count=1)

def similarity(text_a, text_b):
    return 1 - spatial.distance.cosine(get_vector(model, text_a), get_vector(model, text_b))

def preprocess(s):
    return [i.lower() for i in s]

def get_vector(model, s):
    return np.sum(np.array([model.wv[i] for i in preprocess(s)]), axis=0)

print("Paper 0 - Paper 1", similarity(fixed[0], fixed[1]))
# print("Paper 0 - Paper 2", similarity(tmp[0], tmp[2]))
print("Paper 1 - Paper 2", similarity(tmp[1], tmp[2]))

Paper 0 - Paper 1 0.8254383206367493
Paper 1 - Paper 2 0.7354351282119751
