In [41]:
import pandas as pd
import numpy as np
import random
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt

import seaborn as sns
from scipy.special import expit

# Pre-processing

In [2]:
# load the train data: every line holds the whitespace-separated author ids of one paper
from collections import Counter
from itertools import combinations

# the context manager closes the file handle as soon as the lines are read
with open('train.txt', 'r') as file:
    lines = file.readlines()


def count_coauthor_pairs(paper_lines):
    """Count, for every unordered author pair, the number of papers they share.

    Parameters
    ----------
    paper_lines : iterable of str
        One paper per line, author ids separated by whitespace.

    Returns
    -------
    collections.Counter
        Maps ``(smaller_id, larger_id)`` to the number of lines containing both ids.
        Blank lines and single-author lines contribute nothing.
    """
    pair_counts = Counter()
    for line in paper_lines:
        # dict.fromkeys drops repeated ids but keeps first-seen order, so an id
        # listed twice on a line cannot create a self-pair or count a paper twice
        coAuthors = list(dict.fromkeys(map(int, line.split())))
        pair_counts.update(tuple(sorted(pair)) for pair in combinations(coAuthors, 2))
    return pair_counts


# find unique author pairs and count the number of papers coauthored
aPairs = count_coauthor_pairs(lines)

# NCA: the number of papers the pair has coauthored
# Exist: whether the pair of authors has coauthored (always 1 in this frame)
edges = pd.DataFrame(
    [(source, sink, n_papers) for (source, sink), n_papers in aPairs.items()],
    columns=["Source", "Sink", "NCA"],
)
edges['Exist'] = 1
edges = edges[["Source", "Sink", "NCA", "Exist"]]

In [3]:
# df of existent edges: preview the first rows (columns Source, Sink, NCA, Exist)
edges.head()

Unnamed: 0,Source,Sink,NCA,Exist
0,0,356,14,1
1,0,1236,14,1
2,356,1236,14,1
3,0,1655,9,1
4,0,1797,4,1


In [4]:
# (number of distinct co-author pairs, number of columns)
edges.shape

(16036, 4)

In [5]:
# export edges to csv
# edges.to_csv("edges.csv")

In [6]:
# every author id that appears at either end of an existing edge, in ascending order
source_ids = edges['Source'].to_numpy()
sink_ids = edges['Sink'].to_numpy()
nodeL = sorted(np.union1d(source_ids, sink_ids))
len(nodeL)

3767

In [7]:
"""
The following code is adapted from:
https://www.analyticsvidhya.com/blog/2020/01/link-prediction-how-to-predict-your-future-connections-on-facebook/
"""

# create graph
G = nx.from_pandas_edgelist(edges, "Source", "Sink", create_using=nx.Graph())

# plot graph
# plt.figure(figsize=(10,10))

# pos = nx.random_layout(G, seed=23)
# nx.draw(G, with_labels=False,  pos = pos, node_size = 20, alpha = 0.3, width = 0.3)

# plt.show()

# build adjacency matrix
adj_G = nx.to_numpy_matrix(G, nodelist = nodeL)

# get unconnected node-pairs
all_unconnected_pairs = []

# traverse adjacency matrix
offset = 0
for i in tqdm(range(adj_G.shape[0])):
    for j in range(offset,adj_G.shape[1]):
        if i != j:
            if adj_G[i,j] == 0:
                all_unconnected_pairs.append([nodeL[i],nodeL[j]])
    offset = offset + 1

# df of non-existent edges (ne)
ne = pd.DataFrame([tuple(pair) for pair in all_unconnected_pairs], columns=['Source', 'Sink'])
ne['Exist'] = pd.DataFrame([0]*(ne.shape[0]))
ne['NCA'] = pd.DataFrame([0]*(ne.shape[0]))

ne.head()

100%|██████████| 3767/3767 [00:17<00:00, 220.45it/s] 


Unnamed: 0,Source,Sink,Exist,NCA
0,0,1,0,0
1,0,2,0,0
2,0,3,0,0
3,0,4,0,0
4,0,5,0,0


In [8]:
# concatenate non-existent and existent edges to form a df of all possible edges (ca)
# NOTE: ignore_index is not passed, so both frames keep their own row labels and
# labels repeat in ca; select rows positionally or by Pair rather than by label.
ca = pd.concat([edges, ne])
ca.head()

Unnamed: 0,Source,Sink,NCA,Exist
0,0,356,14,1
1,0,1236,14,1
2,356,1236,14,1
3,0,1655,9,1
4,0,1797,4,1


In [9]:
ca["Pair"] = list(zip(ca.Source, ca.Sink))
ca = ca[["Pair", "Source", "Sink", "NCA", "Exist"]]
ca.head()

Unnamed: 0,Pair,Source,Sink,NCA,Exist
0,"(0, 356)",0,356,14,1
1,"(0, 1236)",0,1236,14,1
2,"(356, 1236)",356,1236,14,1
3,"(0, 1655)",0,1655,9,1
4,"(0, 1797)",0,1797,4,1


In [10]:
# (existent + non-existent pairs, number of columns)
ca.shape

(7093263, 5)

In [11]:
# export ca to csv
# ca.to_csv("ca.csv")

# Feature Generation

In [12]:
# Common Neighbours: number of nodes adjacent to both ends of each candidate pair
cnL = []
for u, v in tqdm(ca.Pair):
    # only the count is needed, so consume the iterable directly instead of sorting it
    n_common = sum(1 for _ in nx.common_neighbors(G, u, v))
    cnL.append(((u, v), n_common))
cnDF = pd.DataFrame(cnL, columns=["Pair", "CN"])
train = ca.join(cnDF.set_index('Pair'), on="Pair")

100%|██████████| 7093263/7093263 [02:39<00:00, 44435.61it/s]


In [13]:
# Adamic-Adar Index for every candidate pair, joined onto train by Pair
aa_index = nx.adamic_adar_index(G, ca.Pair)
# each yielded item is (u, v, score); split it into the pair key and the score
aa_rows = [(triple[:2], triple[2]) for triple in aa_index]
aaDF = pd.DataFrame(aa_rows, columns=["Pair", "AA"])
aa_lookup = aaDF.set_index('Pair')
train = train.join(aa_lookup, on="Pair")

In [14]:
# Resource Allocation index for every candidate pair, joined onto train by Pair
ra_index = nx.resource_allocation_index(G, ca.Pair)
# each yielded item is (u, v, score); split it into the pair key and the score
ra_rows = [(triple[:2], triple[2]) for triple in ra_index]
raDF = pd.DataFrame(ra_rows, columns=["Pair", "RA"])
ra_lookup = raDF.set_index('Pair')
train = train.join(ra_lookup, on="Pair")

In [15]:
# Jaccard's Coefficient for every candidate pair, joined onto train by Pair
jc = nx.jaccard_coefficient(G, ca.Pair)
# each yielded item is (u, v, score); split it into the pair key and the score
jc_rows = [(triple[:2], triple[2]) for triple in jc]
jcDF = pd.DataFrame(jc_rows, columns=["Pair", "JC"])
jc_lookup = jcDF.set_index('Pair')
train = train.join(jc_lookup, on="Pair")

In [16]:
# Preferential Attachment score for every candidate pair, joined onto train by Pair
pa = nx.preferential_attachment(G, ca.Pair)
# each yielded item is (u, v, score); split it into the pair key and the score
pa_rows = [(triple[:2], triple[2]) for triple in pa]
paDF = pd.DataFrame(pa_rows, columns=["Pair", "PA"])
pa_lookup = paDF.set_index('Pair')
train = train.join(pa_lookup, on="Pair")

In [17]:
# Katz Index
# The calculation of the Katz index follows "Fast Computation of Katz Index for Efficient
# Processing of Link Prediction Queries": https://arxiv.org/pdf/1912.06525.pdf
from numpy.linalg import inv

# beta is set to 0.05 as it is a commonly accepted value in the research community according to
# Qi et al., 'Predicting co-author relationship in medical co-authorship networks'
beta = 0.05

# Closed form of the Katz series: K = (I - beta * A)^-1 - I
# NOTE(review): this equals the series sum only when beta < 1 / (largest eigenvalue of A);
# that condition is not checked here - TODO confirm for this graph.
I = np.identity(len(G.nodes))
K = inv(I - adj_G*beta) - I

# Collect K[i, j] for every i < j in row-major order, keyed by the pair (nodeL[i], nodeL[j]).
# np.asarray makes the fancy indexing work whether K is an ndarray or an np.matrix.
K_values = np.asarray(K)
node_ids = np.asarray(nodeL)
row_idx, col_idx = np.triu_indices(len(nodeL), k=1)
kiL = list(zip(zip(node_ids[row_idx], node_ids[col_idx]), K_values[row_idx, col_idx]))

kiDF = pd.DataFrame(kiL, columns=["Pair", "KI"])
train = train.join(kiDF.set_index('Pair'), on="Pair")

In [18]:
# Pagerank: attach the score of each endpoint (PR_s1 for Source, PR_s2 for Sink)
pr = nx.pagerank(G)
pr_source = pd.DataFrame.from_dict(pr, orient='index', columns=['PR_s1'])
pr_sink = pr_source.rename(columns={'PR_s1': 'PR_s2'})
train = train.join(pr_source, on="Source").join(pr_sink, on="Sink")
# prDF ends up holding the PR_s2-labelled frame, as before
prDF = pr_sink

In [19]:
# Column legend for the training frame:
#   NCA   - number of co-authored papers
#   CN    - Common Neighbours
#   AA    - Adamic-Adar Index
#   RA    - Resource Allocation
#   JC    - Jaccard's Coefficient
#   PA    - Preferential Attachment
#   KI    - Katz Index
#   PR_s1 - Pagerank of the source node
#   PR_s2 - Pagerank of the sink node

# discard every row that is missing at least one value
train = train.dropna(axis=0, how="any")
train.head()

Unnamed: 0,Pair,Source,Sink,NCA,Exist,CN,AA,RA,JC,PA,KI,PR_s1,PR_s2
0,"(0, 356)",0,356,14,1,7,2.899858,0.628968,0.7,72,0.079962,0.00022,0.00024
1,"(0, 1236)",0,1236,14,1,6,2.471649,0.531746,0.428571,96,0.075137,0.00022,0.000302
2,"(356, 1236)",356,1236,14,1,7,2.812086,0.587302,0.5,108,0.074232,0.00024,0.000302
3,"(0, 1655)",0,1655,9,1,7,2.976054,0.668651,0.466667,112,0.083302,0.00022,0.000376
4,"(0, 1797)",0,1797,4,1,7,2.899858,0.628968,0.7,72,0.081045,0.00022,0.000245


In [20]:
# (rows left after dropna, number of columns)
train.shape

(7093261, 13)

In [21]:
# export train to csv (index=False: the row labels are not written to the file)
train.to_csv("train.csv", index=False)

In [30]:
test = pd.read_csv("test-public.csv")
test["Pair"] = list(zip(test.Source, test.Sink))
test

Unnamed: 0,Id,Source,Sink,Pair
0,1,0,2917,"(0, 2917)"
1,2,0,2956,"(0, 2956)"
2,3,1,4038,"(1, 4038)"
3,4,2,1848,"(2, 1848)"
4,5,3,513,"(3, 513)"
...,...,...,...,...
1995,1996,3865,3924,"(3865, 3924)"
1996,1997,3917,4025,"(3917, 4025)"
1997,1998,3922,3947,"(3922, 3947)"
1998,1999,3955,3987,"(3955, 3987)"


In [31]:
# target: 1 for pairs that have co-authored, 0 for pairs that have not
y_train = train.Exist

# NCA is deliberately left out of the features: it is >= 1 for every existing pair and
# 0 for every non-existent pair, so it encodes the label exactly (target leakage), and
# it is not available for the Source/Sink pairs of the test file.
FEATURE_COLUMNS = ["CN", "AA", "RA", "JC", "PA", "KI", "PR_s1", "PR_s2"]
X_train = train[FEATURE_COLUMNS]

In [38]:
from scipy.special import expit # this is the logistic function
sigmoid = expit

def risk(w, X, y):
    """Total binary cross-entropy of a logistic model.

    Parameters
    ----------
    w : array-like, shape (n_features,)
        Weight vector.
    X : array-like, shape (n_samples, n_features)
        Design matrix.
    y : array-like, shape (n_samples,)
        Binary targets (0 or 1).

    Returns
    -------
    float
        sum_i [ log(1 + exp(z_i)) - y_i * z_i ] with z = X @ w, which is algebraically
        equal to -y @ log(sigmoid(z)) - (1 - y) @ log(1 - sigmoid(z)).
    """
    z = np.asarray(X @ w, dtype=float)
    targets = np.asarray(y, dtype=float)
    # logaddexp(0, z) evaluates log(1 + exp(z)) without overflow; taking log(sigmoid(z))
    # directly gives log(0) = -inf (and 0 * -inf = nan) once the sigmoid saturates
    return np.sum(np.logaddexp(0.0, z)) - targets @ z

In [43]:
from sklearn.linear_model import LogisticRegression

# Unregularised logistic regression. penalty=None is the spelling accepted since
# scikit-learn 1.2; the string 'none' was removed in 1.4.
clf = LogisticRegression(penalty=None)
# X and y are not defined in any visible cell; the prepared training data is X_train / y_train
clf.fit(X_train, y_train)
w_sklearn = np.r_[clf.intercept_, clf.coef_.squeeze()]
# w_history_gd (the gradient-descent weight history) is not produced by any visible cell,
# so the comparison line is printed only when it exists.
# TODO: add the gradient-descent cell back or delete this line.
if "w_history_gd" in globals():
    print("Weights according to GD: {}".format(w_history_gd[-1]))
print("Weights according to scikit-learn: {}".format(w_sklearn))

AttributeError: module 'scipy' has no attribute '__version__'