In [15]:
import networkx as nx
import numpy as np
import pandas as pd
from numpy.linalg import inv
from tqdm import tqdm

In [19]:
# Build the undirected graph from the edge list.
edges = pd.read_csv("edges.csv")
# Sorted, de-duplicated node ids; this fixes the row/column order of adj_G below.
nodeL = np.union1d(edges['Source'].values, edges['Sink'].values)
G = nx.from_pandas_edgelist(edges, "Source", "Sink", create_using=nx.Graph())
# nx.to_numpy_matrix was removed in networkx 3.0. It was a thin wrapper around
# to_numpy_array + np.asmatrix, so this keeps adj_G an np.matrix with the same values.
adj_G = np.asmatrix(nx.to_numpy_array(G, nodelist=nodeL))

# Candidate node pairs with their precomputed columns (Source, Sink, nca, exist).
ca = pd.read_csv("ca.csv")
ca.head()

Unnamed: 0,Source,Sink,nca,exist
0,0,356,14,1
1,0,1236,14,1
2,356,1236,14,1
3,0,1655,9,1
4,0,1797,4,1


In [25]:
# Key every row by its (Source, Sink) tuple and put that key in the first column.
pair_keys = [(src, dst) for src, dst in zip(ca["Source"], ca["Sink"])]
ca = ca.assign(Pair=pair_keys).loc[:, ["Pair", "Source", "Sink", "nca", "exist"]]
ca.head()

Unnamed: 0,Pair,Source,Sink,nca,exist
0,"(0, 356)",0,356,14,1
1,"(0, 1236)",0,1236,14,1
2,"(356, 1236)",356,1236,14,1
3,"(0, 1655)",0,1655,9,1
4,"(0, 1797)",0,1797,4,1


In [26]:
def add_feature(featureL, featureN, df):
    '''
    Adds a feature to the dataframe as a column and returns the result.
    The dataframe passed in is not modified.

    featureL: a list of ((u, v), p) where u and v are the source and sink of the edge and
              p is the value of the feature
    featureN: the name of the feature
    df: the dataframe where the feature is added; it must have a 'Pair' column holding
        the same (u, v) keys that featureL uses

    Rows of df whose pair is missing from featureL get NaN in the new column.
    If featureL lists a pair more than once, the first value is kept so that the join
    cannot multiply rows of df.
    If df already has a column named featureN (e.g. the calling cell is re-run), that
    column is replaced instead of raising on overlapping column names.
    '''
    featureDF = pd.DataFrame(featureL, columns=['Pair', featureN])
    # A non-unique right-hand key would make the left join emit one row per match.
    featureDF = featureDF.drop_duplicates(subset='Pair', keep='first')
    # Drop a stale copy of the column so the join does not fail on the name clash.
    base = df.drop(columns=[featureN], errors='ignore')
    return base.join(featureDF.set_index('Pair'), on='Pair')

In [27]:
# Jaccard's Coefficient
# networkx yields (u, v, score) triples for the pairs in ca.Pair; re-key each one as
# ((u, v), score) so it can be joined onto `ca` through the "Pair" column.
jc = nx.jaccard_coefficient(G, ca.Pair)
jcL = [((triple[0], triple[1]), triple[2]) for triple in jc]
ca = add_feature(jcL, "jc", ca)
ca.head()

Unnamed: 0,Pair,Source,Sink,nca,exist,jc
0,"(0, 356)",0,356,14,1,0.7
1,"(0, 1236)",0,1236,14,1,0.428571
2,"(356, 1236)",356,1236,14,1,0.5
3,"(0, 1655)",0,1655,9,1,0.466667
4,"(0, 1797)",0,1797,4,1,0.7


In [28]:
# Preferential Attachment
# networkx yields (u, v, score) triples for the pairs in ca.Pair; re-key each one as
# ((u, v), score) so it can be joined onto `ca` through the "Pair" column.
pa = nx.preferential_attachment(G, ca.Pair)
paL = list(map(lambda triple: ((triple[0], triple[1]), triple[2]), pa))
ca = add_feature(paL, "pa", ca)
ca.head()

Unnamed: 0,Pair,Source,Sink,nca,exist,jc,pa
0,"(0, 356)",0,356,14,1,0.7,72
1,"(0, 1236)",0,1236,14,1,0.428571,96
2,"(356, 1236)",356,1236,14,1,0.5,108
3,"(0, 1655)",0,1655,9,1,0.466667,112
4,"(0, 1797)",0,1797,4,1,0.7,72


In [29]:
# Katz Index
# Computed in closed form as K = (I - beta * A)^-1 - I, where A is the adjacency matrix adj_G.
# The calculation of the Katz index is referred to "Fast Computation of Katz Index for Efficient
# Processing of Link Prediction Queries": https://arxiv.org/pdf/1912.06525.pdf

I = np.identity(len(G.nodes))
beta = 0.05
# beta is set to 0.05 as it is a commonly accepted value in the research community according to
# Qi et al., 'predicting co-author relationship in medical co-authorship networks'
# NOTE(review): the closed form is only a valid Katz score when beta < 1 / (largest eigenvalue
# of adj_G) — confirm this holds for this graph.

K = inv(I - adj_G*beta) - I

# Row/column position of each node in adj_G (adj_G was built with nodelist=nodeL).
node_pos = {node: pos for pos, node in enumerate(nodeL)}

# Only look up the pairs that are actually in `ca`, instead of materialising every node pair.
# K is symmetric for the undirected graph, so a pair is found in either orientation.
ki_by_pair = {}
for pair in ca.Pair:
    if pair in ki_by_pair:
        continue  # keep one entry per pair so the join cannot multiply rows
    i = node_pos.get(pair[0])
    j = node_pos.get(pair[1])
    if i is None or j is None or i == j:
        continue  # unknown node or self-pair: left out, so it becomes NaN after the join
    ki_by_pair[pair] = K[i, j]
kiL = list(ki_by_pair.items())

ca = add_feature(kiL, "ki", ca)
ca.head()

Unnamed: 0,Pair,Source,Sink,nca,exist,jc,pa,ki
0,"(0, 356)",0,356,14,1,0.7,72,0.079962
1,"(0, 1236)",0,1236,14,1,0.428571,96,0.075137
2,"(356, 1236)",356,1236,14,1,0.5,108,0.074232
3,"(0, 1655)",0,1655,9,1,0.466667,112,0.083302
4,"(0, 1797)",0,1797,4,1,0.7,72,0.081045
