In [1]:
import networkx as nx
import pandas as pd
import numpy as np

We create a graph object from the data.

In [2]:
G = nx.Graph()
digraph = nx.DiGraph()

with open("training.txt", "r") as f:
    for line in f:
        line = line.split()
        digraph.add_node(line[0])
        digraph.add_node(line[1])
        if line[2] == '1':
            G.add_edge(line[0], line[1])
            digraph.add_edge(line[0], line[1])
        else:
            G.add_nodes_from([line[0], line[1]])


In [5]:
print(nx.info(G))
print(nx.info(digraph))

Name: 
Type: Graph
Number of nodes: 33226
Number of edges: 283623
Average degree:  17.0724
Name: 
Type: DiGraph
Number of nodes: 33226
Number of edges: 283623
Average in degree:   8.5362
Average out degree:   8.5362


In [6]:
df_train = pd.read_csv("training.txt", sep=" ", header=None)
df_train.columns = ["node_1", "node_2", "label"]

df_test = pd.read_csv("testing.txt", sep=" ", header=None)
df_test.columns = ["node_1", "node_2"]

We add 7 features fromm the graph topology to better train the model.
1) Jaccard Index
2)Adamic-Adar Index
3)Preferential Attachment
4)Resource Allocation
5)Common Neighbors
6)Salton Index
7)Soresen Index

In [9]:
jaccard = [] #jaccard index
aa = [] #adamic-adar index
pa = [] #preferential attachment
ra = [] #resource allocation


f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u,v,p in nx.jaccard_coefficient(G, [(line[0], line[1])]):
        jaccard.append(p)
    for u, v, p in nx.adamic_adar_index(G, [(line[0], line[1])]):
        aa.append(p)
    for u, v, p in nx.preferential_attachment(G, [(line[0], line[1])]):
        pa.append(p)
    for u, v, p in nx.resource_allocation_index(G, [(line[0], line[1])]):
        ra.append(p)
        
        
df_train["Jaccard"] = jaccard
df_train["Adamic-Adar"] = aa      
df_train["Preferential Attachment"] = pa
df_train["Resource Allocation"] = ra

In [10]:
f = open("training.txt", "r")

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

cn = [] #common neighbors
si = [] #salton index
sorI = [] #sorensen index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    cn.append(inter)
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        si.append(inter / np.sqrt(G.degree(line[0])*G.degree(line[1])))
    else:
        si.append(0)
    sorI.append(2*inter/ (G.degree(line[0]) + G.degree(line[1])))

df_train["Salton Index"] = si
df_train["Sorensen Index"] = sorI
df_train["Common Neighbors"] = cn

df_train.head()

Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors
0,10481,5428,1,0.005618,0.192569,7018,0.005556,0.011937,0.011173,1
1,7353,30328,0,0.0,0.0,36,0.0,0.0,0.0,0
2,8627,3547,1,0.005979,0.60522,24130,0.006253,0.02575,0.011887,4
3,10232,21925,1,0.0,0.0,1368,0.0,0.0,0.0,0
4,7110,3288,1,0.0,0.0,624,0.0,0.0,0.0,0


In [12]:
df_train.to_csv("df_train.csv")
#df_train = pd.read_csv("df_train.csv")

In [5]:
df_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Common Neighbors,Salton Index,Sorensen Index
0,0,0,10481,5428,1,0.005618,0.192569,7018,0.005556,1,0.011937,0.011173
1,1,1,7353,30328,0,0.0,0.0,36,0.0,0,0.0,0.0
2,2,2,8627,3547,1,0.005979,0.60522,24130,0.006253,4,0.02575,0.011887
3,3,3,10232,21925,1,0.0,0.0,1368,0.0,0,0.0,0.0
4,4,4,7110,3288,1,0.0,0.0,624,0.0,0,0.0,0.0


In [None]:
lp = [] #local path
epsilon = 0.001

f = open("training.txt", "r")

for line in f:
    line = line.split()
    path2 = len(list(nx.all_simple_paths(G, 
                                     source = line[0],target=line[1], cutoff=2)))
    path3 = len(list(nx.all_simple_paths(G, 
                                     source = line[0],target=line[1], cutoff=3)))
    lp.append(path2 + epsilon*path3)

df_train["Local Path"] = lp
    


In [3]:
A = nx.adjacency_matrix(G)

In [4]:
A2 = A * A 

In [None]:
A3 = np.dot(A) A * A2 

Idem for the testing data (testing.txt)

In [None]:
aa = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.adamic_adar_index(graph, [(line[0], line[1])]):
        aa.append(p)
df_test["Adamic-Adar"] = aa

pa = []

for line in f:
    line = line.split()
    for u, v, p in nx.preferential_attachment(graph, [(line[0], line[1])]):
        pa.append(p)
        
df_test["Preferential Attachment"] = pa

ra = []

for line in f:
    line = line.split()
    for u, v, p in nx.resource_allocation_index(graph, [(line[0], line[1])]):
        ra.append(p)
        
df_test["Resource Allocation"] = ra

In [5]:
df_train = pd.read_csv("df_train.csv")
df_train

Unnamed: 0.1,Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation
0,0,10481,5428,1,0.005618,0.192569,7018,0.005556
1,1,7353,30328,0,0.000000,0.000000,36,0.000000
2,2,8627,3547,1,0.005979,0.605220,24130,0.006253
3,3,10232,21925,1,0.000000,0.000000,1368,0.000000
4,4,7110,3288,1,0.000000,0.000000,624,0.000000
...,...,...,...,...,...,...,...,...
453792,453792,11186,4520,0,0.007905,0.428844,2684,0.041824
453793,453793,12892,31446,0,0.000000,0.000000,80,0.000000
453794,453794,16857,23822,0,0.058824,0.156530,81,0.001681
453795,453795,5520,6394,1,0.004854,0.203556,4706,0.007353


In [19]:
df_train.to_csv("df_train.csv")

In [3]:
df_train = pd.read_csv("df_train.csv")

In [None]:
f = open("training.txt", "r")

conec = []

for line in f:
    line = line.split()
    conec.append(nx.node_connectivity(G,line[0],line[1]))

df_train["Node Connectivity"] = conec
    