In [1]:
import networkx as nx
import pandas as pd
import numpy as np

We create a graph object from the data.

In [23]:
G = nx.Graph()
digraph = nx.DiGraph()

with open("training.txt", "r") as f:
    for line in f:
        line = line.split()
        digraph.add_node(line[0])
        digraph.add_node(line[1])
        if line[2] == '1':
            G.add_edge(line[0], line[1])
            digraph.add_edge(line[0], line[1])
        else:
            G.add_nodes_from([line[0], line[1]])


In [3]:
print(nx.info(G))
print(nx.info(digraph))

Name: 
Type: Graph
Number of nodes: 33226
Number of edges: 283623
Average degree:  17.0724
Name: 
Type: DiGraph
Number of nodes: 33226
Number of edges: 283623
Average in degree:   8.5362
Average out degree:   8.5362


In [None]:
df_train = pd.read_csv("training.txt", sep=" ", header=None)
df_train.columns = ["node_1", "node_2", "label"]

We add 7 features fromm the graph topology to better train the model.
1) Jaccard Index
2)Adamic-Adar Index
3)Preferential Attachment
4)Resource Allocation
5)Common Neighbors
6)Salton Index
7)Soresen Index

In [9]:
jaccard = [] #jaccard index
aa = [] #adamic-adar index
pa = [] #preferential attachment
ra = [] #resource allocation


f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u,v,p in nx.jaccard_coefficient(G, [(line[0], line[1])]):
        jaccard.append(p)
    for u, v, p in nx.adamic_adar_index(G, [(line[0], line[1])]):
        aa.append(p)
    for u, v, p in nx.preferential_attachment(G, [(line[0], line[1])]):
        pa.append(p)
    for u, v, p in nx.resource_allocation_index(G, [(line[0], line[1])]):
        ra.append(p)
        
        
df_train["Jaccard"] = jaccard
df_train["Adamic-Adar"] = aa      
df_train["Preferential Attachment"] = pa
df_train["Resource Allocation"] = ra

In [10]:
f = open("training.txt", "r")

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

cn = [] #common neighbors
si = [] #salton index
sorI = [] #sorensen index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    cn.append(inter)
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        si.append(inter / np.sqrt(G.degree(line[0])*G.degree(line[1])))
    else:
        si.append(0)
    sorI.append(2*inter/ (G.degree(line[0]) + G.degree(line[1])))

df_train["Salton Index"] = si
df_train["Sorensen Index"] = sorI
df_train["Common Neighbors"] = cn

df_train.head()

Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors
0,10481,5428,1,0.005618,0.192569,7018,0.005556,0.011937,0.011173,1
1,7353,30328,0,0.0,0.0,36,0.0,0.0,0.0,0
2,8627,3547,1,0.005979,0.60522,24130,0.006253,0.02575,0.011887,4
3,10232,21925,1,0.0,0.0,1368,0.0,0.0,0.0,0
4,7110,3288,1,0.0,0.0,624,0.0,0.0,0.0,0


### New Features with neighborhood - dispersion, Hub Promoted Inde, Hub Depressed Index, Leicht–Holme–Newman Index

In [None]:
f = open("training.txt", "r")
   
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))
hpi = [] #Hub Promoted Index
hdi = [] #Hub Depressed Index
lhni = [] #Leicht–Holme–Newman Index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        hpi.append(inter / np.minimum(G.degree(line[0]),G.degree(line[1])))
        hdi.append(inter / np.maximum(G.degree(line[0]),G.degree(line[1])))
        lhni.append(inter / G.degree(line[0]) * G.degree(line[1]))
    else:
        hpi.append(0)
        hdi.append(0)
        lhni.append(0)

df_train["Hub Promoted Index"] = hpi
df_train["Hub Depressed Index"] = hdi
df_train["Leicht–Holme–Newman Index"] = lhni

df_train.head()

In [None]:
f = open("training.txt", "r")

disp = [] #dispersion

for line in f:
    line = line.split()
    disp.append(nx.dispersion(digraph,line[0], line[1]))

df_train["Dispersion"] = disp

In [12]:
df_train.to_csv("df_train.csv")
#df_train = pd.read_csv("df_train.csv")

In [5]:
df_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Common Neighbors,Salton Index,Sorensen Index
0,0,0,10481,5428,1,0.005618,0.192569,7018,0.005556,1,0.011937,0.011173
1,1,1,7353,30328,0,0.0,0.0,36,0.0,0,0.0,0.0
2,2,2,8627,3547,1,0.005979,0.60522,24130,0.006253,4,0.02575,0.011887
3,3,3,10232,21925,1,0.0,0.0,1368,0.0,0,0.0,0.0
4,4,4,7110,3288,1,0.0,0.0,624,0.0,0,0.0,0.0


# Idem for the testing data (testing.txt)

In [None]:
df_test = pd.read_csv("testing.txt", sep=" ", header=None)
df_test.columns = ["node_1", "node_2"]

In [7]:
jaccard = [] #jaccard index
aa = [] #adamic-adar index
pa = [] #preferential attachment
ra = [] #resource allocation


f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u,v,p in nx.jaccard_coefficient(G, [(line[0], line[1])]):
        jaccard.append(p)
    for u, v, p in nx.adamic_adar_index(G, [(line[0], line[1])]):
        aa.append(p)
    for u, v, p in nx.preferential_attachment(G, [(line[0], line[1])]):
        pa.append(p)
    for u, v, p in nx.resource_allocation_index(G, [(line[0], line[1])]):
        ra.append(p)
        
        
df_test["Jaccard"] = jaccard
df_test["Adamic-Adar"] = aa      
df_test["Preferential Attachment"] = pa
df_test["Resource Allocation"] = ra

In [8]:
f = open("testing.txt", "r")

def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))

cn = [] #common neighbors
si = [] #salton index
sorI = [] #sorensen index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    cn.append(inter)
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        si.append(inter / np.sqrt(G.degree(line[0])*G.degree(line[1])))
    else:
        si.append(0)
    sorI.append(2*inter/ (G.degree(line[0]) + G.degree(line[1])))

df_test["Salton Index"] = si
df_test["Sorensen Index"] = sorI
df_test["Common Neighbors"] = cn

df_test.head()

Unnamed: 0,node_1,node_2,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors
0,870,10284,0.0,0.0,2387,0.0,0.0,0.0,0
1,620,15300,0.045614,2.305634,4512,0.046293,0.193535,0.087248,13
2,21115,31904,0.0,0.0,60,0.0,0.0,0.0,0
3,3021,28396,0.0,0.0,32,0.0,0.0,0.0,0
4,10780,6135,0.195652,2.37006,450,0.202136,0.424264,0.327273,9


# New features

In [32]:
f = open("testing.txt", "r")
   
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))
hpi = [] #Hub Promoted Index
hdi = [] #Hub Depressed Index
lhni = [] #Leicht–Holme–Newman Index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        hpi.append(inter / np.minimum(G.degree(line[0]),G.degree(line[1])))
        hdi.append(inter / np.maximum(G.degree(line[0]),G.degree(line[1])))
        lhni.append(inter / G.degree(line[0]) * G.degree(line[1]))
    else:
        hpi.append(0)
        hdi.append(0)
        lhni.append(0)

df_test["Hub Promoted Index"] = hpi
df_test["Hub Depressed Index"] = hdi
df_test["Leicht–Holme–Newman Index"] = lhni

df_test.head()

Unnamed: 0.1,Unnamed: 0,node_1,node_2,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors,Hub Promoted Index,Hub Depressed Index,Leicht–Holme–Newman Index
0,0,870,10284,0.0,0.0,2387,0.0,0.0,0.0,0,0.0,0.0,0.0
1,1,620,15300,0.045614,2.305634,4512,0.046293,0.193535,0.087248,13,0.8125,0.046099,0.737589
2,2,21115,31904,0.0,0.0,60,0.0,0.0,0.0,0,0.0,0.0,0.0
3,3,3021,28396,0.0,0.0,32,0.0,0.0,0.0,0,0.0,0.0,0.0
4,4,10780,6135,0.195652,2.37006,450,0.202136,0.424264,0.327273,9,0.9,0.2,2.0


In [33]:
f = open("testing.txt", "r")

disp = [] #dispersion

for line in f:
    line = line.split()
    disp.append(nx.dispersion(digraph,line[0], line[1]))

df_test["Dispersion"] = disp

In [34]:
df_test.info()
df_test.to_csv("df_test.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113450 entries, 0 to 113449
Data columns (total 14 columns):
Unnamed: 0                   113450 non-null int64
node_1                       113450 non-null int64
node_2                       113450 non-null int64
Jaccard                      113450 non-null float64
Adamic-Adar                  113450 non-null float64
Preferential Attachment      113450 non-null int64
Resource Allocation          113450 non-null float64
Salton Index                 113450 non-null float64
Sorensen Index               113450 non-null float64
Common Neighbors             113450 non-null int64
Hub Promoted Index           113450 non-null float64
Hub Depressed Index          113450 non-null float64
Leicht–Holme–Newman Index    113450 non-null float64
Dispersion                   113450 non-null float64
dtypes: float64(9), int64(5)
memory usage: 12.1 MB


# Draft - ignore this part

In [36]:
#df_test = pd.read_csv("df_test.csv")
df_test.head()

Unnamed: 0.1,Unnamed: 0,node_1,node_2,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors,Hub Promoted Index,Hub Depressed Index,Leicht–Holme–Newman Index,Dispersion
0,0,870,10284,0.0,0.0,2387,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
1,1,620,15300,0.045614,2.305634,4512,0.046293,0.193535,0.087248,13,0.8125,0.046099,0.737589,0.0
2,2,21115,31904,0.0,0.0,60,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
3,3,3021,28396,0.0,0.0,32,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,4,10780,6135,0.195652,2.37006,450,0.202136,0.424264,0.327273,9,0.9,0.2,2.0,0.0


In [35]:
df_test.to_csv("df_test.csv")

In [3]:
df_train = pd.read_csv("df_train.csv")

Calculate dispersion between u and v in G.

A link between two actors (u and v) has a high dispersion when their mutual ties (s and t) are not well connected with each other:

In [6]:
f = open("training.txt", "r")

disp = [] #dispersion

for line in f:
    line = line.split()
    disp.append(nx.dispersion(digraph,line[0], line[1]))

df_train["Dispersion"] = disp
    

In [10]:
df_train["Dispersion"].describe()

count    453797.000000
mean          0.317187
std           3.006946
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         331.210894
Name: Dispersion, dtype: float64

In [29]:
f = open("training.txt", "r")
   
def intersection(lst1, lst2): 
    return list(set(lst1) & set(lst2))
hpi = [] #Hub Promoted Index
hdi = [] #Hub Depressed Index
lhni = [] #Leicht–Holme–Newman Index

for line in f:
    line = line.split()
    n1 = G.neighbors(line[0])
    n2 = G.neighbors(line[1])
    inter = len(intersection(n1,n2))
    if G.degree(line[0]) != 0 and G.degree(line[1]) != 0:
        hpi.append(inter / np.minimum(G.degree(line[0]),G.degree(line[1])))
        hdi.append(inter / np.maximum(G.degree(line[0]),G.degree(line[1])))
        lhni.append(inter / G.degree(line[0]) * G.degree(line[1]))
    else:
        hpi.append(0)
        hdi.append(0)
        lhni.append(0)

df_train["Hub Promoted Index"] = hpi
df_train["Hub Depressed Index"] = hdi
df_train["Leicht–Holme–Newman Index"] = lhni
df_train.head()


Unnamed: 0.1,Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Salton Index,Sorensen Index,Common Neighbors,Dispersion,Hub Promoted Index,Hub Depressed Index,Leicht–Holme–Newman Index
0,0,10481,5428,1,0.005618,0.192569,7018,0.005556,0.011937,0.011173,1,0.0,0.017241,0.008264,0.479339
1,1,7353,30328,0,0.0,0.0,36,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
2,2,8627,3547,1,0.005979,0.60522,24130,0.006253,0.02575,0.011887,4,0.0,0.105263,0.006299,0.23937
3,3,10232,21925,1,0.0,0.0,1368,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0
4,4,7110,3288,1,0.0,0.0,624,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0


In [30]:
df_train.to_csv("df_train.csv")