In [1]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

We create the graph object with the networkx library and then the dataframes.

In [3]:
graph = nx.Graph()
f = open("training.txt", "r")
for line in f:
    line = line.split()
    if line[2] == '1':
        graph.add_edge(line[0], line[1])
    else:
        graph.add_nodes_from([line[0], line[1]])

In [4]:
df_train = pd.read_csv("training.txt", sep=" ", header=None)
df_train.columns = ["node_1", "node_2", "label"]

df_test = pd.read_csv("testing.txt", sep=" ", header=None)
df_test.columns = ["node_1", "node_2"]

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453797 entries, 0 to 453796
Data columns (total 3 columns):
node_1    453797 non-null int64
node_2    453797 non-null int64
label     453797 non-null int64
dtypes: int64(3)
memory usage: 10.4 MB


We make the feature extraction here, for the MVP we use the Jaccard coefficient, the Adamic-Adar index and the preferential attachment (https://networkx.github.io/documentation/stable/reference/algorithms/link_prediction.html ).

In [11]:
jaccard = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u,v,p in nx.jaccard_coefficient(graph, [(line[0], line[1])]):
        jaccard.append(p)



In [12]:
df_train["Jaccard"] = jaccard

In [16]:
aa = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.adamic_adar_index(graph, [(line[0], line[1])]):
        aa.append(p)
df_train["Adamic-Adar"] = aa

In [18]:
pa = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.preferential_attachment(graph, [(line[0], line[1])]):
        pa.append(p)
        
df_train["Preferential Attachment"] = pa

In [55]:
ra = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.resource_allocation_index(graph, [(line[0], line[1])]):
        ra.append(p)

df_train["Resource Allocation"] = ra

We train the RandomForestClassifier with $~80\%$ of the data.

In [75]:
classifier = RandomForestClassifier(n_estimators=20, max_depth=20, 
                                    random_state=0)
columns = ["Jaccard", "Adamic-Adar", "Preferential Attachment", "Resource Allocation"]

X = df_train[columns][:-90700]
y = df_train["label"][:-90700]
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})

def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [76]:
predictions = classifier.predict(df_train[columns][-90700:])

evaluate_model(predictions, y[-90700:])

Unnamed: 0,metric,value
0,accuracy,0.528787
1,precision,0.625569
2,recall,0.618583


In [58]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,Preferential Attachment,0.610625
1,Resource Allocation,0.252877
2,Adamic-Adar,0.118037
3,Jaccard,0.018461


Calculating feature extraction for testing data

In [None]:
aa = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.adamic_adar_index(graph, [(line[0], line[1])]):
        aa.append(p)
df_test["Adamic-Adar"] = aa

In [30]:
pa = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.preferential_attachment(graph, [(line[0], line[1])]):
        pa.append(p)
        
df_test["Preferential Attachment"] = pa

In [60]:
ra = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.resource_allocation_index(graph, [(line[0], line[1])]):
        ra.append(p)
        
df_test["Resource Allocation"] = ra

In [33]:
predictions = classifier.predict(df_test[columns])

In [64]:
df_train[columns][:-90700].head()

Unnamed: 0,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation
0,0.005618,0.192569,7018,0.005556
1,0.0,0.0,36,0.0
2,0.005979,0.60522,24130,0.006253
3,0.0,0.0,1368,0.0
4,0.0,0.0,624,0.0


In [63]:
df_train[-90700:].head()

Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation
363097,1722,2038,0,0.0,0.0,341,0.0
363098,23473,5027,1,0.0,0.0,14280,0.0
363099,7651,4147,0,0.011905,0.108296,400,9.8e-05
363100,2511,16436,0,0.047619,0.127792,112,0.0004
363101,26610,20115,0,0.0,0.0,42,0.0


In [61]:
df_train.to_csv('df_train.csv')

In [62]:
df_test.to_csv('df_test.csv')

In [78]:
2 / ((1/0.617387)+(1/0.625361))

0.6213484177113944