In [1]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

We create the graph object with the networkx library and then the dataframes.

In [3]:
graph = nx.Graph()
f = open("training.txt", "r")
for line in f:
    line = line.split()
    if line[2] == '1':
        graph.add_edge(line[0], line[1])
    else:
        graph.add_nodes_from([line[0], line[1]])

In [4]:
df_train = pd.read_csv("training.txt", sep=" ", header=None)
df_train.columns = ["node_1", "node_2", "label"]

df_test = pd.read_csv("testing.txt", sep=" ", header=None)
df_test.columns = ["node_1", "node_2"]

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453797 entries, 0 to 453796
Data columns (total 3 columns):
node_1    453797 non-null int64
node_2    453797 non-null int64
label     453797 non-null int64
dtypes: int64(3)
memory usage: 10.4 MB


We make the feature extraction here, for the MVP we use the Jaccard coefficient, the Adamic-Adar index and the preferential attachment (https://networkx.github.io/documentation/stable/reference/algorithms/link_prediction.html ).

In [11]:
jaccard = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u,v,p in nx.jaccard_coefficient(graph, [(line[0], line[1])]):
        jaccard.append(p)



In [12]:
df_train["Jaccard"] = jaccard

In [16]:
aa = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.adamic_adar_index(graph, [(line[0], line[1])]):
        aa.append(p)
df_train["Adamic-Adar"] = aa

In [18]:
pa = []
f = open("training.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.preferential_attachment(graph, [(line[0], line[1])]):
        pa.append(p)
        
df_train["Preferential Attachment"] = pa

We train the RandomForestClassifier with $~80\%$ of the data.

In [46]:
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, 
                                    random_state=0)
columns = ["Jaccard", "Adamic-Adar", "Preferential Attachment"]

X = df_train[columns][:-90700]
y = df_train["label"][:-90700]
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})

def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [51]:
predictions = classifier.predict(df_train[columns][-90700:])

evaluate_model(predictions, y[-90700:])

Unnamed: 0,metric,value
0,accuracy,0.528765
1,precision,0.625333
2,recall,0.619427


In [52]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,Preferential Attachment,0.580388
1,Adamic-Adar,0.244622
2,Jaccard,0.17499


Calculating feature extraction for testing data

In [None]:
aa = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.adamic_adar_index(graph, [(line[0], line[1])]):
        aa.append(p)
df_test["Adamic-Adar"] = aa

In [30]:
pa = []
f = open("testing.txt", "r")

for line in f:
    line = line.split()
    for u, v, p in nx.preferential_attachment(graph, [(line[0], line[1])]):
        pa.append(p)
        
df_test["Preferential Attachment"] = pa

Unnamed: 0,node_1,node_2,Jaccard,Adamic-Adar,Preferential Attachment
0,870,10284,0.0,0.0,2387
1,620,15300,0.045614,2.305634,4512
2,21115,31904,0.0,0.0,60
3,3021,28396,0.0,0.0,32
4,10780,6135,0.195652,2.37006,450


In [33]:
predictions = classifier.predict(df_test[columns])

In [50]:
df_train[columns][:-90700].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 363097 entries, 0 to 363096
Data columns (total 3 columns):
Jaccard                    363097 non-null float64
Adamic-Adar                363097 non-null float64
Preferential Attachment    363097 non-null int64
dtypes: float64(2), int64(1)
memory usage: 8.3 MB


In [49]:
df_train[-90700:].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90700 entries, 363097 to 453796
Data columns (total 6 columns):
node_1                     90700 non-null int64
node_2                     90700 non-null int64
label                      90700 non-null int64
Jaccard                    90700 non-null float64
Adamic-Adar                90700 non-null float64
Preferential Attachment    90700 non-null int64
dtypes: float64(2), int64(4)
memory usage: 4.2 MB


In [53]:
df_train.to_csv('df_train.csv')

In [54]:
df_test.to_csv('df_test.csv')