In [1]:
import networkx as nx
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [3]:
# Build the undirected graph used for all link-prediction features.
# Each line of training.txt is split on whitespace into: node, node, label.
# Pairs labelled '1' become edges; for any other label only the two nodes are
# registered, so every node named in the file exists in the graph.
# Node ids are stored as the raw string tokens read from the file.
graph = nx.Graph()
with open("training.txt", "r") as training_file:  # closed automatically, even on error
    for raw_line in training_file:
        fields = raw_line.split()
        if fields[2] == '1':
            graph.add_edge(fields[0], fields[1])
        else:
            graph.add_nodes_from([fields[0], fields[1]])

In [4]:
def _read_pairs(path, column_names):
    """Read a space-separated, header-less file and label its columns."""
    frame = pd.read_csv(path, sep=" ", header=None)
    frame.columns = column_names
    return frame


# Training rows carry a label column; test rows are just the node pairs.
df_train = _read_pairs("training.txt", ["node_1", "node_2", "label"])
df_test = _read_pairs("testing.txt", ["node_1", "node_2"])

In [7]:
# Sanity-check the loaded training frame: row count, column dtypes,
# non-null counts and memory footprint.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453797 entries, 0 to 453796
Data columns (total 3 columns):
node_1    453797 non-null int64
node_2    453797 non-null int64
label     453797 non-null int64
dtypes: int64(3)
memory usage: 10.4 MB


In [11]:
# Jaccard coefficient for every node pair in training.txt, in file order.
# The pairs are re-read from the file as string tokens (the same form used when
# the graph was built from this file) rather than taken from the DataFrame.
with open("training.txt", "r") as training_file:  # closed automatically, even on error
    jaccard_pairs = [
        (fields[0], fields[1]) for fields in map(str.split, training_file)
    ]

# One batched call: networkx yields one (u, v, score) triple per requested pair,
# in the order the pairs were supplied.
jaccard = [score for _, _, score in nx.jaccard_coefficient(graph, jaccard_pairs)]



In [12]:
# Attach the scores as a feature column. `jaccard` holds one value per line of
# training.txt, appended in file order, so it lines up with df_train row by row.
df_train["Jaccard"] = jaccard

In [16]:
# Adamic-Adar index for every node pair in training.txt, in file order.
# The pairs are re-read from the file as string tokens (the same form used when
# the graph was built from this file) rather than taken from the DataFrame.
with open("training.txt", "r") as training_file:  # closed automatically, even on error
    aa_pairs = [
        (fields[0], fields[1]) for fields in map(str.split, training_file)
    ]

# One batched call: networkx yields one (u, v, score) triple per requested pair,
# in the order the pairs were supplied.
aa = [score for _, _, score in nx.adamic_adar_index(graph, aa_pairs)]

# One score per training line, so the list lines up with df_train row by row.
df_train["Adamic-Adar"] = aa

In [18]:
# Preferential-attachment score for every node pair in training.txt, in file order.
# The pairs are re-read from the file as string tokens (the same form used when
# the graph was built from this file) rather than taken from the DataFrame.
with open("training.txt", "r") as training_file:  # closed automatically, even on error
    pa_pairs = [
        (fields[0], fields[1]) for fields in map(str.split, training_file)
    ]

# One batched call: networkx yields one (u, v, score) triple per requested pair,
# in the order the pairs were supplied.
pa = [score for _, _, score in nx.preferential_attachment(graph, pa_pairs)]

# One score per training line, so the list lines up with df_train row by row.
df_train["Preferential Attachment"] = pa

In [20]:
# Feature matrix: the three link-prediction scores; target: the edge label.
columns = ["Jaccard", "Adamic-Adar", "Preferential Attachment"]
X, y = df_train[columns], df_train["label"]

# Fixed random_state keeps the fitted forest reproducible between runs.
classifier = RandomForestClassifier(
    n_estimators=30,
    max_depth=10,
    random_state=0,
)
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [22]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_model(predictions, actual):
    """Summarise prediction quality as a two-column table.

    Accuracy, precision and recall are each computed by the matching
    sklearn.metrics scorer, called as ``scorer(actual, predictions)`` with
    its default settings.

    Parameters
    ----------
    predictions : predicted labels.
    actual : ground-truth labels, aligned with ``predictions``.

    Returns
    -------
    pandas.DataFrame with a 'metric' column (the metric name) and a 'value'
    column (its score), one row per metric in the order accuracy,
    precision, recall.
    """
    scorers = [
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ]

    names = []
    scores = []
    for name, scorer in scorers:
        names.append(name)
        scores.append(scorer(actual, predictions))

    return pd.DataFrame(data={'metric': names, 'value': scores})

def feature_importance(columns, classifier):
    """Tabulate feature importances, most important first.

    Parameters
    ----------
    columns : feature names, in the same order as the importances.
    classifier : object exposing ``feature_importances_``.

    Returns
    -------
    pandas.DataFrame with a 'feature' column (name) and a 'value' column
    (importance), sorted by descending importance; equal importances keep
    their original relative order.
    """
    ranked = sorted(
        zip(columns, classifier.feature_importances_),
        key=lambda pair: -pair[1],
    )

    names = []
    scores = []
    for name, score in ranked:
        names.append(name)
        scores.append(score)

    return pd.DataFrame(data={'feature': names, 'value': scores})

In [25]:
# Predict labels for the training rows and summarise accuracy/precision/recall.
# NOTE(review): df_train[columns] and y are the same data the classifier was
# fitted on, so these are training-set metrics and will be optimistic; a
# held-out split or cross-validation is needed to estimate generalisation.
predictions = classifier.predict(df_train[columns])

evaluate_model(predictions, y)

Unnamed: 0,metric,value
0,accuracy,0.897212
1,precision,0.91965
2,recall,0.915529


In [27]:
# Rank the three link-prediction features by the fitted forest's importance
# scores, highest first.
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,Preferential Attachment,0.587175
1,Adamic-Adar,0.237413
2,Jaccard,0.175411
