In [1]:
import networkx as nx
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('df_train.csv')
df_train["id"] = np.arange(453797)
df_train.head()


Unnamed: 0.1,Unnamed: 0,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,id
0,0,10481,5428,1,0.005618,0.192569,7018,0.005556,0
1,1,7353,30328,0,0.0,0.0,36,0.0,1
2,2,8627,3547,1,0.005979,0.60522,24130,0.006253,2
3,3,10232,21925,1,0.0,0.0,1368,0.0,3
4,4,7110,3288,1,0.0,0.0,624,0.0,4


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453797 entries, 0 to 453796
Data columns (total 9 columns):
Unnamed: 0                 453797 non-null int64
node_1                     453797 non-null int64
node_2                     453797 non-null int64
label                      453797 non-null int64
Jaccard                    453797 non-null float64
Adamic-Adar                453797 non-null float64
Preferential Attachment    453797 non-null int64
Resource Allocation        453797 non-null float64
id                         453797 non-null int64
dtypes: float64(3), int64(6)
memory usage: 31.2 MB


#### Evaluation metrics definitions:

In [4]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    f1 = 2 / ((1/precision) + (1/recall))
    
    metrics = ["accuracy", "precision", "recall", "F1"]
    values = [accuracy, precision, recall, f1]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})

def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

### Random Forest classifier:

In [5]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=20, max_depth=20, 
                                    random_state=0)
columns = ["Jaccard", "Adamic-Adar", "Preferential Attachment", "Resource Allocation"]

X = df_train[columns][:-90000]
y = df_train["label"][:-90000]
classifier.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [14]:
from sklearn.model_selection import cross_val_score

cv = np.mean(cross_val_score(classifier, X, y, cv=5))
print("Accuracy using RF with 5 cross validation: {}%".format(round(cv * 100, 2)))

Accuracy using RF with 5 cross validation: 89.46%


In [18]:
predictions = classifier.predict(X)

print("Result for training data:")
evaluate_model(predictions, y)

Result for training data:


Unnamed: 0,metric,value
0,accuracy,0.911989
1,precision,0.935505
2,recall,0.922845
3,F1,0.929132


In [19]:
predictions = classifier.predict(df_train[columns][-90000:])

print("Result for testing data:")
evaluate_model(predictions, y[-90000:])

Result for testing data:


Unnamed: 0,metric,value
0,accuracy,0.529867
1,precision,0.626443
2,recall,0.619559
3,F1,0.622982


In [20]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,Preferential Attachment,0.544817
1,Resource Allocation,0.248271
2,Adamic-Adar,0.168727
3,Jaccard,0.038185


### MLP Classifier

In [21]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X, y)

cv = np.mean(cross_val_score(clf, X, y, cv=5))
print("Accuracy using RF with 5 cross validation: {}%".format(round(cv * 100, 2)))



Accuracy using RF with 5 cross validation: 62.52%


In [22]:
predictions = clf.predict(X)

print("Training results: ")
evaluate_model(predictions, y)

Training results: 


Unnamed: 0,metric,value
0,accuracy,0.625176
1,precision,0.625176
2,recall,1.0
3,F1,0.769364


In [23]:
predictions = clf.predict(df_train[columns][-90000:])

print("Test results: ")
evaluate_model(predictions, y[-90000:])

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.626933
1,precision,0.626933
2,recall,1.0
3,F1,0.770693


### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X, y)

predictions = clf.predict(df_train[columns][-90000:])

evaluate_model(predictions, y[-90000:])

Unnamed: 0,metric,value
0,accuracy,0.506911
1,precision,0.626163
2,recall,0.529792
3,F1,0.57396


### Support Vector Machine

from sklearn import svm
clf = svm.SVC(kernel = 'rbf')
clf.fit(X, y)

predictions = clf.predict(df_train[columns][-90000:])

evaluate_model(predictions, y[-90000:])

#### Tpot

In [10]:
from tpot import TPOTClassifier



In [11]:
pipeline_optimizer = TPOTClassifier()

In [12]:
pipeline_optimizer = TPOTClassifier(generations=2, population_size=10, cv=3,
                                    verbosity=1)

In [13]:
pipeline_optimizer.fit(X, y)



TPOT closed during evaluation in one generation.


TPOT closed prematurely. Will use the current best pipeline.
Best pipeline: BernoulliNB(RobustScaler(input_matrix), alpha=0.01, fit_prior=False)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=3,
               disable_update_check=False, early_stop=None, generations=2,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=10,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=1, warm_start=False)