In [1]:
import networkx as nx
import pandas as pd
import numpy as np

In [7]:
df_train = pd.read_csv('df_train.csv')
df_train["id"] = np.arange(453797)
df_train.head()


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,node_1,node_2,label,Jaccard,Adamic-Adar,Preferential Attachment,Resource Allocation,Common Neighbors,Salton Index,Sorensen Index,id
0,0,0,10481,5428,1,0.005618,0.192569,7018,0.005556,1,0.011937,0.011173,0
1,1,1,7353,30328,0,0.0,0.0,36,0.0,0,0.0,0.0,1
2,2,2,8627,3547,1,0.005979,0.60522,24130,0.006253,4,0.02575,0.011887,2
3,3,3,10232,21925,1,0.0,0.0,1368,0.0,0,0.0,0.0,3
4,4,4,7110,3288,1,0.0,0.0,624,0.0,0,0.0,0.0,4


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453797 entries, 0 to 453796
Data columns (total 13 columns):
Unnamed: 0                 453797 non-null int64
Unnamed: 0.1               453797 non-null int64
node_1                     453797 non-null int64
node_2                     453797 non-null int64
label                      453797 non-null int64
Jaccard                    453797 non-null float64
Adamic-Adar                453797 non-null float64
Preferential Attachment    453797 non-null int64
Resource Allocation        453797 non-null float64
Common Neighbors           453797 non-null int64
Salton Index               453157 non-null float64
Sorensen Index             453797 non-null float64
id                         453797 non-null int64
dtypes: float64(5), int64(8)
memory usage: 45.0 MB


#### Evaluation metrics definitions:

In [4]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    f1 = 2 / ((1/precision) + (1/recall))
    
    metrics = ["accuracy", "precision", "recall", "F1"]
    values = [accuracy, precision, recall, f1]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})

def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [8]:
columns = ["Jaccard", "Adamic-Adar", "Preferential Attachment", "Resource Allocation",
           "Common Neighbors","Salton Index","Sorensen Index"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_train[columns], df_train["label"], test_size=0.25, random_state=42)


### Random Forest classifier:

In [9]:
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(n_estimators=20, max_depth=20, 
                                    random_state=0)


classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=20, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [11]:
from sklearn.model_selection import cross_val_score

cv = np.mean(cross_val_score(classifier, X_train, y_train, cv=5))
print("Accuracy using RF with 5 cross validation: {}%".format(round(cv * 100, 2)))

Accuracy using RF with 5 cross validation: 89.42%


In [14]:
predictions = classifier.predict(X_train)

print("Result for training data:")
evaluate_model(predictions, y_train)

Result for training data:


Unnamed: 0,metric,value
0,accuracy,0.911517
1,precision,0.934726
2,recall,0.922827
3,F1,0.928738


In [15]:
predictions = classifier.predict(X_test)

print("Result for testing data:")
evaluate_model(predictions, y_test)

Result for testing data:


Unnamed: 0,metric,value
0,accuracy,0.893574
1,precision,0.917703
2,recall,0.911626
3,F1,0.914654


In [16]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,Preferential Attachment,0.531621
1,Resource Allocation,0.179626
2,Adamic-Adar,0.104628
3,Jaccard,0.051069
4,Sorensen Index,0.050823
5,Salton Index,0.044345
6,Common Neighbors,0.037887


### MLP Classifier

In [23]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2))
clf.fit(X_train, y_train)

cv = np.mean(cross_val_score(clf, X_train, y_train, cv=5))
print("Accuracy using RF with 5 cross validation: {}%".format(round(cv * 100, 2)))



Accuracy using RF with 5 cross validation: 67.81%


In [24]:
predictions = clf.predict(X_train)

print("Training results: ")
evaluate_model(predictions, y_train)

Training results: 


Unnamed: 0,metric,value
0,accuracy,0.390428
1,precision,0.629018
2,recall,0.059449
3,F1,0.108632


In [25]:
predictions = clf.predict(X_test)

print("Test results: ")
evaluate_model(predictions, y_test)

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.391106
1,precision,0.642857
2,recall,0.059982
3,F1,0.109726


### Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train, y_train)

cv = np.mean(cross_val_score(clf, X_train, y_train, cv=5))
print("Accuracy using RF with 5 cross validation: {}%".format(round(cv * 100, 2)))

Accuracy using RF with 5 cross validation: 85.82%


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [27]:
predictions = clf.predict(X_train)

print("Training results: ")
evaluate_model(predictions, y_train)

Training results: 


Unnamed: 0,metric,value
0,accuracy,0.860554
1,precision,0.96071
2,recall,0.809943
3,F1,0.878908


In [28]:
predictions = clf.predict(X_test)

print("Test results: ")
evaluate_model(predictions, y_test)

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.861208
1,precision,0.961493
2,recall,0.810599
3,F1,0.879621


### Ada BoostClassifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=50)
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

0.8939758521069463

In [30]:
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print("Test results: ")
evaluate_model(predictions, y_test)

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.892534
1,precision,0.90623
2,recall,0.9238
3,F1,0.91493


### GradientBoostingClassifier

In [31]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0)
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

0.8949836460222291

In [32]:
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

print("Test results: ")
evaluate_model(predictions, y_test)

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.89465
1,precision,0.920609
2,recall,0.910076
3,F1,0.915312


### Voting Classifier

In [33]:
from sklearn.ensemble import VotingClassifier

clf1 = RandomForestClassifier(n_estimators=20, max_depth=20, 
                                    random_state=0)
clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0)
#clf3 = AdaBoostClassifier(n_estimators=50)
#('ABC', clf3)

eclf1 = VotingClassifier(estimators=[
    ('RF', clf1), ('mlpC', clf2)], voting='soft', weights=[1,2])

eclf1 = eclf1.fit(X_train, y_train)
predictions = eclf1.predict(X_test)

print("Test results: ")
evaluate_model(predictions, y_test)

Test results: 


Unnamed: 0,metric,value
0,accuracy,0.895602
1,precision,0.920863
2,recall,0.911443
3,F1,0.916128


### Ensemble

from sklearn import svm
clf = svm.SVC(kernel = 'rbf')
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

evaluate_model(predictions, y_test)

#### Tpot

In [10]:
from tpot import TPOTClassifier



In [11]:
pipeline_optimizer = TPOTClassifier()

In [12]:
pipeline_optimizer = TPOTClassifier(generations=2, population_size=10, cv=3,
                                    verbosity=1)

In [41]:
pipeline_optimizer.fit(X, y)

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=True, criterion=entropy, max_features=0.25, min_samples_leaf=14, min_samples_split=15, n_estimators=100)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=3,
               disable_update_check=False, early_stop=None, generations=2,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=1, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=10,
               random_state=None, scoring=None, subsample=1.0, template=None,
               use_dask=False, verbosity=1, warm_start=False)

In [42]:
print(pipeline_optimizer.score(df_train[columns][-90000:], y[-90000:]))

0.5285111111111112


In [44]:
pred = pipeline_optimizer.predict(df_train[columns][-90000:])

In [45]:
evaluate_model(pred, y[-90000:])

Unnamed: 0,metric,value
0,accuracy,0.528511
1,precision,0.626405
2,recall,0.614349
3,F1,0.620318
