# Testing out link prediction

## Importing useful tools

In [1]:
from graph_utilities import *
from node2vec import *

## Downloading the graph and loading it into the model

In [2]:
# Fetch the graph that every later cell works on. The helper comes from one of
# the star imports above; its name says the data is pulled from the internet
# (presumably the GrQc collaboration network -- TODO confirm in graph_utilities).
grqc = load_grqc_from_internet()

In [6]:
# Configure the Node2Vec wrapper, attach the graph to it, then build the
# underlying model.
# NOTE(review): p and q are presumably node2vec's return / in-out random-walk
# parameters -- confirm against the Node2Vec class.
model = Node2Vec(num_walks=10, dimensions = 100, p = 10, q = 3)
model.load_graph(grqc)
# NOTE(review): `hierarchical_softmaw` looks like a misspelling of
# `hierarchical_softmax`. The recorded output shows no error, so the keyword is
# evidently accepted as written -- check Node2Vec.create_model before renaming.
model.create_model(workers = 8, hierarchical_softmaw = 0)

Simulating walks:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10


## Training Node2Vec model

In [7]:
# Train the embedding model for 30 epochs with 8 workers. verbose=True is
# presumably what prints the per-epoch loss lines recorded below.
model.train(epochs = 30, verbose = True, workers = 8)

epoch 1/30 - loss 18754300.0
epoch 2/30 - loss 12578278.0
epoch 3/30 - loss 10717501.0
epoch 4/30 - loss 9818438.0
epoch 5/30 - loss 8815087.0
epoch 6/30 - loss 7916641.5
epoch 7/30 - loss 6641129.5
epoch 8/30 - loss 5652074.0
epoch 9/30 - loss 4571891.5
epoch 10/30 - loss 4071485.5
epoch 11/30 - loss 3901402.0
epoch 12/30 - loss 3417330.0
epoch 13/30 - loss 3368642.75
epoch 14/30 - loss 3279630.25
epoch 15/30 - loss 3053116.5
epoch 16/30 - loss 3155094.5
epoch 17/30 - loss 3055960.25
epoch 18/30 - loss 3048453.5
epoch 19/30 - loss 2974704.75
epoch 20/30 - loss 2881443.0
epoch 21/30 - loss 2883788.25
epoch 22/30 - loss 2797062.75
epoch 23/30 - loss 2835005.25
epoch 24/30 - loss 2909680.5
epoch 25/30 - loss 2820576.25
epoch 26/30 - loss 2831310.75
epoch 27/30 - loss 2816035.25
epoch 28/30 - loss 2707929.5
epoch 29/30 - loss 2809378.5
epoch 30/30 - loss 2763120.0


In [8]:
# Generate the "fake" edges used as the second class of the link-prediction
# dataset (presumably node pairs that are not connected in grqc -- TODO confirm
# in create_fake_edges). The seed is passed explicitly, presumably so the
# sampling is repeatable.
fake_edges = create_fake_edges(graph = grqc, seed = 1)

In [9]:
# Assemble the classification dataset from the graph, the learned node
# embeddings and the fake edges. The result is indexed as a 2-D array
# downstream, with the last column used as the label.
data = building_dataset(graph = grqc, embedding_dict = model.get_embedding_dictionnary(), edges_fake = fake_edges)

In [10]:
# Column-wise split of the dataset: every column except the last is a feature,
# the last column is the label.
X, y = data[:, :-1], data[:, -1]

## Training a Logistic Regression model

In [11]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [12]:
# Hold out 30% of the samples as a final test set. random_state pins the
# shuffle so the same split is obtained on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 1)

In [13]:
# 5-fold splitter. shuffle is left at its default (False), so each fold is a
# consecutive slice of whatever array is passed to kf.split().
kf = KFold(n_splits = 5)

In [14]:
# 5-fold cross-validation carried out on the training split only; the held-out
# X_test / y_test pair is not used in this cell.
for counter, (train_index, test_index) in enumerate(kf.split(X_train, y_train), start=1):

    # Carve the current fold out of the training split.
    X_train_cv, y_train_cv = X_train[train_index, :], y_train[train_index]
    X_test_cv, y_test_cv = X_train[test_index, :], y_train[test_index]

    # A fresh, default-configured classifier per fold; fit() returns the
    # estimator itself, so construction and fitting can be chained.
    lr = LogisticRegression().fit(X_train_cv, y_train_cv)

    # Report accuracy on the fitting part and on the held-out fold. The
    # "test accuracy" printed here is measured on that fold of X_train.
    print('split {}'.format(counter))
    print('training accuracy {}'.format(lr.score(X_train_cv, y_train_cv)))
    print('test accuracy {}'.format(lr.score(X_test_cv, y_test_cv)))
    print('Confusion Matrix')
    print(confusion_matrix(y_test_cv, lr.predict(X_test_cv)))
    print()
    

split 1
training accuracy 0.7161079615479419
test accuracy 0.6867143209267932
Confusion Matrix
[[1466  578]
 [ 693 1320]]

split 2
training accuracy 0.7118560512694109
test accuracy 0.7022430367266453
Confusion Matrix
[[1421  580]
 [ 628 1428]]

split 3
training accuracy 0.7086517130884891
test accuracy 0.7032289869361598
Confusion Matrix
[[1493  588]
 [ 616 1360]]

split 4
training accuracy 0.710069016514666
test accuracy 0.7084052255361104
Confusion Matrix
[[1477  567]
 [ 616 1397]]

split 5
training accuracy 0.711486319940843
test accuracy 0.7054473749075671
Confusion Matrix
[[1461  569]
 [ 626 1401]]

