In [1]:
import sys
sys.path.insert(0, '../')
from utils.GraphAnalytics import GraphAnalytics
from models.Node2Vec import GraphNode2Vec

In [2]:
# Location of the serialized graph to analyse. Point this at '../data/_tmp'
# to load that path instead.
GRAPH_PATH = '../data/full_graph'

# Build the project's graph wrapper and populate it from disk.
g = GraphAnalytics()
g.load_graph(GRAPH_PATH)

In [None]:
import pandas as pd

# Fit the project's node2vec wrapper on the undirected view of the graph.
mod = GraphNode2Vec(walk_length=1, workers=-1)
mod.fit(g.as_undigraph)

# Tabulate the learned embeddings and flip rows/columns.
# NOTE(review): presumably mod.embeddings maps node id -> vector, so the
# transpose yields one row per node — confirm against GraphNode2Vec.
embeddings_by_column = pd.DataFrame(mod.embeddings)
random_df = embeddings_by_column.T

In [None]:
# Work on an independent (deep) copy so random_df itself stays untouched.
em_df = random_df.copy(deep=True)

In [None]:
import sys
sys.path.insert(0, '../')
from models.GCN import GraphGCN

In [None]:
import numpy as np

In [None]:
# Display how many nodes the loaded graph contains.
len(g.nodes)

In [None]:
# Preview the first five edges, requesting their data alongside the endpoints.
list(g.edges(data=True))[:5]

In [None]:
# Inspect what the graph stores under node id 21966.
# NOTE(review): presumably this returns the node's neighbours, networkx-style — confirm.
g[21966]

### StellarGraph implementation

Reference: StellarGraph HinSAGE link-prediction demo — https://stellargraph.readthedocs.io/en/stable/demos/link-prediction/hinsage-link-prediction.html

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from stellargraph.data import EdgeSplitter
import stellargraph as sg
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_regression
from tensorflow.keras import Model, optimizers, losses, metrics

import multiprocessing
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from stellargraph import StellarGraph
import networkx as nx
# Attach the fitted node2vec embeddings to a copy of the undirected graph as
# the 'node2vec_embeddings' node attribute, then convert that copy into a
# StellarGraph that uses the attribute as its node features.
nx_with_embeddings = g.as_undigraph.copy()
nx.set_node_attributes(nx_with_embeddings, mod.embeddings, 'node2vec_embeddings')

st_g = StellarGraph.from_networkx(
    nx_with_embeddings,
    node_features='node2vec_embeddings',
)
print(st_g.info())

In [None]:
# First split: hold out half (p=0.5) of the 'ACQUIRED' edges of the full graph
# as test examples.
#
# keep_connected is passed explicitly as False because keeping the graph
# connected failed with:
#   ValueError: Unable to sample 298 positive edges (could only sample 90
#   positive edges). Consider using smaller value for p or set keep_connected=False
#
# A fixed seed makes the sampled edges reproducible across kernel restarts.
#
# Outputs:
#   G_test           - the graph with the held-out positive edges removed; it is
#                      the graph the training split is drawn from (compare
#                      G_test.info() with st_g.info()).
#   edge_ids_test    - node pairs (source, target) to score.
#   edge_labels_test - 1 where the 'ACQUIRED' edge exists (positive sample),
#                      0 where it does not (negative sample).
edge_splitter_test = EdgeSplitter(st_g)
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.5,
    method="global",
    keep_connected=False,
    edge_label='ACQUIRED',
    seed=42,
)

In [None]:
# Second split: hold out half of the remaining 'ACQUIRED' edges of G_test as
# training examples. G_train is G_test with those edges removed as well.
#
# st_g is supplied as the master graph so that an edge which truly exists in
# the full graph (e.g. one already held out for testing) is never sampled as a
# negative training example. A fixed seed keeps the split reproducible.
edge_splitter_train = EdgeSplitter(G_test, st_g)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.5,
    method="global",
    edge_label='ACQUIRED',
    seed=42,
)

In [None]:
# Sanity check: the edge id array and the label array should line up.
print(*(arr.shape for arr in (edge_ids_test, edge_labels_test)))

In [None]:
# Summary of the reduced graph returned by the test split.
print(G_test.info())

In [None]:
# Summary of the full graph, for comparison with G_test above.
print(st_g.info())

### Creation of the model

In [None]:
# Mini-batch size handed to HinSAGELinkGenerator.
batch_size = 200
# Number of epochs passed to model.fit for the HinSAGE model.
epochs = 20
# Use 70% of edges for training, the rest for testing:
# NOTE(review): train_size / test_size are not referenced anywhere else in the
# visible notebook; the actual splits come from EdgeSplitter with p=0.5, so
# the 70/30 description does not match what is run — confirm and remove or use.
train_size = 0.7
test_size = 0.3
# Forwarded as `workers=` to model.evaluate and model.fit.
# NOTE(review): -1 looks like the joblib/sklearn "all cores" convention;
# confirm that Keras interprets it the way it is intended here.
num_workers = -1
# Neighbour sample sizes given to HinSAGELinkGenerator; the layer-size list is
# asserted to have the same length where the HinSAGE layers are defined.
num_samples=[8, 4]

In [None]:
# Sample neighbourhoods from the reduced graphs returned by EdgeSplitter rather
# than from st_g: the full graph still contains the held-out 'ACQUIRED' edges,
# so sampling from it would expose the very links the model is asked to
# predict (target leakage).
#
# `generator` (built on G_train) is also the one used to construct the HinSAGE
# layers; `test_generator` only serves evaluation batches from G_test.
generator = HinSAGELinkGenerator(
    G_train,
    batch_size=batch_size,
    num_samples=num_samples,
    head_node_types=["Company", "Company"],
)
test_generator = HinSAGELinkGenerator(
    G_test,
    batch_size=batch_size,
    num_samples=num_samples,
    head_node_types=["Company", "Company"],
)

# Training batches are reshuffled; test batches keep their order.
train_gen = generator.flow(edge_ids_train, edge_labels_train, shuffle=True)
test_gen = test_generator.flow(edge_ids_test, edge_labels_test)

In [None]:
# One hidden size per HinSAGE layer; there must be exactly one layer for each
# neighbour-sampling hop configured in num_samples.
hinsage_layer_sizes = [32, 32]
assert len(hinsage_layer_sizes) == len(num_samples)

# Build the HinSAGE layer stack from the link generator, with bias terms and
# no dropout.
hinsage = HinSAGE(
    layer_sizes=hinsage_layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.0,
)

In [None]:
# Input and output tensors of the HinSAGE stack; x_inp becomes the Keras model
# input and x_out feeds the link-scoring head.
x_inp, x_out = hinsage.in_out_tensors()

In [None]:
# Link-scoring head: combine the two node embeddings with the "concat" method
# and apply it to the HinSAGE outputs.
link_scorer = link_regression(edge_embedding_method="concat")
score_prediction = link_scorer(x_out)

In [None]:
import tensorflow.keras.backend as K


def root_mean_square_error(s_true, s_pred):
    """Root-mean-square error between targets and predictions.

    The squared differences are averaged over every element of the tensors
    (no axis is given to the mean) before the square root is taken, so the
    result is a single scalar tensor.
    """
    residual = s_true - s_pred
    mean_squared = K.mean(K.square(residual))
    return K.sqrt(mean_squared)


# Assemble the end-to-end Keras model: HinSAGE inputs -> link score.
model = Model(inputs=x_inp, outputs=score_prediction)

# Train with Adam on mean squared error; RMSE and MAE are tracked as metrics.
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# in tf.keras and rejected by newer optimizer implementations.
model.compile(
    optimizer=optimizers.Adam(learning_rate=1e-2),
    loss=losses.mean_squared_error,
    metrics=[root_mean_square_error, metrics.mae],
)
model.summary()

In [None]:
# Baseline: score the untrained model on the test flow so the effect of
# training can be judged against it.
test_metrics = model.evaluate(
    test_gen,
    verbose=1,
    workers=num_workers,
    use_multiprocessing=True,
)

print("Untrained model's Test Evaluation:")
for metric_name, metric_value in zip(model.metrics_names, test_metrics):
    print(f"\t{metric_name}: {metric_value:0.4f}")

In [None]:
# Train the HinSAGE link model, validating on the test flow after each epoch.
# Keras-level shuffling is off and multiprocessing is disabled for the fit.
history = model.fit(
    train_gen,
    epochs=epochs,
    validation_data=test_gen,
    shuffle=False,
    workers=num_workers,
    use_multiprocessing=False,
    verbose=1,
)

### Prediction with GCN

In [None]:
import stellargraph as sg

from stellargraph.mapper import FullBatchLinkGenerator
from stellargraph.layer import GCN, LinkEmbedding
from stellargraph.mapper import RelationalFullBatchNodeGenerator


from tensorflow import keras
from sklearn import preprocessing, feature_extraction, model_selection

from stellargraph import globalvar
from stellargraph import datasets
from IPython.display import display, HTML
%matplotlib inline

In [None]:
# Re-create BOTH splits for the GCN experiment.
#
# Re-splitting only the test set would leave G_train / edge_ids_train from the
# previous random split, so freshly sampled test edges could still be present
# in G_train or even be training positives. Deriving the training split from
# the new G_test keeps the two sets disjoint. Fixed seeds make the splits
# reproducible across kernel restarts.
#
# keep_connected is passed explicitly as False because keeping the graph
# connected failed with:
#   ValueError: Unable to sample 298 positive edges (could only sample 90
#   positive edges). Consider using smaller value for p or set keep_connected=False
#
# Outputs of each split:
#   G_*           - the input graph with the held-out positive edges removed.
#   edge_ids_*    - node pairs (source, target) to score.
#   edge_labels_* - 1 where the 'ACQUIRED' edge exists (positive sample),
#                   0 where it does not (negative sample).
edge_splitter_test = EdgeSplitter(st_g)
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.5,
    method="global",
    keep_connected=False,
    edge_label='ACQUIRED',
    seed=42,
)

# st_g is supplied as the master graph so that true edges of the full graph
# are never drawn as negative training examples.
edge_splitter_train = EdgeSplitter(G_test, st_g)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.5,
    method="global",
    keep_connected=False,
    edge_label='ACQUIRED',
    seed=42,
)

In [None]:
# Number of epochs for the GCN model (overrides the earlier HinSAGE setting).
epochs = 50
# Full-batch link generator over the training graph, using the "gcn" method.
# Note: from here on train_gen names a generator, whereas in the HinSAGE
# section the same name held a flow.
train_gen = FullBatchLinkGenerator(G_train, method="gcn")
# Flow pairing the training edge ids with their labels.
train_flow = train_gen.flow(edge_ids_train, edge_labels_train)

In [None]:
# Full-batch link generator over the test graph, using the "gcn" method.
test_gen = FullBatchLinkGenerator(G_test, method="gcn")
# Build the test flow from test_gen. The previous code called train_gen.flow
# here, which left test_gen unused and evaluated the test edges against the
# training graph instead of G_test.
test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

In [None]:
# Two GCN layers of width 16 with ReLU activations and 30% dropout, configured
# from the training generator.
gcn = GCN(
    layer_sizes=[16, 16],
    activations=["relu", "relu"],
    generator=train_gen,
    dropout=0.3,
)

In [None]:
# Input and output tensors of the GCN stack; x_inp becomes the Keras model
# input and x_out feeds the link-embedding head.
x_inp, x_out = gcn.in_out_tensors()

In [None]:
# Score each candidate link with the "ip" link-embedding method followed by a
# ReLU activation, then flatten the scores with a (-1,) reshape.
# NOTE(review): a ReLU output is not bounded to [0, 1] while the model is
# compiled with binary cross-entropy — confirm this pairing is intended.
link_embedding = LinkEmbedding(activation="relu", method="ip")
flatten_scores = keras.layers.Reshape((-1,))
prediction = flatten_scores(link_embedding(x_out))

In [None]:
# Assemble the end-to-end Keras model: GCN inputs -> link prediction.
model = keras.Model(inputs=x_inp, outputs=prediction)

# Train with Adam on binary cross-entropy and report accuracy.
# `learning_rate` is the supported argument name; the `lr` alias is deprecated
# in tf.keras and rejected by newer optimizer implementations.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.01),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)

In [None]:
# Baseline metrics of the untrained GCN model on both flows.
init_train_metrics = model.evaluate(train_flow)
init_test_metrics = model.evaluate(test_flow)

# Report each metric set under its own heading.
for heading, metric_values in (
    ("\nTrain Set Metrics of the initial (untrained) model:", init_train_metrics),
    ("\nTest Set Metrics of the initial (untrained) model:", init_test_metrics),
):
    print(heading)
    for name, val in zip(model.metrics_names, metric_values):
        print(f"\t{name}: {val:0.4f}")

In [None]:
# Train the GCN link model for `epochs` epochs, validating on test_flow after
# each one; Keras-level shuffling is off and one log line is printed per epoch.
history = model.fit(
    train_flow,
    validation_data=test_flow,
    epochs=epochs,
    shuffle=False,
    verbose=2,
)

In [None]:
# Plot the training history returned by model.fit.
sg.utils.plot_history(history)

In [None]:
# Metrics of the trained GCN model on both flows.
train_metrics = model.evaluate(train_flow)
test_metrics = model.evaluate(test_flow)

# Report each metric set under its own heading.
for heading, metric_values in (
    ("\nTrain Set Metrics of the trained model:", train_metrics),
    ("\nTest Set Metrics of the trained model:", test_metrics),
):
    print(heading)
    for name, val in zip(model.metrics_names, metric_values):
        print(f"\t{name}: {val:0.4f}")