In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import math

In [None]:
# load model
model = load_model('models/rolx_256+256+128+32__01-10.14236.hdf5')

In [None]:
# compile model
model.compile(optimizer='adam', loss=MeanSquaredError())

In [None]:
# load dataset
data = pd.read_csv('data/rolx_dataset_weighted.csv', index_col = 0)

In [None]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [None]:
y = data['label']
y.sample(10)

In [None]:
data.drop(['label'], axis = 1, inplace = True)

In [None]:
# hold out 10% of the rows as a test set; the seed is fixed so the split is repeatable
split_parts = train_test_split(data, y, test_size=0.10, random_state=42)
X_train, X_test, y_train, y_test = split_parts
# convert the feature frames to NumPy arrays
X_train, X_test = X_train.to_numpy(), X_test.to_numpy()
# report the shape of each piece: train features, train labels, test features, test labels
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
y_train_pred = model.predict(X_train, verbose = 0)

In [None]:
y_train_pred = [pred for sublist in y_train_pred for pred in sublist]
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

In [None]:
y_test_pred = model.predict(X_test, verbose = 0)

In [None]:
y_test_pred = [pred for sublist in y_test_pred for pred in sublist]
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

In [None]:
def load_inference_data(filename, n_examples, random_state=None):
    """Load a random subset of candidate node pairs together with their features.

    Parameters
    ----------
    filename : str or path-like
        CSV file read with its first column as the index. It must contain
        ``src_id`` and ``dst_id`` columns; every other column is treated as a
        feature.
    n_examples : int or None
        Number of rows to sample. A value larger than the number of rows in
        the file is capped at the file size instead of raising ``ValueError``.
        ``None`` is forwarded to ``DataFrame.sample`` unchanged.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. Leave as ``None`` (the
        default) for a different sample on every call; pass an integer for a
        reproducible sample.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(node_pairs, features)``: the ``src_id``/``dst_id`` columns of the
        sampled rows, and the same rows with those two columns removed. Both
        frames share the same index and row order.
    """
    inference_data = pd.read_csv(filename, index_col=0)
    if n_examples is not None:
        # sample() refuses to draw more rows than exist; cap the request so a
        # small input file does not abort the whole run
        n_examples = min(n_examples, len(inference_data))
    sampled = inference_data.sample(n_examples, random_state=random_state)
    id_columns = ['src_id', 'dst_id']
    # drop() returns a new frame, so no explicit copy is needed
    return sampled[id_columns], sampled.drop(columns=id_columns)

In [None]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/rolx_inference_weighted.csv', 300000)

In [None]:
inference_preds = model.predict(inference_features.to_numpy())

In [None]:
inference_preds = [pred for sublist in inference_preds for pred in sublist]
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

In [None]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    """Write every node pair with a positive predicted weight to a text file.

    The output file is ``data/<model_type>_<embedding_type>_edges_to_add.csv``
    (opened in write mode, so an existing file is overwritten). Each line has
    the form ``src, dst, weight`` and there is no header row.

    Parameters
    ----------
    inference_preds : iterable of numbers
        Predicted weight for each node pair; entries that are not ``> 0`` are
        skipped.
    inference_node_pairs : pandas.DataFrame
        Frame whose first column holds the source id and whose second column
        holds the destination id, positionally aligned with
        ``inference_preds``.
    model_type : str
        First component of the output file name.
    embedding_type : str
        Second component of the output file name.

    Raises
    ------
    IndexError
        If a positive prediction sits at a position beyond the last row of
        ``inference_node_pairs``.
    """
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    # Pull both id columns out once: a scalar DataFrame.iloc lookup per row is
    # far slower than indexing a NumPy array.
    src_ids = inference_node_pairs.iloc[:, 0].to_numpy()
    dst_ids = inference_node_pairs.iloc[:, 1].to_numpy()
    with open(filename, 'w') as out_file:
        out_file.writelines(
            ', '.join((str(src_ids[i]), str(dst_ids[i]), str(pred))) + '\n'
            for i, pred in enumerate(inference_preds)
            if pred > 0
        )

In [None]:
# write the positively-weighted predicted edges to data/dense_nn_rolx_edges_to_add.csv
output_new_edges(
    inference_preds,
    inference_node_pairs,
    model_type='dense_nn',
    embedding_type='rolx',
)