In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# load dataset
data = pd.read_csv('data/node2vec_basic_dataset_weighted.csv', index_col = 0)

In [10]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [11]:
X = data.copy()
X = X.drop(['label'], axis = 1)
X.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst54,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63
357869,0.353947,0.045458,-0.863391,0.01554,2.039922,-0.788093,0.393259,-0.169335,0.158642,0.352419,...,-0.896955,-0.567042,0.117213,-1.085843,-0.651343,-0.83247,-0.041895,-0.109675,0.464498,0.154978
205507,0.372998,0.820999,-2.747492,-4.149252,-0.68367,0.234671,2.43007,0.360904,-1.105515,2.928489,...,-5.370278,1.280953,0.094613,3.313851,-3.311223,-0.392349,2.094127,2.406776,-0.319784,-0.522241
146631,-0.69067,2.202327,1.972172,-1.561773,-5.092409,-0.775415,2.302926,-3.505743,-2.987507,-1.123894,...,-0.377747,3.494834,1.396146,-0.73798,-0.587583,1.481168,1.932055,3.714919,1.826901,-0.56758
458367,-0.496807,-0.968589,0.232325,1.935612,2.075796,-1.240723,0.638791,0.020361,1.254244,-0.555374,...,-0.950597,-0.083846,0.586788,-0.215317,0.531944,-0.713144,0.02562,0.675346,0.461003,-0.107559
361521,-0.324965,-0.968163,-1.380978,0.693898,0.552619,-1.986601,0.275122,0.134651,-0.078765,1.034552,...,-0.926666,-0.183561,1.105045,-0.306646,-0.451525,0.42908,0.210777,-0.454139,0.629716,-0.337949
618844,-0.74809,-0.417237,-0.609376,1.642442,0.233111,-0.586439,-0.121848,-0.211719,1.4413,-0.353703,...,-0.602379,0.262495,0.60504,-0.176019,-0.303891,-0.10112,0.282546,0.381851,0.109775,-0.045297
321745,-0.3076,-0.126435,-0.146606,1.302217,0.790816,-0.617441,1.177446,-0.076079,1.483339,0.908127,...,0.075993,-0.719784,-0.49045,-0.364203,-0.450459,-0.201075,0.113033,0.349342,0.357567,-0.569648
593774,-0.634269,0.087062,-0.513776,0.571317,0.511587,-0.414295,0.173243,-0.093609,-0.885778,0.548452,...,0.200027,-2.69176,1.139984,-1.165509,-0.540855,0.636551,0.696925,0.002045,1.21163,-0.312936
164238,0.080761,-5.890927,-2.979596,-3.772892,0.882317,3.409544,-2.441024,4.300789,-2.12859,-5.122968,...,0.258591,-0.088224,0.157203,0.340123,-0.720626,0.107826,-0.300277,0.366346,0.098355,-0.07443
215755,0.976661,1.35605,0.343535,-0.943591,3.725766,2.001515,3.039148,-3.155875,-1.053759,0.590571,...,-0.368315,-0.428269,-2.32373,2.031532,-2.541317,0.706544,0.098949,0.049183,1.426595,-4.156734


In [12]:
y = data['label']
y.sample(10)

276670    1
441364    0
331683    0
594282    0
399266    0
388055    0
59812     1
511788    0
434761    0
348018    0
Name: label, dtype: int64

In [13]:
# 90/10 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(551708, 128)
(551708,)
(61301, 128)
(61301,)


In [14]:
linreg = LinearRegression(normalize = True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [15]:
y_train_pred = linreg.predict(X_train)
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("Mean squared error: %.4f" % mean_squared_error(y_train, y_train_pred))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

Mean squared error: 1.0851
Mean absolute error: 0.4519
R^2: 0.2851


In [16]:
y_test_pred = linreg.predict(X_test)
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("Mean squared error: %.4f" % mean_squared_error(y_test, y_test_pred))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

Mean squared error: 1.0770
Mean absolute error: 0.4497
R^2: 0.2881


In [231]:
def load_inference_data(filename, n_examples):
    inference_data = pd.read_csv(filename, index_col = 0)
    inference_data = inference_data.sample(n_examples)
    inference_node_pairs = inference_data[['src_id', 'dst_id']]
    inference_features = inference_data.copy()
    inference_features.drop(['src_id', 'dst_id'], axis = 1, inplace = True)
    return inference_node_pairs, inference_features

In [236]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/rolx_inference_weighted.csv', 200000)

In [239]:
inference_preds = linreg.predict(inference_features)
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

62618


In [240]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    with open(filename, 'w') as out_file:
        for i, pred in enumerate(inference_preds):
            if pred > 0:
                src = inference_node_pairs.iloc[i, 0]
                dst = inference_node_pairs.iloc[i, 1]
                weight = pred
                line = str(src) + ', ' + str(dst) + ', ' + str(weight) + '\n'
                out_file.write(line)

In [241]:
output_new_edges(inference_preds, inference_node_pairs, 'linreg', 'rolx')