In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import json

In [2]:
# load dataset
data = pd.read_csv('data/node2vec_256dim.csv', index_col = 0)

In [3]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [4]:
X = data.copy()
X = X.drop(['label'], axis = 1)
X.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst246,dst247,dst248,dst249,dst250,dst251,dst252,dst253,dst254,dst255
221103,-3.254808,-0.498184,3.198521,-1.190839,-3.01221,0.009669,-0.42123,-2.25153,-0.662277,-0.626224,...,-0.774978,-0.447902,-0.908105,0.250546,-0.191604,0.518216,0.155106,0.873243,-0.026432,-0.277466
523652,0.170902,-0.218113,0.016552,0.00631,0.202078,-0.024601,0.008458,-0.102188,0.033927,-0.033609,...,0.09982,-0.121982,-0.050491,0.016778,0.235254,-0.001645,0.052328,-0.07404,-0.033732,-0.288014
370794,0.436548,-0.126295,-0.04526,-0.276206,0.002729,-0.258838,0.31484,-0.183727,0.047459,0.05118,...,0.435408,-0.117129,-0.071275,0.019739,1.388287,-0.491617,0.202765,0.06795,-0.518105,-0.493507
496352,0.84396,-0.330069,-0.009983,0.438544,-0.544002,-0.593822,-0.150877,-0.032528,-0.039162,-0.028734,...,0.246172,0.026093,0.141927,-0.015538,0.248966,-0.084439,0.092085,-0.103995,0.035007,-0.148845
569795,0.211372,-0.106784,-0.078711,0.02599,0.057513,-0.156326,0.106247,0.014855,-0.116341,0.011993,...,0.484098,-0.059525,0.570453,0.265557,0.227615,-0.118201,0.169167,-0.496936,-0.07364,-0.679223
438574,0.178657,-0.215074,-0.065799,0.075445,0.130172,-0.130337,-0.025253,-0.17381,0.058743,0.166362,...,0.148691,-0.024842,0.178727,-0.006497,0.243646,-0.020237,0.034233,-0.043674,0.032533,-0.113293
31120,-4.164678,1.148429,2.251239,-3.709203,0.519988,-0.533266,1.339565,-5.726871,-2.239186,2.15018,...,-3.068601,0.912485,-1.934673,1.274509,-2.11382,1.166511,0.088604,0.462798,-2.66717,4.667828
526735,0.185417,-0.084352,0.021086,0.124117,0.182684,-0.068912,-0.075919,0.163429,-0.168111,-0.002331,...,0.173529,-0.024489,0.051078,-0.083436,0.240122,0.065199,0.016465,-0.043011,0.131117,-0.175045
146119,0.926954,2.293888,0.968753,0.418305,-0.449054,3.590363,-0.154546,0.122409,-0.653065,0.269008,...,0.148905,0.056297,0.416846,-0.093204,0.315081,0.240407,0.198337,0.165624,0.167382,-0.475275
520589,-0.004386,0.025302,-0.064063,-0.165468,0.227208,-0.272434,0.001118,0.117424,0.085334,-0.174269,...,0.250207,0.014396,0.267165,0.093874,0.070336,0.004023,0.10266,0.091159,0.165374,-0.14484


In [5]:
y = data['label']
y.sample(10)

409668    0
77258     1
22589     1
198794    1
523433    0
75745     1
19735     3
130811    1
106105    1
364395    0
Name: label, dtype: int64

In [6]:
# 90/10 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(546274, 512)
(546274,)
(60698, 512)
(60698,)


In [7]:
linreg = LinearRegression(normalize = True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [8]:
y_train_pred = linreg.predict(X_train)
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

RMSE: 0.9188
Mean absolute error: 0.3703
R^2: 0.3807


In [9]:
y_test_pred = linreg.predict(X_test)
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

RMSE: 0.9158
Mean absolute error: 0.3660
R^2: 0.3739


In [10]:
def load_inference_data(filename, n_examples):
    inference_data = pd.read_csv(filename, index_col = 0)
    inference_data = inference_data.sample(n_examples)
    inference_node_pairs = inference_data[['src_id', 'dst_id']]
    inference_features = inference_data.copy()
    inference_features.drop(['src_id', 'dst_id'], axis = 1, inplace = True)
    return inference_node_pairs, inference_features

In [13]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/node2vec_256dim_inference.csv', 300000)

In [14]:
inference_preds = linreg.predict(inference_features)
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

52014


In [15]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    with open(filename, 'w') as out_file:
        for i, pred in enumerate(inference_preds):
            if pred > 0:
                src = inference_node_pairs.iloc[i, 0]
                dst = inference_node_pairs.iloc[i, 1]
                weight = pred
                line = str(src) + ', ' + str(dst) + ', ' + str(weight) + '\n'
                out_file.write(line)

In [16]:
output_new_edges(inference_preds, inference_node_pairs, 'linreg', 'node2vec-256')