In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [6]:
# load dataset
data = pd.read_csv('data/node2vec_basic_dataset_weighted.csv', index_col = 0)

In [13]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [14]:
X = data.copy()
X = X.drop(['label'], axis = 1)
X.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst54,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63
506028,-0.502074,-0.205431,-0.2081,0.454686,0.558753,0.273764,0.261005,0.093908,-0.180036,0.440083,...,-2.555779,-1.764843,1.44444,0.798545,-0.34448,0.389152,-0.82719,1.773951,-1.260498,0.759518
389600,-0.685313,0.178513,-0.385685,0.576294,0.920263,-0.301337,-0.068909,0.047137,0.684309,-0.021873,...,-0.38493,-0.40374,0.53531,0.523825,0.635128,-0.070574,0.391891,-0.069345,-0.873973,-0.376768
272270,1.91686,-0.006106,-1.368155,-0.656563,3.63715,-0.417407,3.363787,-3.915681,3.472558,-1.81251,...,-2.33101,-2.50098,1.651505,-0.559541,0.170048,-0.580766,1.535017,2.165002,0.699544,1.274908
158517,0.117559,2.069915,-0.24798,0.025814,4.703812,-0.900471,4.334816,-2.250279,1.456761,0.944309,...,4.370479,0.304916,-4.363379,0.95707,-1.649037,-1.420114,-2.292599,2.15679,0.185897,2.327306
457451,-0.667499,0.257804,-0.278962,-0.251559,0.635071,-0.860158,0.571181,-0.241247,0.66128,-0.354348,...,-1.265548,-0.599124,0.574271,0.750123,-0.259373,-0.003319,0.752094,1.217232,-0.81157,-0.102391
96139,-0.600717,-1.355594,-3.499774,-1.740187,3.45636,2.107548,-2.242504,0.438141,-3.232942,2.70044,...,-0.090972,-0.336486,-1.543155,1.158376,1.028706,0.543339,-2.789739,0.340252,-3.80889,1.205908
393986,-1.327985,-0.040829,-0.581102,-0.383013,1.350631,-0.546643,-0.487392,-1.593578,0.695566,-0.845451,...,0.273516,-0.761409,0.202237,0.151441,0.09201,0.163899,0.102748,0.572147,0.383779,-0.780366
163223,-3.82371,0.422956,0.624361,2.614134,-1.458366,-0.552552,1.392234,-0.959714,-0.611547,-2.7281,...,-0.134211,2.763403,-2.816531,-0.212345,0.477211,-0.036607,-1.14283,2.886069,3.983221,-2.567622
167541,3.316294,4.231339,7.027232,1.828667,-2.06513,-0.231957,1.679622,-0.966612,0.61843,-0.138694,...,1.372003,-1.514764,0.374591,-0.120298,-0.874889,-2.179856,1.663809,0.661233,0.614403,0.522464
429385,-0.586006,-0.122511,-0.781875,0.513499,0.727583,-1.156577,0.468368,0.344901,0.872234,0.080436,...,-0.660583,-0.08753,0.295456,-0.113256,0.880455,-0.011184,-0.670374,-0.402418,0.886615,-0.066361


In [15]:
y = data['label']
y.sample(10)

194959    1
391942    0
114952    1
498665    0
552175    0
605493    0
171529    1
471205    0
328106    0
316976    0
Name: label, dtype: int64

In [16]:
# 90/10 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(551708, 128)
(551708,)
(61301, 128)
(61301,)


In [17]:
linreg = LinearRegression(normalize = True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [18]:
y_train_pred = linreg.predict(X_train)
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

Mean squared error: 1.0851
Mean absolute error: 0.4519
R^2: 0.2851


In [19]:
y_test_pred = linreg.predict(X_test)
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

Mean squared error: 1.0770
Mean absolute error: 0.4497
R^2: 0.2881


In [20]:
def load_inference_data(filename, n_examples):
    inference_data = pd.read_csv(filename, index_col = 0)
    inference_data = inference_data.sample(n_examples)
    inference_node_pairs = inference_data[['src_id', 'dst_id']]
    inference_features = inference_data.copy()
    inference_features.drop(['src_id', 'dst_id'], axis = 1, inplace = True)
    return inference_node_pairs, inference_features

In [24]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/node2vec_medium_inference_weighted.csv', 200000)

In [25]:
inference_preds = linreg.predict(inference_features)
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

39558


In [26]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    with open(filename, 'w') as out_file:
        for i, pred in enumerate(inference_preds):
            if pred > 0:
                src = inference_node_pairs.iloc[i, 0]
                dst = inference_node_pairs.iloc[i, 1]
                weight = pred
                line = str(src) + ', ' + str(dst) + ', ' + str(weight) + '\n'
                out_file.write(line)

In [27]:
output_new_edges(inference_preds, inference_node_pairs, 'linreg', 'node2vec')