In [201]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [196]:
# Load the dataset from CSV; the file's first column becomes the row index.
data = pd.read_csv(
    'data/rolx_dataset_weighted.csv',
    index_col=0,
)

In [197]:
# Drop outliers: extreme label values have a significant effect on the
# linear-regression results, so only rows whose label is strictly below the
# 99th-percentile label value are kept.
# NOTE: `data` is overwritten and `q` is recomputed from the current frame, so
# re-running this cell without reloading the CSV trims the tail a second time.
q = data.loc[:, "label"].quantile(0.99)
data = data.loc[data["label"].lt(q)]

In [198]:
# Feature matrix: every column of `data` except the target column 'label'.
# drop() returns a new frame, so `data` itself is left untouched.
X = data.drop(columns=['label'])
# Preview 10 randomly chosen rows (unseeded, so the rows differ between runs).
X.sample(n=10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst86,dst87,dst88,dst89,dst90,dst91,dst92,dst93,dst94,dst95
305467,18.0,19.0,43.0,107.0,7351.0,20510.0,412.166667,1150.277778,9336.055556,43754.388889,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
519305,1.0,1.0,1.0,1.0,3.0,3.0,4.0,4.0,4.0,4.0,...,619641.0,2367269.0,2406765.0,5566465.0,5547.0,23350.0,120147.0,345094.0,162133.0,318744.0
455611,1.0,1.0,1.0,1.0,32.0,36.0,33.0,37.0,124.0,547.0,...,124877.0,545463.0,1001044.0,2620221.0,2483.0,9271.0,69282.0,220269.0,162133.0,318744.0
309349,5.0,5.0,5.0,5.0,16.0,16.0,4.2,4.2,10.0,27.0,...,50.0,135.0,11203.0,27684.0,11.0,11.0,37.0,115.0,5850.0,13943.0
299996,7.0,7.0,19.0,47.0,603.0,1530.0,90.571429,231.0,1322.0,7819.142857,...,2659174.0,12814649.0,20693778.0,52868058.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
71787,518.0,1564.0,7423.0,44604.0,89288.0,233936.0,200.030888,620.810811,5004.312741,24117.747104,...,6242005.0,39393107.0,108690379.0,316581164.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
142864,143.0,313.0,3156.0,32536.0,75803.0,210217.0,573.230769,1922.909091,15566.167832,69071.055944,...,6853282.0,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
287296,13.0,14.0,30.0,70.0,3931.0,9225.0,306.0,719.307692,6380.538462,25851.692308,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
255894,154.0,306.0,2929.0,28256.0,71685.0,196797.0,502.525974,1642.876623,12515.116883,55775.642857,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
574102,1.0,1.0,1.0,1.0,177.0,490.0,178.0,491.0,4216.0,41091.0,...,4687.0,38900.0,92415.0,247739.0,284.0,442.0,4633.0,38689.0,83533.0,225410.0


In [199]:
# Regression target: the 'label' column of `data`.
y = data.loc[:, 'label']
# Preview 10 randomly chosen target values (unseeded).
y.sample(n=10)

263502    1
359984    0
462871    0
535780    0
354833    0
232491    1
602818    0
6817      1
494234    0
98944     1
Name: label, dtype: int64

In [200]:
# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

# Report the shape of each piece, in the order: train X, train y, test X, test y.
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(551708, 192)
(551708,)
(61301, 192)
(61301,)


In [202]:
# Fit a linear regression model on the training split.
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this constructor call raises a TypeError on current
# releases — confirm the scikit-learn version this notebook is pinned to before
# re-running.
linreg = LinearRegression(normalize = True)
# Last expression of the cell: fit() returns the estimator, whose repr is displayed.
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [209]:
# Score the model on the training split. Each raw prediction is passed through
# round(), converted to int and clipped at zero before the metrics are computed.
y_train_pred = [max(int(round(raw)), 0) for raw in linreg.predict(X_train)]
for template, metric in (
    ("Mean squared error: %.4f", mean_squared_error),
    ("Mean absolute error: %.4f", mean_absolute_error),
    ('R^2: %.4f', r2_score),
):
    print(template % metric(y_train, y_train_pred))

Mean squared error: 0.9422
Mean absolute error: 0.3575
R^2: 0.3793


In [210]:
# Score the model on the held-out test split, using the same post-processing as
# for the training split: round(), convert to int, clip at zero.
y_test_pred = [max(int(round(raw)), 0) for raw in linreg.predict(X_test)]
for template, metric in (
    ("Mean squared error: %.4f", mean_squared_error),
    ("Mean absolute error: %.4f", mean_absolute_error),
    ('R^2: %.4f', r2_score),
):
    print(template % metric(y_test, y_test_pred))

Mean squared error: 0.9354
Mean absolute error: 0.3563
R^2: 0.3817


In [231]:
def load_inference_data(filename, n_examples, random_state=None):
    """Read a CSV of candidate node pairs and return a random sample of it.

    The CSV's first column is used as the row index. The remaining columns
    must include 'src_id' and 'dst_id'; every other column is returned as a
    feature column.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file to read.
    n_examples : int
        Number of rows to draw (without replacement). pandas raises
        ValueError when this exceeds the number of rows in the file.
    random_state : int, numpy.random.RandomState or None, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer to make the sample (and
        everything derived from it) reproducible.

    Returns
    -------
    inference_node_pairs : pandas.DataFrame
        The 'src_id' and 'dst_id' columns of the sampled rows.
    inference_features : pandas.DataFrame
        The sampled rows with 'src_id' and 'dst_id' removed. Rows are in the
        same order, and carry the same index, as ``inference_node_pairs``.
    """
    id_columns = ['src_id', 'dst_id']
    inference_data = pd.read_csv(filename, index_col=0)
    sampled = inference_data.sample(n_examples, random_state=random_state)
    inference_node_pairs = sampled[id_columns]
    # drop() returns a new frame, so no defensive copy / in-place mutation is needed.
    inference_features = sampled.drop(columns=id_columns)
    return inference_node_pairs, inference_features

In [236]:
# Draw 200,000 rows from the inference CSV, split into node-pair ids and features.
inference_node_pairs, inference_features = load_inference_data(
    filename='data/rolx_inference_weighted.csv',
    n_examples=200000,
)

In [239]:
# Predict on the sampled inference rows, then apply round(), int conversion and
# clipping at zero to each prediction.
inference_preds = [max(int(round(raw)), 0) for raw in linreg.predict(inference_features)]
# Number of sampled pairs whose post-processed prediction is non-zero.
print(np.count_nonzero(inference_preds))

62618


In [240]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    """Write every positively-predicted node pair to an edge-list file.

    The output path is 'data/<model_type>_<embedding_type>_edges_to_add.csv',
    relative to the current working directory; the file is opened in write
    mode, so existing content is replaced.

    Parameters
    ----------
    inference_preds : iterable of numbers
        One prediction per node pair; matched to ``inference_node_pairs`` by
        position (the i-th prediction belongs to the i-th row).
    inference_node_pairs : pandas.DataFrame
        Source id in the first column, destination id in the second.
    model_type, embedding_type : str
        Name fragments used to build the output file name.

    Each prediction greater than zero produces one line of the form
    '<src>, <dst>, <weight>', where the weight is the prediction itself.
    Predictions that are not greater than zero are skipped. Returns None.
    """
    filename = ''.join(['data/', model_type, '_', embedding_type, '_edges_to_add.csv'])
    with open(filename, 'w') as out_file:
        for row, weight in enumerate(inference_preds):
            if not weight > 0:
                continue
            # Ids are looked up lazily, only for rows that are actually written.
            src, dst = (inference_node_pairs.iloc[row, col] for col in (0, 1))
            out_file.write(', '.join(map(str, (src, dst, weight))) + '\n')

In [241]:
# Write the pairs with a positive prediction to 'data/linreg_rolx_edges_to_add.csv'.
output_new_edges(
    inference_preds,
    inference_node_pairs,
    model_type='linreg',
    embedding_type='rolx',
)