In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [3]:
# load dataset
data = pd.read_csv('data/rolx_dataset_weighted.csv', index_col = 0)

In [4]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [5]:
X = data.copy()
X = X.drop(['label'], axis = 1)
X.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst86,dst87,dst88,dst89,dst90,dst91,dst92,dst93,dst94,dst95
483374,1.0,1.0,1.0,1.0,17.0,23.0,18.0,24.0,38.0,94.0,...,703743.0,2698113.0,2153333.0,4602628.0,5547.0,23350.0,120147.0,345094.0,145640.0,281336.0
260038,130.0,303.0,1883.0,16869.0,67824.0,192299.0,549.692308,1736.415385,14521.061538,63249.7,...,6853282.0,44151214.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
164333,339.0,613.0,4417.0,29358.0,72623.0,201223.0,239.286136,764.973451,5955.230088,28465.817109,...,6853282.0,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
45949,4601.0,11182.0,102665.0,310542.0,145456.0,257796.0,75.241252,188.589002,1427.821561,8933.609867,...,6817566.0,44294540.0,124978835.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
156319,11.0,12.0,28.0,83.0,4389.0,10825.0,403.090909,998.090909,8386.181818,37040.272727,...,4724218.0,25338648.0,48861782.0,129472982.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
112782,907.0,2183.0,18790.0,90321.0,113966.0,267898.0,166.084895,492.124587,3876.971334,19737.265711,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
29629,5547.0,23350.0,120147.0,345094.0,132390.0,229673.0,66.186587,161.620876,1235.49342,7959.476113,...,6853282.0,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
342049,1.0,1.0,1.0,1.0,62.0,63.0,63.0,64.0,77.0,305.0,...,370.0,3008.0,32695.0,109899.0,56.0,78.0,369.0,3007.0,32694.0,109897.0
573541,1.0,1.0,1.0,1.0,1.0,2.0,2.0,3.0,2.0,3.0,...,703743.0,2698113.0,2153333.0,4602628.0,5547.0,23350.0,120147.0,345094.0,145640.0,281336.0
77653,2483.0,9271.0,69282.0,220269.0,162133.0,318744.0,120.102296,302.058397,2360.316955,14728.89408,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0


In [6]:
y = data['label']
y.sample(10)

83989     5
443410    0
100581    1
230731    1
615982    0
580794    0
24395     1
69141     1
120717    2
117515    1
Name: label, dtype: int64

In [7]:
# 90/10 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(551708, 192)
(551708,)
(61301, 192)
(61301,)


In [8]:
linreg = LinearRegression(normalize = True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [9]:
y_train_pred = linreg.predict(X_train)
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

RMSE: 0.9707
Mean absolute error: 0.3575
R^2: 0.3793


In [10]:
y_test_pred = linreg.predict(X_test)
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

RMSE: 0.9671
Mean absolute error: 0.3563
R^2: 0.3817


In [None]:
def load_inference_data(filename, n_examples):
    inference_data = pd.read_csv(filename, index_col = 0)
    inference_data = inference_data.sample(n_examples)
    inference_node_pairs = inference_data[['src_id', 'dst_id']]
    inference_features = inference_data.copy()
    inference_features.drop(['src_id', 'dst_id'], axis = 1, inplace = True)
    return inference_node_pairs, inference_features

In [None]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/rolx_inference_weighted.csv', 200000)

In [None]:
inference_preds = linreg.predict(inference_features)
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

In [None]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    with open(filename, 'w') as out_file:
        for i, pred in enumerate(inference_preds):
            if pred > 0:
                src = inference_node_pairs.iloc[i, 0]
                dst = inference_node_pairs.iloc[i, 1]
                weight = pred
                line = str(src) + ', ' + str(dst) + ', ' + str(weight) + '\n'
                out_file.write(line)

In [None]:
output_new_edges(inference_preds, inference_node_pairs, 'linreg', 'rolx')