In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import math
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [2]:
# load dataset
data = pd.read_csv('data/rolx_dataset_weighted.csv', index_col = 0)

In [3]:
# drop outliers - has significant affect on linear regression results
q = data["label"].quantile(0.99)
data = data[data["label"] < q]

In [4]:
X = data.copy()
X = X.drop(['label'], axis = 1)
X.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst86,dst87,dst88,dst89,dst90,dst91,dst92,dst93,dst94,dst95
543404,1.0,1.0,1.0,1.0,481.0,1203.0,482.0,1204.0,13159.0,86030.0,...,7235.0,49867.0,262186.0,823556.0,385.0,1139.0,2513.0,21137.0,59030.0,179815.0
322350,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,...,1118480.0,5389134.0,9744784.0,25982765.0,3530.0,19896.0,110785.0,333575.0,162133.0,351372.0
611612,3.0,4.0,4.0,7.0,316.0,768.0,107.0,259.333333,2091.0,20544.0,...,53.0,247.0,16498.0,53483.0,14.0,18.0,41.0,222.0,15930.0,52305.0
599029,1.0,1.0,1.0,1.0,29.0,35.0,30.0,36.0,37.0,46.0,...,4070983.0,21377826.0,38896213.0,102350313.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
108408,1603.0,5093.0,41011.0,158271.0,137132.0,285794.0,135.71491,372.578291,3052.271366,17863.301934,...,6853282.0,44151214.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
183545,617.0,1585.0,16583.0,88360.0,112645.0,265723.0,235.322528,714.518639,5932.528363,31457.878444,...,6853282.0,44151214.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
135301,653.0,2048.0,22466.0,126459.0,120495.0,250078.0,252.333844,767.148545,6552.686064,34635.750383,...,4278904.0,22617145.0,36786948.0,95186163.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
412517,1.0,1.0,1.0,1.0,2.0,3.0,3.0,4.0,3.0,4.0,...,4230167.0,22044005.0,40087059.0,105080261.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
290675,289.0,510.0,3687.0,32508.0,76500.0,209865.0,289.221453,949.380623,7698.761246,33777.529412,...,6817566.0,44294540.0,120648599.0,367510458.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
550438,1.0,1.0,1.0,1.0,176.0,340.0,177.0,341.0,1799.0,7913.0,...,50448.0,318289.0,608953.0,1621328.0,702.0,1126.0,22028.0,122210.0,129970.0,280115.0


In [5]:
y = data['label']
y.sample(10)

39740     1
286723    1
274669    1
427599    0
616368    0
130503    1
48829     1
94206     1
14589     1
341416    0
Name: label, dtype: int64

In [6]:
# 90/10 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(551708, 192)
(551708,)
(61301, 192)
(61301,)


In [7]:
linreg = LinearRegression(normalize = True)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [8]:
y_train_pred = linreg.predict(X_train)
y_train_pred = [max(int(round(x)), 0) for x in y_train_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_train, y_train_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_train, y_train_pred))
print('R^2: %.4f' % r2_score(y_train, y_train_pred))

RMSE: 0.9707
Mean absolute error: 0.3575
R^2: 0.3793


In [9]:
y_test_pred = linreg.predict(X_test)
y_test_pred = [max(int(round(x)), 0) for x in y_test_pred]
print("RMSE: %.4f" % math.sqrt(mean_squared_error(y_test, y_test_pred)))
print("Mean absolute error: %.4f" % mean_absolute_error(y_test, y_test_pred))
print('R^2: %.4f' % r2_score(y_test, y_test_pred))

RMSE: 0.9671
Mean absolute error: 0.3563
R^2: 0.3817


In [10]:
linreg.coef_

array([ 5.24009263e+05, -7.52555441e+05, -1.04801852e+06,  1.50511088e+06,
       -5.24009262e+05,  7.52555441e+05,  3.76131913e+07,  1.07446711e+07,
        5.01374001e+06,  1.03383583e+06,  2.18906605e+06, -1.06025358e+05,
        3.94758558e+05, -2.39059277e+05, -3.28213225e+04,  5.46024168e+02,
        5.16475116e+02, -1.14131249e+02,  6.37341569e+07, -4.49843333e+06,
        1.45612420e+06,  1.13736050e+06, -9.86662069e+05,  5.60039761e+05,
       -4.16859961e+07, -1.11851113e+07,  3.13186951e+06, -1.52955538e+05,
        1.88373871e+06,  5.46465502e+05, -1.42049936e-04,  7.35848299e-05,
       -2.06548462e-05,  6.84533625e-06, -8.85313207e-06,  1.82401536e-06,
       -4.07280476e+06, -4.40440144e+05, -3.30973333e-07, -8.75074256e-09,
       -1.16026386e-08,  3.95082388e-09,  1.87707817e-05, -1.53540791e-05,
       -8.13845811e-06,  3.63887348e-06, -2.85582001e-06, -4.76730870e-07,
        1.28451173e+05, -5.13356639e+05,  3.44203834e+04, -8.25073137e+02,
        2.83055338e+02, -

In [None]:
def load_inference_data(filename, n_examples):
    inference_data = pd.read_csv(filename, index_col = 0)
    inference_data = inference_data.sample(n_examples)
    inference_node_pairs = inference_data[['src_id', 'dst_id']]
    inference_features = inference_data.copy()
    inference_features.drop(['src_id', 'dst_id'], axis = 1, inplace = True)
    return inference_node_pairs, inference_features

In [None]:
# load inference dataset
inference_node_pairs, inference_features = load_inference_data('data/rolx_inference_weighted.csv', 200000)

In [None]:
inference_preds = linreg.predict(inference_features)
inference_preds = [max(int(round(x)), 0) for x in inference_preds]
print(np.count_nonzero(inference_preds))

In [None]:
def output_new_edges(inference_preds, inference_node_pairs, model_type, embedding_type):
    filename = 'data/' + model_type + '_' + embedding_type + '_edges_to_add.csv'
    with open(filename, 'w') as out_file:
        for i, pred in enumerate(inference_preds):
            if pred > 0:
                src = inference_node_pairs.iloc[i, 0]
                dst = inference_node_pairs.iloc[i, 1]
                weight = pred
                line = str(src) + ', ' + str(dst) + ', ' + str(weight) + '\n'
                out_file.write(line)

In [None]:
output_new_edges(inference_preds, inference_node_pairs, 'linreg', 'rolx')