In [68]:
import json
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LinearRegression

In [78]:
dataset_path = './dataset/git_web_ml'
os.listdir(dataset_path)

['musae_git_edges.csv',
 'citing.txt',
 'pruned_df.csv',
 'README.txt',
 'features.csv',
 'musae_git_features.json',
 'musae_git_target.csv']

In [5]:
nodes = pd.read_csv(os.path.join(dataset_path, 'musae_git_target.csv'))
nodes

Unnamed: 0,id,name,ml_target
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1
...,...,...,...
37695,37695,shawnwanderson,1
37696,37696,kris-ipeh,0
37697,37697,qpautrat,0
37698,37698,Injabie3,1


In [6]:
edges = pd.read_csv(os.path.join(dataset_path, 'musae_git_edges.csv'))
edges

Unnamed: 0,id_1,id_2
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982
...,...,...
288998,37527,37596
288999,37529,37601
289000,37644,2347
289001,25879,2347


In [None]:
coeffs = []

In [86]:
def QAP_linear_regression(nodes, edges, n_rounds):

    coeffs = []

    for i in tqdm(range(n_rounds)):
        ids = np.array(nodes['id']).copy()
        np.random.shuffle(ids)
        nodes['replace_id'] = ids
        replace_mat = np.array(nodes[['id', 'replace_id']])
        replacement = dict()
        for row in replace_mat:
            replacement[row[0]] = row[1]
        temp_edges = edges.copy()
        temp_edges = temp_edges.replace(replacement)  
        temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_1', right_on='id')
        temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_2', right_on='id', suffixes=[None, '_2']) 
        test_df = temp_edges.groupby('id_1').mean()[['ml_target', 'ml_target_2']] 
        ml_target = np.array(test_df['ml_target']).reshape((-1, 1))
        ml_target2_mean = test_df['ml_target_2']
        lr = LinearRegression().fit(ml_target, ml_target2_mean)
        coef = lr.coef_[0]
        coeffs.append(coef)

        mean_coef = np.mean(coeffs)
        std_coef = np.std(coeffs)
    
    return mean_coef, std_coef, coeffs


In [95]:
mean_coef, std_coef, coeffs = QAP_linear_regression(nodes, edges, n_rounds=20)

 70%|███████   | 14/20 [10:09<04:21, 43.54s/it]


KeyboardInterrupt: 

In [None]:
mean_coef

0.0010409176955971332

In [93]:
coeffs

[0.004004838096268154,
 0.00382815013870159,
 0.004914137604000279,
 -0.00459624657213348,
 -0.0029462907888508764]

In [None]:
std_coef

0.003980722893447656

In [97]:
import scipy
z_score = 307.5567491
scipy.stats.norm.sf(abs(z_score))*2

4.599930578470714e-205

In [90]:
temp_edges = edges.copy()
temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_1', right_on='id')
temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_2', right_on='id', suffixes=[None, '_2']) 
test_df = temp_edges.groupby('id_1').mean()[['ml_target', 'ml_target_2']] 
ml_target = np.array(test_df['ml_target']).reshape((-1, 1))
ml_target2_mean = test_df['ml_target_2']
lr = LinearRegression().fit(ml_target, ml_target2_mean)
coef = lr.coef_[0]

In [91]:
coef

0.3881983092000647

In [7]:
ids = np.array(nodes['id']).copy()
np.random.shuffle(ids)
nodes['replace_id'] = ids

In [8]:
nodes

Unnamed: 0,id,name,ml_target,replace_id
0,0,Eiryyy,0,18176
1,1,shawflying,0,25398
2,2,JpMCarrilho,1,32588
3,3,SuhwanCha,0,13988
4,4,sunilangadi2,1,27555
...,...,...,...,...
37695,37695,shawnwanderson,1,20065
37696,37696,kris-ipeh,0,20738
37697,37697,qpautrat,0,22457
37698,37698,Injabie3,1,18071


In [12]:
replace_mat = np.array(nodes[['id', 'replace_id']])

In [13]:
replacement = dict()
for row in replace_mat:
    replacement[row[0]] = row[1]

In [71]:
temp_edges = edges.copy()
temp_edges = temp_edges.replace(replacement)

In [72]:
temp_edges

Unnamed: 0,id_1,id_2
0,6433,4169
1,26553,14971
2,26553,35434
3,26553,12617
4,26553,34061
...,...,...
288998,5477,33511
288999,6596,21294
289000,35899,7360
289001,20289,7360


In [19]:
nodes

Unnamed: 0,id,name,ml_target,replace_id
0,0,Eiryyy,0,6433
1,1,shawflying,0,26553
2,2,JpMCarrilho,1,30541
3,3,SuhwanCha,0,3195
4,4,sunilangadi2,1,36748
...,...,...,...,...
37695,37695,shawnwanderson,1,9041
37696,37696,kris-ipeh,0,12668
37697,37697,qpautrat,0,4893
37698,37698,Injabie3,1,34588


In [18]:
edges

Unnamed: 0,id_1,id_2
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982
...,...,...
288998,37527,37596
288999,37529,37601
289000,37644,2347
289001,25879,2347


In [73]:
temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_1', right_on='id')
temp_edges = temp_edges.merge(nodes[['id', 'ml_target']], how='left', left_on='id_2', right_on='id', suffixes=[None, '_2'])

In [74]:
temp_edges

Unnamed: 0,id_1,id_2,id,ml_target,id_2.1,ml_target_2
0,6433,4169,6433,0,4169,0
1,26553,14971,26553,0,14971,0
2,26553,35434,26553,0,35434,0
3,26553,12617,26553,0,12617,0
4,26553,34061,26553,0,34061,0
...,...,...,...,...,...,...
288998,5477,33511,5477,0,33511,0
288999,6596,21294,6596,0,21294,0
289000,35899,7360,35899,1,7360,0
289001,20289,7360,20289,0,7360,0


In [75]:
test_df = temp_edges.groupby('id_1').mean()[['ml_target', 'ml_target_2']]
test_df

Unnamed: 0_level_0,ml_target,ml_target_2
id_1,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.0,0.000000
1,0.0,0.000000
2,1.0,1.000000
3,0.0,0.000000
4,1.0,1.000000
...,...,...
37693,1.0,0.333333
37694,0.0,0.000000
37697,0.0,0.153846
37698,1.0,0.285714


In [76]:
ml_target = np.array(test_df['ml_target']).reshape((-1, 1))
ml_target2_mean = test_df['ml_target_2']

In [80]:
lr = LinearRegression().fit(ml_target, ml_target2_mean)
lr.coef_[0]

0.000509603049221263

In [16]:
np.array(edges)

array([[    0, 23977],
       [    1, 34526],
       [    1,  2370],
       ...,
       [37644,  2347],
       [25879,  2347],
       [25616,  2347]])

In [9]:
edges

Unnamed: 0,id_1,id_2
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982
...,...,...
288998,37527,37596
288999,37529,37601
289000,37644,2347
289001,25879,2347


In [10]:
def replace_id(id):
    return nodes[nodes['id']==id]['replace_id']


edges['id_1_new'] = edges['id_1'].apply(lambda x: replace_id(x))
edges['id_2_new'] = edges['id_2'].apply(lambda x: replace_id(x))

: 

: 

In [21]:
ids[:10]

array([23153, 12985, 23833, 15664, 15274, 24363,  6029, 13330,  4126,
       31885])

In [22]:
nodes

Unnamed: 0,id,name,ml_target
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1
...,...,...,...
37695,37695,shawnwanderson,1
37696,37696,kris-ipeh,0
37697,37697,qpautrat,0
37698,37698,Injabie3,1


In [24]:
permuted_edges = edges.copy()


In [25]:
permuted_edges

Unnamed: 0,id_1,id_2
0,0,23977
1,1,34526
2,1,2370
3,1,14683
4,1,29982
...,...,...
288998,37527,37596
288999,37529,37601
289000,37644,2347
289001,25879,2347
