In [95]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import heapq
from tqdm import tqdm

In [96]:
# Dataset under study and the embedding methods whose neighbourhoods are compared.
dataset = 'stackexchange-chemistry'
methods = ['CHAWE', 'HAWE', 'hin2vec', 'node2bits', 'deepWalk', 'struc2vec']
# Alternative method list kept for reference:
#methods = ['HAWE','hin2vec','node2bits','deepWalk','struc2vec','GraphSTONE']

# userInfo.txt is read as an integer table of id pairs. Column 0 is the id that,
# prefixed with 'U', keys the embedding rows; column 1 is looked up (as a
# string) against the Id attribute of Users.xml by the later cells.
userDoubleIds = np.loadtxt('../dataset/{}/userInfo.txt'.format(dataset), dtype=int)
# Forward (column 0 -> column 1) and reverse (column 1 -> column 0) lookups.
userNid2Oid = {nid: oid for nid, oid in userDoubleIds}
userOid2Nid = {oid: nid for nid, oid in userDoubleIds}

In [97]:
# Collect reputation statistics for every <row> element in Users.xml, keyed by
# the row's Id attribute. All values are kept as the strings found in the XML.
users = dict()
# Give ElementTree the path instead of a text-mode file handle: the parser then
# reads raw bytes and honours the encoding declared in the XML itself, rather
# than whatever the platform's locale codec happens to be. ET.parse also opens
# and closes the file on its own, so no `with` block is needed.
tree = ET.parse('../dataset/{}/Users.xml'.format(dataset))
root = tree.getroot()
for row in root:
    userinfo = row.attrib
    # Plain indexing (not pop) so the parsed tree is left unmodified; a missing
    # attribute still raises KeyError.
    userOid = userinfo['Id']
    users[userOid] = {
        'reputation': userinfo['Reputation'],
        'upvotes': userinfo['UpVotes'],
        'downvotes': userinfo['DownVotes'],
    }
print(len(users))

9323


In [100]:
def _nearest_user_ids(node_ids, matrix, k):
    """Return, for every row, the ids of its ``k`` closest other rows.

    Closeness is the Euclidean distance between rows of ``matrix``.

    Parameters
    ----------
    node_ids : list
        Identifier of each row of ``matrix``, in row order.
    matrix : numpy.ndarray
        2-D array holding one embedding per row.
    k : int
        Maximum number of neighbours to keep per row.

    Returns
    -------
    dict
        Maps each id to a list of neighbour ids, nearest first. A row is never
        listed as its own neighbour, and a list is shorter than ``k`` when
        fewer than ``k`` other rows exist. If the same id appears on several
        rows, the entry computed for the last such row is the one kept.
    """
    neighbours = {}
    for i in tqdm(range(len(node_ids))):
        # Distance from row i to every row, in a single vectorised call.
        dists = np.linalg.norm(matrix - matrix[i], axis=1)
        # Stable sort: equally distant rows stay in ascending row order.
        order = np.argsort(dists, kind='stable')
        # Drop the row itself by position (not by distance), then keep k.
        order = order[order != i][:k]
        neighbours[node_ids[i]] = [node_ids[j] for j in order]
    return neighbours


# For each method, map every user id to its K nearest users in embedding space.
results = {}
K = 5
for method in methods:
    # The first line of the file is skipped; fields are separated by single spaces.
    df = pd.read_csv("../embedding/{}-{}.embeddings".format(method, dataset), skiprows=1, header=None, sep=' ')
    # Keep rows whose id (column 0) contains 'U'.
    # NOTE(review): this is a substring match, not a prefix match - presumably
    # only user nodes carry a 'U'; confirm against the embedding files.
    embeddings = df.loc[df[0].str.contains('U')].reset_index(drop=True)
    # Everything except the id column is the embedding vector.
    embed_matrix = embeddings.drop(columns=[df.columns[0]]).to_numpy(dtype=float)
    print(method, embed_matrix.shape)
    results[method] = _nearest_user_ids(embeddings[0].tolist(), embed_matrix, K)

  1%|          | 6/804 [00:00<00:15, 52.24it/s]

CHAWE (804, 128)


100%|██████████| 804/804 [00:16<00:00, 49.49it/s]
  5%|▌         | 22/402 [00:00<00:03, 107.97it/s]

HAWE (402, 128)


100%|██████████| 402/402 [00:04<00:00, 96.75it/s] 
  5%|▌         | 22/402 [00:00<00:03, 107.97it/s]

hin2vec (402, 128)


100%|██████████| 402/402 [00:04<00:00, 97.00it/s] 
  3%|▎         | 13/402 [00:00<00:03, 121.14it/s]

node2bits (402, 128)


100%|██████████| 402/402 [00:03<00:00, 107.02it/s]
  5%|▌         | 22/402 [00:00<00:03, 105.88it/s]

deepWalk (402, 128)


100%|██████████| 402/402 [00:06<00:00, 60.64it/s] 
  1%|▏         | 6/402 [00:00<00:06, 58.92it/s]

struc2vec (402, 128)


100%|██████████| 402/402 [00:04<00:00, 84.13it/s] 


In [101]:
# Write a plain-text report: one paragraph per user in userNid2Oid, giving the
# user's own statistics followed by the statistics of each neighbour that every
# method returned for that user. Paragraphs are separated by a blank line.
with open('{}-results.txt'.format(dataset), 'w', encoding='utf-8') as rf:
    for new_id, orig_id in userNid2Oid.items():
        # Embedding rows are keyed by 'U' + id; Users.xml entries by the string Id.
        target_key = 'U' + str(new_id)
        target_oid = str(orig_id)
        target = users[target_oid]
        rf.write('Target user OID-{}: reputation-{}, upvote-{}, downvote-{}.\n'.format(target_oid, target['reputation'], target['upvotes'], target['downvotes']))
        for method in methods:
            for neighbour_key in results[method][target_key]:
                # Strip the leading 'U' to get back to the numeric id, then map
                # it to the id used in Users.xml.
                neighbour_oid = str(userNid2Oid[int(neighbour_key[1:])])
                neighbour = users[neighbour_oid]
                rf.write('{}-- user OID-{}: reputation-{}, upvote-{}, downvote-{}.\n'.format(method, neighbour_oid, neighbour['reputation'], neighbour['upvotes'], neighbour['downvotes']))

        rf.write('\n')