In [None]:
import csv
import snap
import sys
import numpy as np

In [None]:
# Load the saved user-user TNEANet and derive two lookup tables from it:
#   usernames_to_nids: value of the 'username' node attribute -> node ID
#   nids_to_usernames: node ID -> username (the inverse of the table above)
uunet = snap.TNEANet.Load(snap.TFIn('output/usernets/user_tneanet_nodelete.graph'))

usernames_to_nids = dict(
    (uunet.GetStrAttrDatN(node, 'username'), node.GetId())
    for node in uunet.Nodes()
)
nids_to_usernames = dict(
    (nid, username) for (username, nid) in usernames_to_nids.iteritems()
)

# The graph was only needed to build the tables, so drop the reference to it.
del uunet

In [None]:
# Parse network feature files and map usernames to network features.
# user_network_features maps username -> {column name: string value read from
# the TSV}. Every known username starts with an empty dict, so users that have
# no row in the file keep an empty entry.
user_network_features = {username: {} for username in usernames_to_nids}

# The with-block closes the file even when a row fails (non-integer node_id,
# or a node ID that is missing from nids_to_usernames).
with open('output/network_features_user_tneanet_nodelete_jan2012.tsv') as network_basic:
    nbreader = csv.DictReader(network_basic, delimiter='\t')
    for (i, row) in enumerate(nbreader):
        features = dict(row)
        # node_id is the join key, not a feature, so it is not stored
        nid = int(features.pop('node_id'))
        user_network_features[nids_to_usernames[nid]].update(features)
        # Progress indicator
        if i % 100000 == 0:
            print(i)

In [None]:
# Load the node2vec embedding files.
# user_n2v_features maps embedding name -> {username: float64 vector}.
# Every line is parsed as "<node_id> <v1> <v2> ...".
# NOTE(review): node2vec tools commonly write a "<num_nodes> <dim>" header as
# the first line; nothing here skips one -- confirm these .emb files have none.
node2vec = {'node2vec_homophily': open('output/reddit_emb/uu_tneanet_p1_q0.5.emb'),
            'node2vec_structure': open('output/reddit_emb/uu_tneanet_p0.5_q1.emb')}

user_n2v_features = {}
try:
    for (prefix, n2vfile) in node2vec.iteritems():
        print(prefix)
        vectors = user_n2v_features[prefix] = {}
        for (j, line) in enumerate(n2vfile):
            entries = line.split()
            nid = int(entries[0])
            vectors[nids_to_usernames[nid]] = np.array(entries[1:], dtype=np.float64)
            # Progress indicator
            if j % 100000 == 0:
                print(j)
finally:
    # Close every handle, including any that were never read because an
    # earlier file raised part-way through.
    for handle in node2vec.values():
        handle.close()

In [None]:
# Merge network features with basic and language features.
# Each input row is looked up by its 'Author' column, extended with that user's
# network features and node2vec vectors, and written to the combined TSV.
# Rows whose author is missing from user_network_features or from any of the
# embeddings are skipped.

# Fixed embedding order so the output column order is deterministic.
n2v_prefixes = sorted(user_n2v_features)

# Take the network column names from the first user that actually has features:
# users without a row in the network feature file hold an empty dict, and
# picking one of those would leave the header without any network columns.
network_feature_names = next(
    (list(user_network_features[u].keys())
     for u in user_network_features if user_network_features[u]),
    [])

# One column per embedding dimension, named '<prefix>_<index>'. The dimension
# is read off one loaded vector of each embedding.
for prefix in n2v_prefixes:
    vectors = user_n2v_features[prefix]
    sample_user = next(iter(vectors), None)
    nfeatures = len(vectors[sample_user]) if sample_user is not None else 0
    network_feature_names.extend([prefix + '_' + str(k) for k in range(nfeatures)])

# Both files are closed by the with-block even if a row fails to be written.
with open('output/basic_and_language_nodelete.tsv') as basic_language_file, \
        open('output/basic_language_network_nodelete.tsv', 'w') as full_file:
    bl_reader = csv.DictReader(basic_language_file, delimiter='\t')

    # The last two input columns stay the last two columns of the merged file;
    # the network columns are inserted in front of them.
    bl_feature_names = bl_reader.fieldnames[:-2]
    output_names = bl_reader.fieldnames[-2:]

    full_writer = csv.DictWriter(full_file, delimiter='\t',
                                 fieldnames=bl_feature_names + network_feature_names + output_names)
    full_writer.writeheader()

    for (i, row) in enumerate(bl_reader):
        username = row['Author']
        # Require the user in every embedding, not just one, because each
        # embedding is indexed below.
        has_all_vectors = all(username in user_n2v_features[p] for p in n2v_prefixes)
        if username in user_network_features and has_all_vectors:
            row.update(user_network_features[username])
            for prefix in n2v_prefixes:
                vec = user_n2v_features[prefix][username]
                row.update({prefix + '_' + str(j): vec[j] for j in range(len(vec))})
            full_writer.writerow(row)

        # Progress indicator
        if i % 10000 == 0:
            print(i)
print('Done')