In [6]:
import snap
import csv
import pandas as pd
import numpy as np
import networkx as nx

In [11]:
# Read the header-less weighted edge list, naming its three columns on load
weights = pd.read_csv(
    'data/reddit_nodes_weighted_full.csv',
    header=None,
    names=['src', 'dst', 'weight'],
)

In [13]:
# Turn weights into dict (bidirectional): every edge is stored under both
# (src, dst) and (dst, src) so a lookup works whichever way the edge is given.
# The three columns are walked in lockstep as plain arrays instead of doing
# three `.iloc[i, j]` scalar lookups per row, which is far slower on a large
# edge list. Keys, values, their scalar types and the insertion order (so
# "later row wins" on duplicates) are the same as with the row-by-row loop.
weights_dict = {}
for src, dst, weight in zip(
    weights.iloc[:, 0].values,
    weights.iloc[:, 1].values,
    weights.iloc[:, 2].values,
):
    weights_dict[(src, dst)] = weight
    weights_dict[(dst, src)] = weight

In [16]:
# Build the undirected, weighted networkx graph from the edge-list CSV
df = pd.read_csv(
    'data/reddit_nodes_weighted_full.csv',
    header=None,
    names=['source', 'target', 'weight'],
)
G = nx.from_pandas_edgelist(
    df,
    source='source',
    target='target',
    edge_attr='weight',
    create_using=nx.Graph(),
)

In [49]:
def getBasicFeatureVectors(Graph, weights_dict):
    ''' Compute 6 structural features for every node of Graph.

        Features, in order:
            1) degree
            2) weighted degree (sum of the 'weight' attribute on incident edges)
            3) number of edges in egonet
            4) weight of edges in egonet
            5) number of edges going out of egonet
            6) weight of edges going out of egonet

        Graph        -- networkx graph whose edges carry a 'weight' attribute
        weights_dict -- maps (u, v) tuples to edge weights; it is looked up with
                        the orientation networkx yields, so both directions of
                        every edge must be present
        Returns a dict mapping node id -> list of the 6 features above.
        Prints a progress line every 1000 nodes.
    '''
    vectors = {}
    for processed, (node_id, nbrsdict) in enumerate(Graph.adjacency(), start=1):
        # Egonet = the node plus its neighbors (nx.ego_graph with default arguments)
        ego = nx.ego_graph(Graph, node_id)
        ego_members = ego.nodes()
        ego_edge_weights = [weights_dict[edge] for edge in ego.edges()]

        # Edges from a neighbor to any node outside the egonet, in traversal order
        crossing_weights = [
            weights_dict[(nbr, outside)]
            for nbr in nbrsdict
            for outside in Graph.neighbors(nbr)
            if outside not in ego_members
        ]

        vectors[node_id] = [
            len(nbrsdict),
            sum(eattr['weight'] for eattr in nbrsdict.values()),
            len(ego_edge_weights),
            sum(ego_edge_weights),
            len(crossing_weights),
            sum(crossing_weights),
        ]

        if processed % 1000 == 0:
            print('processed', processed, 'nodes')
    return vectors

In [50]:
# Compute the 6 basic features for every node of G, then report how many
# nodes received a feature vector
vectors = getBasicFeatureVectors(G, weights_dict)
print(len(vectors))

processed 1000 nodes
processed 2000 nodes
processed 3000 nodes
processed 4000 nodes
processed 5000 nodes
processed 6000 nodes
processed 7000 nodes
processed 8000 nodes
processed 9000 nodes
processed 10000 nodes
processed 11000 nodes
processed 12000 nodes
processed 13000 nodes
processed 14000 nodes
processed 15000 nodes
processed 16000 nodes
processed 17000 nodes
processed 18000 nodes
processed 19000 nodes
processed 20000 nodes
processed 21000 nodes
processed 22000 nodes
processed 23000 nodes
processed 24000 nodes
processed 25000 nodes
processed 26000 nodes
processed 27000 nodes
processed 28000 nodes
processed 29000 nodes
processed 30000 nodes
processed 31000 nodes
processed 32000 nodes
processed 33000 nodes
processed 34000 nodes
processed 35000 nodes
processed 36000 nodes
processed 37000 nodes
processed 38000 nodes
processed 39000 nodes
processed 40000 nodes
processed 41000 nodes
processed 42000 nodes
processed 43000 nodes
processed 44000 nodes
processed 45000 nodes
processed 46000 nod

In [51]:
# Display the node id -> [6 basic features] mapping (the full dict is shown)
vectors

{38971: [3, 3, 3, 3, 94, 194],
 80654: [1, 1, 1, 1, 2, 2],
 89991: [409, 942, 5452, 28123, 70670, 203172],
 58847: [117, 291, 1171, 12405, 47400, 152883],
 79969: [431, 1088, 6049, 35342, 76331, 213699],
 66802: [566, 3187, 15099, 77301, 122978, 288371],
 72627: [33, 40, 198, 1691, 19190, 68572],
 73001: [767, 1656, 21179, 111555, 126711, 274313],
 60698: [59, 78, 396, 2047, 27349, 73108],
 67506: [5, 5, 10, 31, 6559, 21543],
 71534: [11, 48, 25, 211, 1722, 5107],
 83674: [747, 2960, 14294, 88205, 110341, 254933],
 59210: [156, 915, 3063, 16069, 78890, 218080],
 86748: [1070, 2903, 31332, 142243, 133354, 274626],
 80305: [388, 818, 13722, 100278, 118448, 259409],
 78685: [3401, 12346, 85926, 271428, 141957, 259390],
 59177: [3389, 15890, 108629, 322218, 148508, 261679],
 62406: [271, 545, 6112, 37464, 67594, 189856],
 57148: [19, 20, 64, 876, 14863, 54615],
 67617: [3248, 10377, 86178, 278473, 142495, 255871],
 70967: [19, 28, 101, 719, 26538, 83740],
 56953: [434, 595, 11130, 70676, 1

In [86]:
def getRecursiveFeatureVectors(Graph, basic_feature_vectors, k):
    '''
    Recursive feature generation
        k = number of iterations
        4 types of aggregation:
            1) no aggregation - feature vector from previous iteration
            2) mean aggregation - mean of neighbor feature vectors
            3) sum aggregation - sum of neighbor feature vectors
            4) max aggregation - elementwise max of neighbor feature vectors
        4 aggregations are concatenated to form the new feature vector for a node,
        so every iteration multiplies the vector length by 4.

    Graph                 -- graph exposing nodes() and neighbors(node_id)
    basic_feature_vectors -- dict: node id -> sequence of numeric features;
                             must contain every node of Graph (not modified)
    k                     -- number of iterations; with k == 0 the input dict
                             itself is returned

    Returns a dict: node id -> list of features after k iterations.
    A node without neighbors gets zeros for its mean, sum and max parts.
    '''
    fvs = [{} for _ in range(k + 1)]
    fvs[0] = basic_feature_vectors

    for i in range(1, k + 1):
        prev_level = fvs[i - 1]
        for node_id in Graph.nodes():
            prev_fv = np.asarray(prev_level[node_id])
            dim = len(prev_fv)
            sum_fv = np.zeros(dim)
            # Seeded from the first neighbor rather than from zeros, so an
            # all-negative neighborhood is not clamped to 0.
            max_fv = None
            num_neighbors = 0
            # Use the Graph argument (not a notebook-level global) for adjacency.
            for nbr in Graph.neighbors(node_id):
                neighbor_fv = np.asarray(prev_level[nbr])
                sum_fv += neighbor_fv
                if max_fv is None:
                    max_fv = neighbor_fv.astype(float)
                else:
                    max_fv = np.maximum(max_fv, neighbor_fv)
                num_neighbors += 1

            if num_neighbors:
                avg_fv = sum_fv / float(num_neighbors)
            else:
                # Isolated node: avoid 0/0 -> NaN and fall back to zeros.
                avg_fv = np.zeros(dim)
                max_fv = np.zeros(dim)

            fvs[i][node_id] = list(prev_fv) + list(avg_fv) + list(sum_fv) + list(max_fv)
    return fvs[k]

In [91]:
# Run 2 rounds of recursive aggregation; each round concatenates 4 vectors of
# the previous length, so the 6 basic features grow to 6 * 4 * 4 = 96
recursive_vectors = getRecursiveFeatureVectors(G, vectors, 2)

In [92]:
# Spot-check one node (id 62467): its recursive feature vector and its length
print(recursive_vectors[62467])
print(len(recursive_vectors[62467]))

[1457.0, 3999.0, 45717.0, 190049.0, 142649.0, 269683.0, 159.66094715168154, 443.2271791352093, 3697.312971859986, 20855.190116678106, 40598.88057652711, 109357.48730267673, 232626.0, 645782.0, 5386985.0, 30386012.0, 59152569.0, 159333859.0, 5547.0, 23350.0, 120147.0, 345094.0, 162133.0, 351372.0, 159.66094715168154, 443.2271791352093, 3697.312971859986, 20855.190116678106, 40598.88057652711, 109357.48730267673, 679.8375559019117, 2161.391199739856, 19085.946177034533, 80296.75988642625, 78131.77407931344, 174578.12183493006, 47833.8455730954, 150624.64035689773, 1220114.6479066575, 5830554.489361702, 8446995.471516816, 21473779.968428276, 4077.525051475635, 16569.782429649964, 96178.67398764585, 296261.2100205903, 151842.10157858612, 303394.4996568291, 232626.0, 645782.0, 5386985.0, 30386012.0, 59152569.0, 159333859.0, 990523.3189490854, 3149146.9780209702, 27808223.579939313, 116992379.15452304, 113837994.83355969, 254360323.51349312, 69693913.0, 219460101.0, 1777707042.0, 8495117891.

In [93]:
# NOTE: recursive_vectors is a dict, so np.save writes it as a pickled 0-d
# object array. To read it back on recent NumPy versions use
# np.load('data/rolx_embeddings.npy', allow_pickle=True).item()
np.save('data/rolx_embeddings.npy', recursive_vectors)