In [3]:
import networkx as nx
import numpy as np
import pandas as pd
import random

In [4]:
def load_embeddings(filename):
    """Load a ``.npy`` file and return the single Python object it wraps.

    Parameters
    ----------
    filename : str or path-like
        Path of the ``.npy`` file to read.

    Returns
    -------
    object
        Whatever ``ndarray.item()`` yields for the loaded array.
    """
    # NOTE: allow_pickle=True lets NumPy unpickle arbitrary objects, so this
    # should only be pointed at files from a trusted source.
    stored = np.load(filename, allow_pickle=True)
    return stored.item()

In [5]:
# Load the node embeddings. The .npy file is expected to hold a dict; the
# cells below index it by node id.
# Alternative embedding file, kept so the experiment can be switched:
#embeddings = load_embeddings('data/rolx_embeddings.npy')
embeddings = load_embeddings('data/node2vec_medium.npy')

In [6]:
# Take the embedding width from any stored vector rather than assuming that a
# node with id 0 exists in the dict.
embedding_dim = len(next(iter(embeddings.values())))
print(embedding_dim)

64


In [7]:
def get_weights_dict(filename):
    """Read a headerless three-column edge CSV into a symmetric weight lookup.

    Parameters
    ----------
    filename : str or path-like
        CSV file whose rows are ``src, dst, weight`` with no header line.

    Returns
    -------
    dict
        Maps both ``(src, dst)`` and ``(dst, src)`` to the row's weight.
        If a pair occurs on several rows, the last row wins.

    Raises
    ------
    ValueError
        If the file does not have exactly three columns.
    """
    weights = pd.read_csv(filename, header = None)
    weights.columns = ['src', 'dst', 'weight']

    weights_dict = {}
    # Walk the three columns in lockstep; this avoids three scalar .iloc
    # lookups per row, which dominate the runtime on large edge lists.
    for src, dst, weight in zip(weights['src'], weights['dst'], weights['weight']):
        # Store both orientations so callers need not normalise pair order.
        weights_dict[(src, dst)] = weight
        weights_dict[(dst, src)] = weight
    return weights_dict

In [8]:
# Build the (src, dst) -> weight lookup, with both orientations of every edge.
weights_dict = get_weights_dict('data/reddit_nodes_weighted_full.csv')

In [9]:
# Load graph into networkx (weighted, undirected)
def load_graph(filename):
    """Build an undirected NetworkX graph from a headerless edge-list CSV.

    Parameters
    ----------
    filename : str or path-like
        CSV file whose rows are ``source, target, weight`` with no header line.

    Returns
    -------
    networkx.Graph
        Graph with one edge per CSV row; the third column is stored as the
        ``weight`` edge attribute.
    """
    edge_table = pd.read_csv(
        filename,
        header=None,
        names=['source', 'target', 'weight'],
    )
    return nx.from_pandas_edgelist(
        edge_table,
        source='source',
        target='target',
        edge_attr='weight',
        create_using=nx.Graph(),
    )

In [10]:
# Build the graph from the same edge-list file that weights_dict was read from.
G = load_graph('data/reddit_nodes_weighted_full.csv')

In [11]:
# generate positive examples of edges
def get_positive_examples(G, embeddings, weights_dict):
    """Build one feature row per edge of ``G``.

    Parameters
    ----------
    G : graph-like
        Object whose ``edges()`` yields ``(src, dst)`` pairs.
    embeddings : mapping
        Node id -> iterable of embedding components.
    weights_dict : mapping
        ``(src, dst)`` -> edge weight.

    Returns
    -------
    list of list
        Each row is the source embedding, then the destination embedding,
        then the edge weight as the label.
    """
    return [
        [*embeddings[u], *embeddings[v], weights_dict[(u, v)]]  # label = edge weight
        for u, v in G.edges()
    ]

In [35]:
# generate negative examples
def get_negative_examples(G, embeddings, negative_edges):
    """Build one feature row per negative (non-edge) node pair.

    Parameters
    ----------
    G : object
        Not used; retained so existing call sites keep working.
    embeddings : mapping
        Node id -> iterable of embedding components.
    negative_edges : iterable
        Sequence of pairs; element 0 is the source id, element 1 the
        destination id.

    Returns
    -------
    list of list
        Each row is the source embedding, then the destination embedding,
        then a label of ``0``.
    """
    neg_examples = []
    for edge in negative_edges:
        src, dst = edge[0], edge[1]
        neg_examples.append([*embeddings[src], *embeddings[dst], 0])  # label = 0
    return neg_examples

In [13]:
# Build one labelled row per graph edge and report how many were produced.
pos_examples = get_positive_examples(G, embeddings, weights_dict)
num_pos_examples = len(pos_examples)
print(num_pos_examples)

309667


In [29]:
# Load the pre-sampled negative edges. Per the original note these are pairs of
# nodes with no edge between them that are more than 5 hops apart; the sampling
# code itself is not part of this notebook.
negative_edges = np.load('data/negative_sample_edges.npy')

In [36]:
# Build one zero-labelled row per negative pair and report how many were produced.
neg_examples = get_negative_examples(G, embeddings, negative_edges)
num_neg_examples = len(neg_examples)
print(num_neg_examples)

309582


In [28]:
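# One-off export of the sampled negative edges, left disabled. `edges_used` was
# produced by the negative-sampling step, which is not part of this notebook.
#np.save('data/negative_sample_edges.npy', list(edges_used))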

In [15]:
# Concatenate the rows: all positive examples first, then all negative examples.
all_examples = pos_examples + neg_examples

In [16]:
# Assemble the train/test frame: one column per source-embedding component,
# one per destination-embedding component, then the label.
cols = [
    f'{prefix}{i}'
    for prefix in ('src', 'dst')
    for i in range(embedding_dim)
]
cols.append('label')
df = pd.DataFrame(data=all_examples, columns=cols)

In [17]:
# Preview a random handful of rows. The frame was built from a plain list, so
# it already has a default 0..n-1 index; the previous bare `df.reset_index()`
# returned a copy that was discarded and has been removed.
df.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63,label
295960,-0.236166,-0.465299,-4.032543,2.472753,-1.216998,-0.009433,9.886761,3.489938,-3.886166,-6.238655,...,-8.512048,-1.953421,-2.549924,3.461018,-7.01413,-8.471388,9.46799,-2.233382,10.167744,1
219039,-1.724976,0.116733,-6.506008,1.830798,-3.850633,-0.438971,11.009248,0.773132,-2.111652,-7.90241,...,-0.372824,-2.138546,-3.970612,-2.282519,-1.827538,5.600901,-2.691704,-5.320232,2.601571,1
425500,-2.01889,4.027038,0.548854,-0.364006,4.514221,-0.713183,2.076096,0.48316,2.743756,2.92295,...,1.676554,3.264465,-4.343042,7.054236,0.972955,0.547813,-2.591967,0.82511,-2.269313,0
478227,-1.517908,3.140068,0.878181,-3.136993,2.148531,-2.530984,4.644461,3.707418,2.846356,-0.247416,...,-0.212772,-1.402514,-0.705278,0.832873,1.015038,3.605969,0.317975,-3.977448,4.000094,0
340328,-1.012127,-2.574144,-0.659064,1.644482,-3.393647,-3.596241,2.45106,-1.348409,-0.039455,-1.947072,...,-3.241239,1.578903,2.441231,-2.691512,3.6093,3.49226,0.616422,-1.061361,-5.493214,0
343738,-1.739662,0.511935,4.352522,-0.418794,-1.814202,-3.984229,-3.231863,6.083848,1.978868,2.278765,...,-2.445725,-0.219901,-5.927732,2.099749,-1.636073,-2.611033,3.279548,-2.856349,4.905,0
82126,4.533829,-0.371335,5.811753,-4.420107,-0.786574,5.703921,-4.946699,9.687774,1.400762,1.765668,...,-1.96913,1.978541,-1.834425,-1.333841,-1.73198,-0.584734,-15.342148,3.961429,1.399222,1
445736,-0.415641,0.966904,0.092451,5.22538,1.057568,2.046107,-1.682768,-4.169355,-0.968579,-4.022402,...,-1.580998,-1.493285,-2.775271,3.169362,2.266104,0.998904,2.092844,1.584754,-1.4856,0
562481,-0.436923,-1.565495,2.241223,-0.291333,0.013716,-0.27256,-1.930388,-5.353038,0.358029,-1.651978,...,1.560747,2.133404,-3.302388,5.685205,-1.152344,-0.660244,-1.871567,-4.331575,6.206403,0
444718,5.551247,-2.468978,-0.969497,7.496026,-1.244565,-2.743198,5.888679,1.036376,-3.632004,0.459293,...,-1.317767,-8.50076,0.856205,2.020444,0.637203,-1.916205,1.048695,-3.608567,-2.76483,0


In [18]:
# Sanity check: rows = positives + negatives, columns = 2 * embedding_dim + 1 (label).
df.shape

(619334, 129)

In [19]:
# Persist the labelled dataset. to_csv is called with its defaults, so the row
# index is written as an extra, unnamed leading column.
# Alternative output name for the RolX embeddings:
#df.to_csv('data/rolx_dataset_weighted.csv')
df.to_csv('data/node2vec_medium_dataset_weighted.csv')

In [20]:
# generate inference examples
def get_inference_examples(G, embeddings, edges_used, num_examples = 500000, attempts = 1000000):
    """Randomly sample unconnected node pairs to score at inference time.

    A candidate pair is kept only if the two nodes differ, ``G`` has no edge
    between them, and the pair does not appear in ``edges_used`` in either
    orientation. Pairs are treated as unordered to match the undirected graph
    this notebook builds. Sampling uses the global ``random`` state and stops
    after ``num_examples`` rows or ``attempts`` draws, whichever comes first.
    The same pair may be drawn more than once.

    Parameters
    ----------
    G : graph-like
        Object providing ``nodes()`` and ``has_edge(u, v)``.
    embeddings : mapping
        Node id -> iterable of embedding components.
    edges_used : iterable
        Pairs to exclude (element 0 and element 1 of each item are the node
        ids), e.g. the pairs already used as negative training examples.
    num_examples : int
        Maximum number of rows to return.
    attempts : int
        Maximum number of random draws.

    Returns
    -------
    list of list
        Each row is ``[src, dst]``, then the source embedding, then the
        destination embedding. May be shorter than ``num_examples``.
    """
    node_list = list(G.nodes())
    if len(node_list) < 2:
        # No pair of distinct nodes can be formed.
        return []

    # Normalise the exclusions into a set holding both orientations: lookups
    # become O(1) regardless of the container passed in, and (a, b) also
    # blocks (b, a).
    excluded = set()
    for edge in edges_used:
        a, b = edge[0], edge[1]
        excluded.add((a, b))
        excluded.add((b, a))

    inference_examples = []
    for _ in range(attempts):
        if len(inference_examples) >= num_examples:
            break
        src, dst = random.choices(node_list, k = 2)
        # random.choices draws with replacement, so the same node can come up
        # twice; a node paired with itself is not a candidate link.
        if src == dst:
            continue
        if G.has_edge(src, dst):
            continue
        if (src, dst) in excluded:
            continue
        inference_examples.append([src, dst, *embeddings[src], *embeddings[dst]])
    return inference_examples

In [21]:
# `edges_used` is the set of node pairs already consumed as negative training
# examples. Rebuild it from `negative_edges` (the array those pairs were saved
# to) so this cell no longer depends on a variable left over in the kernel.
# Assumes each row of `negative_edges` is one (src, dst) pair, as
# get_negative_examples reads it.
edges_used = {(edge[0], edge[1]) for edge in negative_edges.tolist()}
inference_examples = get_inference_examples(G, embeddings, edges_used)
print(len(inference_examples))

500000


In [22]:
# Assemble the inference frame: the two node ids first, then one column per
# source-embedding component and one per destination-embedding component.
cols = ['src_id', 'dst_id']
cols.extend(
    f'{prefix}{i}'
    for prefix in ('src', 'dst')
    for i in range(embedding_dim)
)
inference_df = pd.DataFrame(data=inference_examples, columns=cols)

In [23]:
# Preview a random handful of inference rows.
inference_df.sample(10)

Unnamed: 0,src_id,dst_id,src0,src1,src2,src3,src4,src5,src6,src7,...,dst54,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63
469224,23621,23213,1.700582,-3.824462,4.568372,-1.64473,0.368803,0.18483,0.111686,-0.234048,...,0.090107,3.95625,-2.192251,-3.356577,-2.030847,0.542401,-1.715207,0.779977,1.38272,-1.348533
495317,21944,86144,2.793049,1.459795,-3.822434,-1.250064,-1.121313,1.412901,3.538615,-2.029258,...,-0.842012,-0.55461,0.924177,1.662095,-2.801678,-1.211419,8.25174,0.076858,-0.307756,-0.074094
375549,63756,18630,0.224098,2.634256,-0.167823,3.384231,-8.524166,4.212944,-2.900369,-2.610011,...,-0.635437,-5.282114,3.261333,-9.181658,0.165313,2.671446,-2.034106,0.344411,1.290408,-3.415366
331433,59750,80083,0.207872,-4.598179,-4.791091,9.491179,-6.238302,-1.275949,-2.760428,-5.194577,...,-2.514327,0.409046,-0.882909,-0.29001,1.623112,2.96934,-1.406433,2.315613,-2.135646,-2.050828
30766,44119,89866,0.39936,-3.002097,-2.666371,2.230812,3.273831,1.264989,0.648883,0.110587,...,4.898437,-0.904974,1.726354,7.679121,4.640111,4.63271,1.883651,1.058766,-3.90679,7.527781
85424,87418,13574,6.320447,-1.408843,2.867861,3.54637,-4.447623,-3.553438,3.995579,-0.688433,...,-0.290201,1.090348,0.006539,-1.653402,1.085158,4.349091,-1.727649,-0.138359,1.013611,-1.341473
494792,87200,83891,-1.743511,-2.077423,-0.479794,2.652909,-1.062421,3.145582,4.618248,2.371457,...,1.369016,-1.767892,3.161354,0.202375,-2.428657,-0.027,3.137358,-1.728567,2.228651,3.298067
184455,49916,33636,-1.342859,-0.036418,0.113877,-0.519609,2.663784,-1.806143,0.563247,-1.637207,...,8.350389,-6.6445,1.76586,-2.405689,-3.125323,-6.907196,1.512795,2.620311,-6.4203,-0.710272
406623,64099,85607,-1.026308,0.273135,-4.029389,2.013609,-1.36616,5.75684,-4.059242,3.239184,...,-0.296204,-3.828959,7.683974,7.48626,-1.136587,-5.91629,6.369218,0.341874,-1.825805,-1.72065
320235,36463,14221,-0.676381,-4.274764,3.033229,2.854737,0.725373,2.917539,-1.39027,-0.537284,...,-0.242841,0.348852,0.859614,-3.905689,0.112337,2.80562,0.773921,-0.663763,0.076375,0.266612


In [24]:
# Sanity check: columns = 2 id columns + 2 * embedding_dim embedding columns.
inference_df.shape

(500000, 130)

In [25]:
# Persist the inference set. to_csv is called with its defaults, so the row
# index is written as an extra, unnamed leading column.
# Alternative output name for the RolX embeddings:
#inference_df.to_csv('data/rolx_inference_weighted.csv')
inference_df.to_csv('data/node2vec_medium_inference_weighted.csv')