In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import random

In [2]:
def load_embeddings(filename):
    """Load the single Python object stored in a pickled ``.npy`` file.

    ``np.load`` hands the object back wrapped in a 0-d object array;
    ``.item()`` unwraps it. Callers in this notebook expect a dict.

    NOTE: ``allow_pickle=True`` unpickles arbitrary objects, so only point
    this at files from a trusted source.
    """
    wrapped = np.load(filename, allow_pickle=True)
    payload = wrapped.item()
    return payload

In [65]:
# expects npy file to be a dict
# Alternative embeddings file, currently disabled:
#embeddings = load_embeddings('data/rolx_embeddings.npy')
# Active embeddings file; the result is indexed by node id in later cells.
embeddings = load_embeddings('data/node2vec_basic.npy')

In [79]:
# Take the embedding width from an arbitrary entry instead of assuming that a
# node with key 0 exists in the dictionary.
embedding_dim = len(next(iter(embeddings.values())))
print(embedding_dim)

64


In [67]:
def get_weights_dict(filename):
    """Build a symmetric ``(src, dst) -> weight`` lookup from a weighted edge list.

    Parameters
    ----------
    filename : str or path-like
        Header-less CSV with exactly three columns, read as source node,
        destination node and edge weight.

    Returns
    -------
    dict
        Maps both ``(src, dst)`` and ``(dst, src)`` to the row's weight. If a
        pair appears on several rows, the last row wins.
    """
    weights = pd.read_csv(filename, header = None)
    weights.columns = ['src', 'dst', 'weight']

    weights_dict = {}
    # Walk the three columns in lockstep; indexing every cell through .iloc
    # inside a Python loop is dramatically slower on large edge lists.
    for src, dst, weight in zip(weights['src'], weights['dst'], weights['weight']):
        weights_dict[(src, dst)] = weight
        weights_dict[(dst, src)] = weight
    return weights_dict

In [68]:
# Symmetric (src, dst) -> weight lookup built from the weighted edge-list CSV.
weights_dict = get_weights_dict('data/reddit_nodes_weighted_full.csv')

In [69]:
# Build the undirected, weighted networkx graph from an edge-list CSV
def load_graph(filename):
    """Read a header-less ``source,target,weight`` CSV into an ``nx.Graph``.

    Every row becomes one undirected edge whose ``weight`` attribute holds the
    third column.
    """
    edge_table = pd.read_csv(
        filename,
        header=None,
        names=['source', 'target', 'weight'],
    )
    return nx.from_pandas_edgelist(
        edge_table,
        edge_attr='weight',
        create_using=nx.Graph(),
    )

In [70]:
# Graph built from the same edge-list CSV that feeds weights_dict.
G = load_graph('data/reddit_nodes_weighted_full.csv')

In [82]:
# build one labelled feature row per existing edge
def get_positive_examples(G, embeddings, weights_dict):
    """Return ``[src embedding..., dst embedding..., weight]`` for each edge of ``G``.

    The trailing label is the edge weight, looked up in ``weights_dict`` under
    the ``(src, dst)`` key in the orientation yielded by ``G.edges()``.
    """
    return [
        [*embeddings[src], *embeddings[dst], weights_dict[(src, dst)]]
        for src, dst in G.edges()
    ]

In [83]:
# generate negative examples
def get_negative_examples(G, embeddings, num_examples, attempts = 3000000, len_threshold = 5):
    """Sample distinct, far-apart, non-adjacent node pairs as negative examples.

    Ordered node pairs are drawn uniformly at random (with replacement) from
    ``G``. A pair is kept when it is not an edge, its nodes are connected, and
    their unweighted shortest-path length is at least ``len_threshold``.

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``has_edge()`` (a networkx graph)
    embeddings : mapping from node to an iterable embedding
    num_examples : int
        Number of rows wanted; fewer are returned if ``attempts`` runs out.
    attempts : int
        Upper bound on the number of random draws.
    len_threshold : int
        Minimum unweighted shortest-path length for a pair to qualify.

    Returns
    -------
    (list, set)
        ``neg_examples``: rows ``[src embedding..., dst embedding..., 0]``.
        ``edges_used``: the ``(src, dst)`` tuples behind those rows, one per row.
    """
    node_list = list(G.nodes())
    neg_examples = []
    edges_used = set()
    # A pair needs two different nodes; random.choices also fails on an empty list.
    if len(node_list) < 2:
        return neg_examples, edges_used
    for _ in range(attempts):
        # ">=" also stops immediately for a zero or negative request.
        if len(neg_examples) >= num_examples:
            break
        src, dst = random.choices(node_list, k = 2)
        # Sampling is with replacement, so drop self-pairs, and drop pairs that
        # were already emitted so each row is unique. Both checks are cheap and
        # run before the shortest-path search.
        if src == dst or (src, dst) in edges_used:
            continue
        if G.has_edge(src, dst):
            continue
        try:
            path_length = nx.shortest_path_length(G, source=src, target=dst, weight = None)
        except nx.NetworkXNoPath:
            # Disconnected pairs are skipped rather than treated as negatives.
            continue
        if path_length >= len_threshold:
            edge_vector = list(embeddings[src]) + list(embeddings[dst]) + [0] # label = 0
            neg_examples.append(edge_vector)
            edges_used.add((src, dst))
    return neg_examples, edges_used

In [84]:
# One labelled row per edge of G; the count is reused as the number of
# negatives to request.
pos_examples = get_positive_examples(G, embeddings, weights_dict)
num_pos_examples = len(pos_examples)
print(num_pos_examples)

309667


In [87]:
# Fixed seed so the sampled negative set is reproducible across kernel restarts.
random.seed(42)
# Request as many negatives as there are positives.
neg_examples, edges_used = get_negative_examples(G, embeddings, num_pos_examples)
num_neg_examples = len(neg_examples)
print(num_neg_examples)

309667


In [88]:
# Positives first, then negatives, as one list of feature rows.
all_examples = pos_examples + neg_examples

In [89]:
# create train/test dataframe from examples
# Column layout: src0..src{d-1}, dst0..dst{d-1}, label (d = embedding_dim).
cols = (
    [f'src{i}' for i in range(embedding_dim)]
    + [f'dst{i}' for i in range(embedding_dim)]
    + ['label']
)
df = pd.DataFrame(all_examples, columns=cols)

In [90]:
# Preview ten random rows. df was built from a plain list, so it already has a
# default integer index and no reset is needed.
df.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63,label
117988,-0.701049,-0.04671,-0.600066,-3.315555,-2.640109,0.593209,5.240912,0.021872,-0.997949,3.402424,...,0.798436,2.568911,-1.830647,-0.950117,1.066381,0.636799,-1.540267,-0.732816,1.091093,1
473414,-0.498076,-0.30228,-0.431933,-0.342239,0.267582,-0.048998,0.097196,-0.144548,0.445296,0.649405,...,-0.050774,0.988086,-0.48688,0.306718,-0.438657,-0.203259,0.014424,0.261689,0.653872,0
2365,-2.648076,-0.674411,-2.9043,2.113153,-0.050607,2.239888,-0.705597,-2.84022,2.70524,-0.581743,...,-1.098105,4.196887,-2.699295,0.87325,-0.082734,-1.730593,-1.09992,-3.052021,0.486465,1
358508,0.906426,-0.2277,0.258384,-0.049875,0.947797,-0.663801,0.27429,0.229944,1.459742,1.399658,...,-0.878457,-0.123665,-0.775969,0.712715,-0.150373,0.175186,1.814022,-0.797571,-0.028064,0
159393,-1.251741,-2.102675,-1.463369,-1.937316,1.356638,2.148602,-0.755819,-1.361258,-2.355413,-2.591883,...,0.158168,0.745268,6.290748,0.384049,5.992439,1.640739,1.419432,0.798864,-2.315643,3
195136,1.602249,-0.161369,0.884617,6.905804,-3.611916,2.604372,0.909167,-5.249211,-2.416237,0.745585,...,1.020442,2.773128,-2.580252,5.051717,0.170699,1.861395,-1.004747,0.767615,1.6941,1
498540,-0.41128,-0.360273,0.487534,-0.5515,1.508739,-0.475428,0.433168,-0.42064,0.245376,-0.064905,...,-0.068439,0.137591,-0.141986,-1.335153,-0.674775,-0.562063,-0.644027,-0.22316,-0.42329,0
209743,0.083377,0.141106,-0.832278,-3.716035,-1.803339,-3.18343,1.148587,2.006329,3.19821,-1.145033,...,-2.59867,-1.047897,0.546976,3.686751,0.339996,-2.362295,-3.110046,2.883427,1.171778,1
498261,0.182295,1.016341,0.895036,-0.129915,1.438504,-1.573911,-0.447342,-0.661573,-0.235324,0.029874,...,-0.504932,0.612753,-0.243442,-0.103039,-0.126456,0.368561,0.140518,0.176103,-0.535851,0
460420,-1.703591,0.164987,0.177921,-0.733648,1.124397,-2.067785,0.128721,0.790591,0.126318,0.689976,...,-0.978866,-0.709224,-0.037056,-0.282881,0.295664,-0.655805,0.432728,0.188787,0.690493,0


In [91]:
# Sanity check: rows = positives + negatives, columns = 2 * embedding_dim + 1 label.
df.shape

(619334, 129)

In [92]:
# Persist the labelled dataset; the commented-out line is an alternative output path.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies — confirm downstream readers expect that.
#df.to_csv('data/rolx_dataset_weighted.csv')
df.to_csv('data/node2vec_basic_dataset_weighted.csv')

In [95]:
# generate inference examples
def get_inference_examples(G, embeddings, edges_used, num_examples = 500000, attempts = 1000000):
    """Sample distinct non-edge node pairs to score at inference time.

    Ordered node pairs are drawn uniformly at random (with replacement) from
    ``G``. A pair is kept when its two nodes differ, it is not an edge of
    ``G``, it is not in ``edges_used``, and it has not already been emitted by
    this call.

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``has_edge()``
    embeddings : mapping from node to an iterable embedding
    edges_used : container of ``(src, dst)`` tuples to exclude; not modified
    num_examples : int
        Number of rows wanted; fewer are returned if ``attempts`` runs out.
    attempts : int
        Upper bound on the number of random draws.

    Returns
    -------
    list
        Rows of ``[src, dst, src embedding..., dst embedding...]``.
    """
    node_list = list(G.nodes())
    inference_examples = []
    # A pair needs two different nodes; random.choices also fails on an empty list.
    if len(node_list) < 2:
        return inference_examples
    # Tracked separately so the caller's edges_used is never mutated.
    seen_pairs = set()
    for _ in range(attempts):
        # ">=" also stops immediately for a zero or negative request.
        if len(inference_examples) >= num_examples:
            break
        src, dst = random.choices(node_list, k = 2)
        edge_tuple = (src, dst)
        # Sampling is with replacement: without the src == dst check a node
        # paired with itself passes has_edge() and ends up in the output.
        if src == dst or edge_tuple in seen_pairs or edge_tuple in edges_used:
            continue
        if G.has_edge(src, dst):
            continue
        edge_vector = [src, dst] + list(embeddings[src]) + list(embeddings[dst])
        inference_examples.append(edge_vector)
        seen_pairs.add(edge_tuple)
    return inference_examples

In [96]:
# Fixed seed so the sampled inference pairs are reproducible when this cell is
# re-run. Keep it different from any seed used for negative sampling, otherwise
# the same sequence of node pairs is replayed.
random.seed(43)
inference_examples = get_inference_examples(G, embeddings, edges_used)
print(len(inference_examples))

500000


In [97]:
# create inference dataframe from examples
# Column layout: src_id, dst_id, src0..src{d-1}, dst0..dst{d-1} (d = embedding_dim).
cols = (
    ['src_id', 'dst_id']
    + [f'src{i}' for i in range(embedding_dim)]
    + [f'dst{i}' for i in range(embedding_dim)]
)
inference_df = pd.DataFrame(inference_examples, columns=cols)

In [98]:
# Preview ten random inference rows.
inference_df.sample(10)

Unnamed: 0,src_id,dst_id,src0,src1,src2,src3,src4,src5,src6,src7,...,dst54,dst55,dst56,dst57,dst58,dst59,dst60,dst61,dst62,dst63
407473,82730,44657,0.75262,-0.630123,0.040539,-0.007708,-1.390579,0.635797,-1.528127,-0.950208,...,-0.425097,-0.253754,-2.308297,1.820423,1.850306,-1.850551,-0.424785,-1.093438,1.002581,2.088209
264201,15302,18885,-0.339255,-0.359826,-0.406256,0.687735,-0.680543,-1.370377,-0.92097,-0.791185,...,0.195369,-0.227075,1.328146,1.156351,0.778695,0.089531,-0.413677,0.27304,-0.213056,-0.453983
41167,43047,81754,-0.017926,0.029379,-0.486409,0.161017,0.750886,-0.281602,0.116351,0.11993,...,-1.415196,-0.925696,0.676919,0.916133,0.493906,-0.236645,1.347238,0.200409,-0.241949,0.261677
473616,56340,82527,0.870963,0.200561,-1.395895,-2.558268,-0.02684,-1.668455,0.863912,0.235182,...,-0.094044,-0.60183,1.156323,-0.154509,1.646063,0.120869,1.207994,-1.38351,0.444519,0.557945
91453,11076,86285,0.47172,-0.229952,-0.113845,0.629545,0.861416,-0.804096,0.017497,-0.106064,...,-0.708844,-0.388938,0.472475,0.47033,0.086138,0.151487,0.110256,-0.544546,-0.426234,-0.284594
259539,21368,59890,0.11665,-0.168886,-0.151804,0.806388,0.026276,0.185158,0.555937,-0.670893,...,-0.188616,-0.209671,-0.209739,-0.322244,1.529492,-0.736355,-0.380182,-0.433958,0.325367,0.745571
413720,4414,60029,0.295725,-0.179209,-0.673581,0.066297,0.315432,-0.16393,0.47547,0.421817,...,0.260882,-1.009694,-0.884255,-1.807481,-3.153533,0.343588,-2.115999,1.644628,0.129599,1.323013
297198,13965,59419,-0.938725,-0.286538,-0.851337,0.549733,0.498863,-0.390169,0.006602,0.299265,...,-0.335154,0.344872,0.26503,-0.743427,1.038278,0.382224,-0.163321,0.389669,-0.394833,-0.763715
465965,29,55765,-0.490312,-0.18632,-0.327912,0.271377,0.607819,0.514008,0.107967,-0.062261,...,-0.025115,-0.087273,0.24039,-0.291446,0.362844,-0.251191,-0.6254,0.492275,-0.068225,-0.112534
172347,87947,87301,-2.068022,0.116393,1.852752,-0.130352,0.501837,0.673705,-0.974124,1.124238,...,0.045424,-0.499469,1.152507,-0.268519,0.724043,-0.912904,-0.636645,-0.060434,-0.879472,0.423332


In [99]:
# Sanity check: columns = 2 id columns + 2 * embedding_dim feature columns.
inference_df.shape

(500000, 130)

In [100]:
# Persist the inference candidates; the commented-out line is an alternative output path.
# NOTE(review): no `index=` argument is passed, so pandas' default index
# handling applies — confirm downstream readers expect that.
#inference_df.to_csv('data/rolx_inference_weighted.csv')
inference_df.to_csv('data/node2vec_basic_inference_weighted.csv')