In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import random

In [2]:
def load_embeddings(filename):
    """Load the single Python object stored in a ``.npy`` file.

    Args:
        filename: Path to a ``.npy`` file written with pickling enabled
            (the notebook expects it to contain a dict).

    Returns:
        The unwrapped Python object held by the saved array.

    Raises:
        ValueError: If the loaded array holds more than one element, since
            ``ndarray.item()`` can only unwrap a single value.
    """
    # SECURITY: allow_pickle=True unpickles arbitrary objects, which can run
    # code on load. Only point this at files from a trusted source.
    packed = np.load(filename, allow_pickle=True)
    # .item() pulls the one stored object back out of the array wrapper.
    return packed.item()

In [3]:
# The .npy file is expected to hold a pickled dict. Later cells index it as
# embeddings[node], so it presumably maps node id -> feature list (TODO confirm).
embeddings = load_embeddings('data/rolx_embeddings.npy')

In [4]:
# Width of one node embedding. Read it from an arbitrary entry rather than
# embeddings[0], which raises KeyError when no node has id 0.
# Assumes every embedding has the same length.
embedding_dim = len(next(iter(embeddings.values())))

In [5]:
# Load graph into networkx (weighted, undirected)
def load_graph(filename):
    """Read a header-less edge-list CSV and build an undirected weighted graph.

    Args:
        filename: Path to a CSV whose three columns are read as
            ``source``, ``target`` and ``weight`` (no header row).

    Returns:
        An ``nx.Graph`` with one edge per CSV row and the third column stored
        as the ``weight`` edge attribute.
    """
    edge_table = pd.read_csv(
        filename,
        header=None,
        names=['source', 'target', 'weight'],
    )
    # from_pandas_edgelist picks up the 'source'/'target' columns by default.
    return nx.from_pandas_edgelist(
        edge_table,
        edge_attr='weight',
        create_using=nx.Graph(),
    )

In [6]:
# Build the undirected, weighted graph from the edge-list CSV.
G = load_graph('data/reddit_nodes_weighted_full.csv')

In [7]:
# generate positive examples of edges
def get_positive_examples(G, embeddings):
    """Build one labelled feature row (label 1) for every edge in ``G``.

    Each row is the source embedding followed by the destination embedding
    followed by the label ``1``. Rows come out in ``G.edges()`` order.

    Args:
        G: Graph-like object exposing ``edges()`` that yields ``(src, dst)``.
        embeddings: Mapping from node id to a sequence of feature values
            (list, tuple or 1-D array).

    Returns:
        A list of lists, one per edge, each of length
        ``len(src_embedding) + len(dst_embedding) + 1``.

    Raises:
        KeyError: If an edge endpoint has no entry in ``embeddings``.
    """
    pos_examples = []
    for src, dst in G.edges():
        # Unpack rather than use `+`: list `+` concatenates, but ndarray `+`
        # adds element-wise and tuple `+` list raises, which would corrupt
        # or break the row for non-list embeddings.
        pos_examples.append([*embeddings[src], *embeddings[dst], 1])  # label = 1
    return pos_examples

In [8]:
# generate negative examples
def get_negative_examples(G, embeddings, num_examples, attempts = 3000000, len_threshold = 5):
    """Sample labelled non-edges (label 0) whose endpoints are far apart in ``G``.

    Random node pairs are drawn until ``num_examples`` rows are collected or
    ``attempts`` draws have been made. A pair is accepted only if all hold:

    * the two nodes are distinct and not joined by an edge,
    * the pair has not been accepted before, in either orientation (``G`` is
      treated as undirected, so ``(u, v)`` and ``(v, u)`` are the same non-edge),
    * an unweighted path exists and its length is at least ``len_threshold``.

    Sampling uses the global ``random`` module, so results are reproducible
    only if the caller seeds it.

    Args:
        G: Graph exposing ``nodes()`` and ``has_edge(u, v)``, and accepted by
            ``nx.shortest_path_length``.
        embeddings: Mapping from node id to a sequence of feature values.
        num_examples: Number of negative rows wanted.
        attempts: Upper bound on random draws.
        len_threshold: Minimum hop distance for a pair to count as negative.

    Returns:
        ``(neg_examples, edges_used)``. ``neg_examples`` is a list of rows
        (source embedding, destination embedding, then ``0``). ``edges_used``
        is the set of ``(src, dst)`` tuples behind those rows, in the
        orientation they were sampled. Fewer than ``num_examples`` rows are
        returned if the attempt budget runs out first.
    """
    node_list = list(G.nodes())
    neg_examples = []
    edges_used = set()
    if not node_list:
        # random.choices raises IndexError on an empty population.
        return neg_examples, edges_used
    for _ in range(attempts):
        # `>=` so the loop also stops for num_examples <= 0.
        if len(neg_examples) >= num_examples:
            break
        # random.choices samples WITH replacement, so src may equal dst.
        src, dst = random.choices(node_list, k = 2)
        if src == dst or G.has_edge(src, dst):
            continue
        # Skip pairs already emitted, in either orientation, so the training
        # set has no duplicate rows and len(edges_used) == len(neg_examples).
        if (src, dst) in edges_used or (dst, src) in edges_used:
            continue
        try:
            path_length = nx.shortest_path_length(G, source=src, target=dst, weight = None)
        except nx.NetworkXNoPath:
            # Disconnected pairs are deliberately not used as negatives.
            continue
        if path_length >= len_threshold:
            # Unpack so list, tuple and array embeddings all concatenate.
            neg_examples.append([*embeddings[src], *embeddings[dst], 0])  # label = 0
            edges_used.add((src, dst))
    return neg_examples, edges_used

In [10]:
# One positive row per edge in G; the count is reused below as the target
# number of negative examples.
pos_examples = get_positive_examples(G, embeddings)
num_pos_examples = len(pos_examples)
print(num_pos_examples)

309667


In [11]:
# Ask for as many negatives as positives so the two classes are balanced.
# The sampler can return fewer if its attempt budget runs out, so compare the
# printed count with the positive count above.
neg_examples, edges_used = get_negative_examples(G, embeddings, num_pos_examples)
num_neg_examples = len(neg_examples)
print(num_neg_examples)

309667


In [12]:
# Positives first, then negatives; the rows are not shuffled here.
all_examples = pos_examples + neg_examples

In [13]:
# create train/test dataframe from examples
# Column layout mirrors each row: src0..srcN-1, dst0..dstN-1, then the label.
cols = [f'{side}{i}' for side in ('src', 'dst') for i in range(embedding_dim)] + ['label']
df = pd.DataFrame(all_examples, columns=cols)

In [14]:
# Spot-check a few random rows. The former `df.reset_index()` call here threw
# away its result (reset_index returns a new frame), so it did nothing and
# has been removed.
df.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst87,dst88,dst89,dst90,dst91,dst92,dst93,dst94,dst95,label
508397,1.0,1.0,1.0,1.0,3.0,4.0,4.0,5.0,7.0,46.0,...,2647133.0,3353275.0,8366189.0,5547.0,23350.0,120147.0,345094.0,152922.0,288501.0,0
550533,2.0,2.0,3.0,32.0,287.0,436.0,145.5,249.0,1001.0,6773.5,...,4467.0,56854.0,181472.0,55.0,80.0,453.0,4226.0,39200.0,125358.0,0
33203,890.0,1897.0,25566.0,133596.0,136029.0,278026.0,209.293258,610.473034,5042.123596,26774.698876,...,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0,1
556037,1.0,1.0,1.0,1.0,183.0,348.0,184.0,349.0,3701.0,41343.0,...,841.0,18811.0,66470.0,14.0,15.0,50.0,831.0,16628.0,58988.0,0
200936,128.0,246.0,1936.0,21724.0,69530.0,205738.0,572.453125,1944.84375,15141.554688,64506.601562,...,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0,1
422368,1.0,5.0,1.0,5.0,112.0,218.0,113.0,223.0,599.0,2394.0,...,315124.0,224145.0,506607.0,4601.0,11182.0,102665.0,310542.0,145456.0,257796.0,0
161321,7.0,15.0,17.0,65.0,1036.0,2141.0,151.857143,322.285714,3746.142857,19662.428571,...,41103539.0,124978835.0,355628556.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0,1
239606,47.0,90.0,147.0,723.0,6753.0,16336.0,148.93617,376.425532,2475.617021,12961.042553,...,2145258.0,2917682.0,7292718.0,4601.0,19896.0,110785.0,333575.0,152995.0,325429.0,1
35343,275.0,743.0,1165.0,8451.0,46320.0,148827.0,175.909091,599.949091,4532.745455,18265.138182,...,44294540.0,145149413.0,447730576.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0,1
461617,1.0,2.0,1.0,2.0,1.0,1.0,2.0,3.0,2.0,3.0,...,1520472.0,1265565.0,2866363.0,4601.0,19896.0,110785.0,333575.0,145640.0,269164.0,0


In [15]:
# Rows = positive + negative examples; columns = 2 * embedding_dim features + label.
df.shape

(619334, 193)

In [16]:
# Persist the labelled dataset. to_csv writes the row index as an extra
# unnamed first column by default.
# NOTE(review): confirm the downstream reader expects that column (e.g. reads
# with index_col=0).
df.to_csv('data/rolx_dataset.csv')

In [33]:
# generate inference examples
def get_inference_examples(G, embeddings, edges_used, num_examples = 100000, attempts = 1000000):
    """Sample unlabelled candidate node pairs that are not edges in ``G``.

    Random node pairs are drawn until ``num_examples`` rows are collected or
    ``attempts`` draws have been made. A pair is accepted only if the nodes
    are distinct, no edge joins them, and the pair does not appear in
    ``edges_used`` in either orientation (``G`` is treated as undirected, so a
    reversed training pair is still the same pair).

    Sampling uses the global ``random`` module, so results are reproducible
    only if the caller seeds it. The same pair may be drawn more than once.

    NOTE(review): only feature rows are returned, not the sampled node pairs,
    so predictions cannot be traced back to nodes from this output alone.

    Args:
        G: Graph exposing ``nodes()`` and ``has_edge(u, v)``.
        embeddings: Mapping from node id to a sequence of feature values.
        edges_used: Collection of ``(src, dst)`` tuples to exclude, e.g. the
            negative training pairs.
        num_examples: Number of rows wanted.
        attempts: Upper bound on random draws.

    Returns:
        A list of rows, each the source embedding followed by the destination
        embedding. Fewer than ``num_examples`` rows are returned if the
        attempt budget runs out first.
    """
    node_list = list(G.nodes())
    inference_examples = []
    if not node_list:
        # random.choices raises IndexError on an empty population.
        return inference_examples
    for _ in range(attempts):
        # `>=` so the loop also stops for num_examples <= 0.
        if len(inference_examples) >= num_examples:
            break
        # random.choices samples WITH replacement, so src may equal dst; a
        # node paired with itself is not a meaningful candidate link.
        src, dst = random.choices(node_list, k = 2)
        if src == dst or G.has_edge(src, dst):
            continue
        # Exclude training pairs in both orientations to avoid leakage.
        if (src, dst) in edges_used or (dst, src) in edges_used:
            continue
        # Unpack so list, tuple and array embeddings all concatenate.
        inference_examples.append([*embeddings[src], *embeddings[dst]])
    return inference_examples

In [34]:
# Sample candidate non-edges, excluding the pairs already used as negative
# training examples; the printed count shows whether the target was reached.
inference_examples = get_inference_examples(G, embeddings, edges_used)
print(len(inference_examples))

100000


In [35]:
# create inference dataframe from examples
# Same feature columns as the training frame, without the label column.
cols = [f'{side}{i}' for side in ('src', 'dst') for i in range(embedding_dim)]
inference_df = pd.DataFrame(inference_examples, columns=cols)

In [36]:
# Spot-check a few random inference rows.
inference_df.sample(10)

Unnamed: 0,src0,src1,src2,src3,src4,src5,src6,src7,src8,src9,...,dst86,dst87,dst88,dst89,dst90,dst91,dst92,dst93,dst94,dst95
44715,5.0,5.0,6.0,11.0,626.0,1023.0,126.6,208.0,1253.8,6363.8,...,6569407.0,41103539.0,124978835.0,355628556.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
48035,1.0,1.0,1.0,1.0,676.0,2352.0,677.0,2353.0,16780.0,74319.0,...,10274.0,75030.0,134463.0,336531.0,310.0,1105.0,9933.0,73422.0,103233.0,249104.0
39312,3.0,3.0,6.0,10.0,309.0,547.0,106.0,188.0,1276.666667,10942.333333,...,2599695.0,12300929.0,17107534.0,42579513.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
13067,18.0,44.0,67.0,453.0,1437.0,4315.0,86.277778,287.611111,1289.222222,9983.944444,...,5412041.0,31845920.0,71265363.0,200957725.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
28203,2.0,2.0,3.0,3.0,1772.0,4246.0,888.0,2125.0,16334.5,78679.5,...,6569407.0,41103539.0,124978835.0,355628556.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
24182,2.0,2.0,3.0,11.0,576.0,1944.0,290.0,982.0,6853.0,42315.5,...,289666.0,1024218.0,862208.0,1932667.0,5547.0,23350.0,120147.0,345094.0,148508.0,280036.0
54464,11.0,14.0,28.0,114.0,364.0,802.0,37.181818,92.363636,139.636364,889.818182,...,533512.0,2048886.0,2554069.0,6355116.0,4601.0,19896.0,110785.0,333575.0,152995.0,325429.0
28654,1.0,1.0,1.0,1.0,91.0,150.0,92.0,151.0,1132.0,11106.0,...,5860667.0,36571844.0,87694667.0,253188045.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
12101,4.0,5.0,6.0,7.0,940.0,1864.0,237.0,468.25,3449.25,20473.0,...,3009338.0,14273771.0,21413051.0,54353125.0,5547.0,23350.0,120147.0,345094.0,162133.0,351372.0
93430,1.0,1.0,1.0,1.0,2.0,7.0,3.0,8.0,4.0,87.0,...,719710.0,3144785.0,3059480.0,6748368.0,3530.0,19896.0,110785.0,333575.0,152922.0,302627.0


In [37]:
# Rows = sampled candidate pairs; columns = 2 * embedding_dim features (no label).
inference_df.shape

(100000, 192)

In [38]:
# Persist the inference set. to_csv writes the row index as an extra unnamed
# first column by default.
# NOTE(review): confirm the downstream reader expects that column.
inference_df.to_csv('data/rolx_inference.csv')