In [None]:
import networkx as nx
import numpy as np
import pandas as pd
import random
import json

In [None]:
def load_embeddings(filename):
    """Load an object stored in a ``.npy`` file as a 0-d object array.

    ``np.load`` is called with ``allow_pickle=True`` and ``.item()`` unwraps the
    single stored Python object (the notebook expects a dict of embeddings).

    NOTE(review): ``allow_pickle=True`` unpickles the file contents, so this
    should only be pointed at files from a trusted source.
    """
    return np.load(filename, allow_pickle=True).item()

In [None]:
def load_embeddings_json(filename):
    """Load node embeddings from a JSON file containing a single JSON object.

    JSON object keys are always strings, so every key is converted with
    ``int()`` to recover integer node ids; values are returned unchanged.

    Parameters
    ----------
    filename : str or path-like
        Path of the JSON file to read.

    Returns
    -------
    dict
        Mapping of ``int(key)`` to the value stored under that key.

    Raises
    ------
    ValueError
        If the file is not valid JSON (``json.JSONDecodeError`` is a
        ``ValueError``) or a key cannot be converted by ``int()``.
    """
    # The context manager closes the handle even when parsing fails.
    with open(filename, encoding='utf-8') as json_file:
        json_data = json.load(json_file)
    return {int(k): v for k, v in json_data.items()}

In [None]:
# Load the node embeddings as a dict keyed by integer node id
# (load_embeddings_json converts the JSON string keys with int()).
# Alternative source: a .npy file holding a pickled dict, read via load_embeddings:
#embeddings = load_embeddings('data/node2vec_medium.npy')
embeddings = load_embeddings_json('models/node2vec-dimension-512_lr-0.0500_seed-1234_epochs-250_numwalks-100_walklength-10_window-5_embedding.json')

In [None]:
# Read the embedding width from an arbitrary entry instead of a hard-coded node id,
# so this cell does not raise KeyError for embedding files that lack that node.
# Assumes every embedding vector has the same length.
embedding_dim = len(next(iter(embeddings.values())))
print(embedding_dim)

In [None]:
def get_weights_dict(filename):
    """Build an edge-weight lookup from a headerless three-column CSV.

    The columns are read as ``src``, ``dst`` and ``weight``. Each row is stored
    under both ``(src, dst)`` and ``(dst, src)`` so the weight can be looked up
    with the endpoints in either order. If a pair occurs more than once, the
    last occurrence wins.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file; it must have exactly three columns and no header.

    Returns
    -------
    dict
        Mapping of ``(node, node)`` tuples to the edge weight.
    """
    weights = pd.read_csv(filename, header = None)
    weights.columns = ['src', 'dst', 'weight']

    weights_dict = {}
    # Walk the three columns in lock-step: a single pass over the data instead
    # of three positional .iloc lookups per row, which is far slower on large
    # edge lists.
    for src, dst, weight in zip(weights['src'], weights['dst'], weights['weight']):
        weights_dict[(src, dst)] = weight
        weights_dict[(dst, src)] = weight
    return weights_dict

In [None]:
# Edge-weight lookup keyed by (src, dst) and (dst, src), built from the weighted edge-list CSV.
weights_dict = get_weights_dict('data/reddit_nodes_weighted_full.csv')

In [None]:
# Load graph into networkx (weighted, undirected)
def load_graph(filename):
    """Read a headerless ``source,target,weight`` CSV into an undirected ``nx.Graph``.

    Every row becomes an edge between ``source`` and ``target`` carrying the
    third column as its ``weight`` attribute.
    """
    edge_table = pd.read_csv(filename, names=['source', 'target', 'weight'], header=None)
    return nx.from_pandas_edgelist(
        edge_table,
        source='source',
        target='target',
        edge_attr='weight',
        create_using=nx.Graph(),
    )

In [None]:
# Build the undirected weighted graph from the same CSV that get_weights_dict reads.
G = load_graph('data/reddit_nodes_weighted_full.csv')

In [None]:
# generate positive examples of edges
def get_positive_examples(G, embeddings, weights_dict):
    """Build one labelled feature row per graph edge.

    A row is the source embedding followed by the destination embedding and,
    as the final element, the edge weight looked up in ``weights_dict`` under
    ``(src, dst)``. Edges with an endpoint missing from ``embeddings`` are
    skipped.

    Returns a list of rows (each a plain list), in ``G.edges()`` order.
    """
    rows = []
    for pair in G.edges():
        u, v = pair[0], pair[1]
        if u in embeddings and v in embeddings:
            # label = edge weight
            rows.append([*embeddings[u], *embeddings[v], weights_dict[(u, v)]])
    return rows

In [None]:
# generate negative edges
def get_negative_edges(G, num_examples = 1000000, attempts = 50000000, len_threshold = 5):
    """Randomly sample node pairs that are far apart and not directly connected.

    Up to ``attempts`` random pairs are drawn (with replacement) from the graph's
    nodes. A pair is kept when it has no edge, a path between the two nodes
    exists, and the unweighted shortest path is at least ``len_threshold`` hops.
    Sampling stops early once ``num_examples`` distinct ordered pairs are held.

    Returns the kept ``(src, dst)`` tuples as a list (order not defined, since
    they are accumulated in a set).
    """
    candidates = list(G.nodes())
    sampled = set()
    tries = 0
    while tries < attempts and len(sampled) != num_examples:
        tries += 1
        src, dst = random.choices(candidates, k = 2)
        if G.has_edge(src, dst):
            continue
        try:
            # weight=None -> hop count rather than weighted distance
            hops = nx.shortest_path_length(G, source=src, target=dst, weight = None)
        except nx.NetworkXNoPath:
            # nodes in different components: not usable as a negative
            continue
        if hops >= len_threshold:
            sampled.add((src, dst))
    return list(sampled)

In [None]:
#neg_edges = get_negative_edges(G)
#np.save('data/negative_sample_edges_large2.npy', neg_edges)
#print(len(neg_edges))

In [None]:
# generate negative examples
def get_negative_examples(G, embeddings, negative_edges):
    """Build one feature row with label 0 per negative (non-edge) node pair.

    Parameters
    ----------
    G : object
        Unused; kept so existing calls keep working.
    embeddings : dict
        Mapping of node id to its embedding vector.
    negative_edges : iterable
        Pairs indexable as ``edge[0]`` / ``edge[1]`` (tuples, lists or rows of
        a 2-D array).

    Returns
    -------
    list of list
        For each pair whose two endpoints are both in ``embeddings``: the
        source embedding, then the destination embedding, then the label ``0``.
    """
    # The graph is not consulted here, so no node list is materialised.
    neg_examples = []
    for edge in negative_edges:
        src = edge[0]
        dst = edge[1]
        if src not in embeddings or dst not in embeddings:
            continue
        edge_vector = list(embeddings[src]) + list(embeddings[dst]) + [0] # label = 0
        neg_examples.append(edge_vector)
    return neg_examples

In [None]:
# One row per graph edge whose endpoints both have embeddings; the row's last element is the edge weight.
pos_examples = get_positive_examples(G, embeddings, weights_dict)
num_pos_examples = len(pos_examples)
print(num_pos_examples)

In [None]:
# Load pre-sampled negative edges: pairs of nodes with no edge between them.
# get_negative_edges keeps pairs whose unweighted shortest path is >= len_threshold hops (default 5).
# NOTE(review): the parameters used to generate this particular file are not shown in the notebook - confirm.
negative_edges = np.load('data/negative_sample_edges.npy')

In [None]:
# One row per loaded negative pair whose endpoints both have embeddings; the row's last element is the label 0.
neg_examples = get_negative_examples(G, embeddings, negative_edges)
num_neg_examples = len(neg_examples)
print(num_neg_examples)

In [None]:
# Plain list concatenation: all positive rows first, then all negative rows (no shuffling here).
all_examples = pos_examples + neg_examples

In [None]:
# create train/test dataframe from examples
# Columns: src0..src{d-1}, dst0..dst{d-1}, label (d = embedding_dim), matching the
# row layout produced by get_positive_examples / get_negative_examples.
cols = ['src' + str(i) for i in range(embedding_dim)] + ['dst' + str(i) for i in range(embedding_dim)] + ['label']
df = pd.DataFrame(all_examples, columns = cols) 

In [None]:
# No reset_index is needed: df is freshly built from a list of rows, so it already
# has a default 0..n-1 index. DataFrame.reset_index also returns a new frame rather
# than modifying df, so calling it without assignment would only make a throwaway copy.
df.sample(10)

In [None]:
# (rows, columns); the column count is 2 * embedding_dim + 1 given the column list built for df.
df.shape

In [None]:
# Persist the labelled dataset. to_csv is called with its defaults, so the row index
# is written as the first (unnamed) column of the CSV.
df.to_csv('data/node2vec_512dim.csv')

In [None]:
# generate inference examples
def get_inference_examples(G, embeddings, edges_used, num_examples = 500000, attempts = 1000000):
    """Sample random non-adjacent node pairs and build unlabeled rows for inference.

    Up to ``attempts`` random pairs are drawn (with replacement) from the graph's
    nodes. A pair is kept when both nodes have embeddings, the graph has no edge
    between them, and the ordered pair ``(src, dst)`` is not in ``edges_used``.
    Sampling stops early once ``num_examples`` rows have been collected. The
    same pair may be emitted more than once.

    Parameters
    ----------
    G : graph
        Object providing ``nodes()`` and ``has_edge(src, dst)``.
    embeddings : dict
        Mapping of node id to its embedding vector.
    edges_used : iterable of pairs
        Ordered pairs to exclude (e.g. negatives already used for training).
        May be a list/set of tuples or a 2-D NumPy array with one pair per row.
    num_examples : int
        Maximum number of rows to return.
    attempts : int
        Maximum number of random draws.

    Returns
    -------
    list of list
        Rows of the form ``[src, dst, *src_embedding, *dst_embedding]``.
    """
    # Normalise the exclusions into a set of (src, dst) tuples. Testing
    # `tuple in ndarray` does NOT test for a matching row: NumPy broadcasts the
    # comparison and returns True when src matches the first column of ANY row
    # or dst matches the second column of ANY row, wrongly rejecting most pairs
    # (and it scans the whole array on every check).
    if isinstance(edges_used, np.ndarray):
        edges_used = edges_used.tolist()
    excluded = {(edge[0], edge[1]) for edge in edges_used}

    node_list = list(G.nodes())
    inference_examples = []
    for i in range(attempts):
        if i % 100000 == 0:
            print(i)  # coarse progress indicator
        if len(inference_examples) == num_examples:
            break
        src, dst = random.choices(node_list, k = 2)
        if src not in embeddings or dst not in embeddings:
            continue
        if G.has_edge(src, dst):
            continue
        if (src, dst) in excluded:
            continue
        # NOTE(review): src == dst pairs are not filtered out here - confirm
        # whether self-pairs are wanted in the inference set.
        edge_vector = [src, dst] + list(embeddings[src]) + list(embeddings[dst])
        inference_examples.append(edge_vector)
    return inference_examples

In [None]:
# Sample candidate pairs for inference, passing the loaded negative edges as the pairs to exclude.
inference_examples = get_inference_examples(G, embeddings, negative_edges)
print(len(inference_examples))

In [None]:
# create inference dataframe from examples
# Columns: src_id, dst_id, then src0..src{d-1}, dst0..dst{d-1} (d = embedding_dim); there is no label column.
cols = ['src_id', 'dst_id'] + ['src' + str(i) for i in range(embedding_dim)] + ['dst' + str(i) for i in range(embedding_dim)]
inference_df = pd.DataFrame(inference_examples, columns = cols) 

In [None]:
# Spot-check 10 random rows of the inference table.
inference_df.sample(10)

In [None]:
# (rows, columns); the column count is 2 * embedding_dim + 2 given the column list built for inference_df.
inference_df.shape

In [None]:
# Persist the inference table; to_csv defaults also write the row index as the first column.
# Alternative output path kept for reference (presumably from a RolX-embedding run - confirm):
#inference_df.to_csv('data/rolx_inference_weighted.csv')
inference_df.to_csv('data/node2vec_512dim_inference.csv')