In [2]:
import pandas as pd
import numpy as np
import networkx as nx
import csv, random, json
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
from gensim.models import Word2Vec

In [120]:
# loading the training data into a list
with open('train.txt', 'r') as f:
    lines = f.readlines()

lines = [line.replace(' ', '\t') for line in lines]

with open('train.txt', 'w') as f:
    f.writelines(lines)

with open('train.txt') as f:
    reader = csv.reader(f, delimiter="\t")
    d = list(reader)

training_file = []
for row in d:
    training_file.append([ int(x) for x in row ])


In [121]:
# loading the test data into a list
test_file = pd.read_csv('test-public.csv')

In [122]:
# extracting the features of the nodes
with open('nodes.json', 'r') as f:
    features = json.load(f)

In [123]:
# turning the test data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for i,j in test_file.iterrows():
    node_list_1.append(j[1])
    node_list_2.append(j[2])

test_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
edge_tuples = []
for index, row in test_edge_list.iterrows():
    edge_tuples.append((row['node_1'],row['node_2']))

In [124]:
# turning the training data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for row in training_file:
    for connection in range(0,len(row)):
        if connection == 0:
            continue
        else:
            node_list_1.append(row[0])
            node_list_2.append(row[connection])

training_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})


In [319]:
# splitting up the list of positive edges in the graph
from sklearn.model_selection import train_test_split
y = np.ones(len(training_edge_list))
X_train, X_test, y_train, y_test = train_test_split(training_edge_list, y, test_size=0.3)
print(len(training_edge_list))
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

53872
37710
37710
16162
16162


In [350]:
# creating a graph from the list of edges in the training data
def create_graph(edge_list):
    graph_edges = nx.from_pandas_edgelist(edge_list, "node_1", "node_2")
    graph = nx.path_graph(4085)
    graph.add_edges_from(graph_edges.edges())
    n = graph.number_of_nodes()
    m = graph.number_of_edges()
    print("Number of nodes :", str(n))
    print("Number of edges :", str(m))
    print("Number of connected components :" + str(nx.number_connected_components(graph)))
    return graph

In [351]:
print("Training graph description: ")
graph = create_graph(X_train)
print("-------------------------------")
print("Validation graph description: ")
validation_graph = create_graph(X_test)
validation_graph.remove_edges_from(graph.edges())

Training graph description: 
Number of nodes : 4085
Number of edges : 28609
Number of connected components :1
-------------------------------
Validation graph description: 
Number of nodes : 4085
Number of edges : 17846
Number of connected components :1


In [352]:
# adding features to the nodes
def add_features(graph):
    for entry in features:
        id = entry['id']
        for key in entry:
            graph.nodes[id][key] = entry[key]
add_features(graph)
add_features(validation_graph)

In [10]:
# partitions the graph into communities of nodes
from community import community_louvain
community_dict = community_louvain.best_partition(graph)

In [54]:
# predicting the scores using the jaccard_coefficient
predictions_jac = nx.jaccard_coefficient(graph, edge_tuples)

scores_jac = []
for u, v, p in predictions_jac:
    scores_jac.append([u,v,p])

print("scores/labels calculated")

scores/labels calculated


In [404]:
# generating a Node2Vec model to learn the node embeddings
node2vec = Node2Vec(graph, dimensions=96, walk_length=100, num_walks=10, workers=4, p = 7, q = 2)
model = node2vec.fit(window=6, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 4085/4085 [00:09<00:00, 437.43it/s]


In [76]:
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

In [13]:
# generating the edge embeddings of the node2vec model
edges_kv = edges_embs.as_keyed_vectors()

Generating edge features: 100%|██████████| 8345655/8345655.0 [01:02<00:00, 132931.50it/s]


In [371]:
from gem.embedding.hope import HOPE
hope = HOPE(d=128, beta = .00002)
hope_graph_embeddings, t = hope.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

SVD error (low rank): 0.003905


In [77]:
# generating the similarities between each node in the graph through their embeddings
# take the dot product of the two vectors
# length of the vector = 1
# orthogonal = 0
scores = []
for edge in edge_tuples:
    node1 = edge[0]
    node2 = edge[1]
    #print(node1, node2)
    node1_vector = model.wv.get_vector(str(node1))
    node2_vector = model.wv.get_vector(str(node2))
    cos_sim = np.dot(node1_vector, node2_vector)/(np.linalg.norm(node1_vector)*np.linalg.norm(node2_vector))
    probability = max(0.0,cos_sim)
    scores.append(min(1.0,probability))

In [78]:
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Id", "Predicted"])
    for i in range(0, len(scores)):
        wr.writerow([i+1,scores[i]])
                     
print("output file written")

output file written


In [354]:
def generate_negative_edges(graph):
    n_edges = graph.number_of_edges()
    n_nodes = graph.number_of_nodes()
    nneg = int(n_edges)
    n_neighbors = [len(list(graph.neighbors(v))) for v in list(graph.nodes)]
    n_non_edges = n_nodes - 1 - np.array(n_neighbors)
    non_edges = [e for e in nx.non_edges(graph)]
    rnd = np.random.RandomState(seed=None)
    rnd_inx = rnd.choice(len(non_edges), nneg, replace=False)
    neg_edge_list = [non_edges[i] for i in rnd_inx]
    return neg_edge_list

In [355]:
training_pos_edges = graph.edges()
training_neg_edges = generate_negative_edges(graph)
print("len of training pos edges is " + str(len(training_pos_edges)) \
      + " and negative edges is " + str(len(training_neg_edges))) 
validation_pos_edges = validation_graph.edges()
validation_neg_edges = generate_negative_edges(validation_graph)
print("len of validation pos edges is " + str(len(validation_pos_edges)) + \
      " and negative edges is " + str(len(validation_neg_edges))) 


len of training pos edges is 28609 and negative edges is 28609
len of validation pos edges is 2388 and negative edges is 2388


In [405]:
# Create a matrix of the embedded node features for the training graph
embedded_nodes = []
for i in range(0,graph.number_of_nodes()):
    node_embedding = model.wv.get_vector(str(i))
    embedded_nodes.append(node_embedding)
embedded_matrix = np.vstack(embedded_nodes)

In [406]:
# returns a list of embeddings (Hadmard Product of the two nodes) for the given edges
def get_edge_embeddings(edge_list):
    embedded_edges = []
    for edge in edge_list:
        node1 = edge[0]
        node2 = edge[1]
        node1_embedded = embedded_matrix[node1]
        node2_embedded = embedded_matrix[node2]
        edge_embedded = np.multiply(node1_embedded, node2_embedded)
        embedded_edges.append(edge_embedded)
    embedded_edges = np.array(embedded_edges)
    return embedded_edges

In [407]:
# Creating the embedded list of training features
positive_embedded_edges = get_edge_embeddings(pos_edge_list)
negative_embedded_edges = get_edge_embeddings(neg_edge_list)
training_features = np.concatenate([positive_embedded_edges, negative_embedded_edges])

In [408]:
# Creating the label vector to train the data
listofones = [1] * len(pos_edge_list)
listofzeros = [0] * len(neg_edge_list)
training_labels = listofones + listofzeros
print(len(training_labels))

30279


In [409]:
# Creating the embedded list of test and validation features
test_embedded_edges = get_edge_embeddings(edge_tuples)
validation_pos_embedded_edges = get_edge_embeddings(validation_pos_edges)
validation_neg_embedded_edges = get_edge_embeddings(validation_neg_edges)
validation_features = np.concatenate([validation_pos_embedded_edges, validation_neg_embedded_edges])
listofones = [1] * len(validation_pos_edges)
listofzeros = [0] * len(validation_neg_edges)
validation_labels = listofones + listofzeros


In [410]:
# fitting the data to a logistic regression model using the embedded egde data
from sklearn import metrics, model_selection, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

logistic_model = LogisticRegression(random_state=0)
logistic_model.fit(training_features, training_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [411]:
# predicting the classes
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
test_preds = logistic_model.predict_proba(test_embedded_edges)[:, 1]
print(test_preds)
validation_preds = logistic_model.predict_proba(validation_features)[:, 1]
print(validation_preds)
auc = roc_auc_score(validation_labels, validation_preds)
print(auc)

[2.59808517e-01 4.20194443e-01 3.45666792e-04 ... 2.78864352e-02
 7.65025814e-01 4.43071768e-06]
[9.99797199e-01 9.99373078e-01 9.98438272e-01 ... 1.08951036e-04
 2.99701609e-05 7.17282451e-03]
0.957123522413856


In [403]:
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Id", "Predicted"])
    for i in range(0, len(scores)):
        prob = test_preds[i]
        wr.writerow([i+1,float(prob)])
                     
print("output file written")

output file written
