In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import csv, random, json
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder


In [2]:
import sys
sys.version

'3.7.4 (v3.7.4:e09359112e, Jul  8 2019, 14:54:52) \n[Clang 6.0 (clang-600.0.57)]'

In [3]:
# loading the training data into a list
with open('train.txt', 'r') as f:
    lines = f.readlines()

lines = [line.replace(' ', '\t') for line in lines]

with open('train.txt', 'w') as f:
    f.writelines(lines)

with open('train.txt') as f:
    reader = csv.reader(f, delimiter="\t")
    d = list(reader)

training_file = []
for row in d:
    training_file.append([ int(x) for x in row ])


In [4]:
# loading the test data into a list
test_file = pd.read_csv('test-public.csv')

In [5]:
# extracting the features of the nodes
with open('nodes.json', 'r') as f:
    features = json.load(f)

In [6]:
# turning the test data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for i,j in test_file.iterrows():
    node_list_1.append(j[1])
    node_list_2.append(j[2])

test_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
edge_tuples = []
for index, row in test_edge_list.iterrows():
    edge_tuples.append((row['node_1'],row['node_2']))

In [7]:
# turning the training data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for row in training_file:
    for connection in range(0,len(row)):
        if connection == 0:
            continue
        else:
            node_list_1.append(row[0])
            node_list_2.append(row[connection])

training_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})


In [8]:
# splitting up the list of positive edges in the graph
from sklearn.model_selection import train_test_split
y = np.ones(len(training_edge_list))
X_train, X_test, y_train, y_test = train_test_split(training_edge_list, y, test_size=0.20)
#X_train = training_edge_list
#y_train = np.ones(len(training_edge_list))
print(len(training_edge_list))
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

53872
43097
43097
10775
10775


In [9]:
# creating a graph from the list of edges in the training data
def create_graph(edge_list):
    graph_edges = nx.from_pandas_edgelist(edge_list, "node_1", "node_2")
    graph = nx.path_graph(4085)
    graph.add_edges_from(graph_edges.edges())
    n = graph.number_of_nodes()
    m = graph.number_of_edges()
    print("Number of nodes :", str(n))
    print("Number of edges :", str(m))
    print("Number of connected components :" + str(nx.number_connected_components(graph)))
    return graph

In [10]:
print("Training graph description: ")
graph = create_graph(X_train)
print("-------------------------------")

print("Validation graph description: ")
validation_graph = create_graph(X_test)
validation_graph.remove_edges_from(graph.edges())

Training graph description: 
Number of nodes : 4085
Number of edges : 29905
Number of connected components :1
-------------------------------
Validation graph description: 
Number of nodes : 4085
Number of edges : 13762
Number of connected components :1


In [11]:
# adding features to the nodes
def add_features(graph):
    for entry in features:
        id = entry['id']
        for key in entry:
            graph.nodes[id][key] = entry[key]
add_features(graph)
add_features(validation_graph)

In [12]:
# computes the similarity of the node features from features.json
# feature vector contains [combo degree, last paper, years active, # of shared keywords, # of shared venues, # of papers]
def calculate_node_similarity(graph, node1, node2):
    node1 = graph.nodes[node1]
    node2 = graph.nodes[node2]
    feature_vector = {}
    for attribute in node1:
        if attribute in node2:
            if attribute[0:7] == "keyword":
                feature_vector["keyword"] += 1
            if attribute[0:5] == "venue":
                feature_vector["venue"] += 1
            
    return feature_vector


In [20]:
# extracts data from the nodes such as first paper, last paper, and the difference between them
def extract_activity_data(graph, node1, node2):
    node1 = graph.nodes[node1]
    node2 = graph.nodes[node2]
    F1 = node1["first"]  
    F2 = node2["first"]
    L1 = node1["last"]    
    L2 = node2["last"]
    years_active_1 = L1 - F1
    years_active_2 = L2 - F2
    

extract_activity_data(graph, 0, 10)

<class 'int'>


In [13]:
# generating a Node2Vec model to learn the node embeddings
node2vec = Node2Vec(graph, dimensions=12, walk_length=80, num_walks=10, workers=4, p = 5, q = 1)
model = node2vec.fit(window=6, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 4085/4085 [00:09<00:00, 416.15it/s]


In [76]:
# generating the edge embeddings of the node2vec model
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)
edges_kv = edges_embs.as_keyed_vectors()

In [156]:
# importing all of the gem modules
from gem.embedding.gf       import GraphFactorization
from gem.embedding.hope     import HOPE
from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding
from gem.embedding.node2vec import node2vec
#from gem.embedding.sdne     import SDNE

In [157]:
# learning the embeddings uing the laplacian eigenmaps algorithm
le = LaplacianEigenmaps(d=24)
le_embeddings, t = le.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

Laplacian matrix recon. error (low rank): 66.740138


In [158]:
# learning the embeddings uing the local linear embedding algorithm
lli = LocallyLinearEmbedding(d=2)
lli_embeddings, t = le.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

Laplacian matrix recon. error (low rank): 66.740138


In [159]:
# generating a HOPE model to learn node embeddings
hope = HOPE(d=64, beta = .002)
hope_graph_embeddings, t = hope.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

SVD error (low rank): 0.415458


In [96]:
# generating a GF model to learn node embeddings
embedding = GraphFactorization(d=2, max_iter=100000, eta=1*10**-4, regu=1.0, data_set='author')
#embedding.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

In [135]:
#Generating a SDNE model to learn node embeddings
import keras
import theano
sdne = SDNE(d=2, beta=5, alpha=1e-5, nu1=1e-6, nu2=1e-6, K=3, n_units=[50, 15,], n_iter=50, xeta=0.01, n_batch=500)
#sdne_graph_embeddings, t = sdne.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

In [16]:
# generating the similarities between each node in the graph through their embeddings
# take the dot product of the two vectors
# length of the vector = 1
# orthogonal = 0
scores = []
def get_cosine_similarity(edge):
    node1 = edge[0]
    node2 = edge[1]
    node1_vector = model.wv.get_vector(str(node1))
    node2_vector = model.wv.get_vector(str(node2))
    cos_sim = np.dot(node1_vector, node2_vector)/(np.linalg.norm(node1_vector)*np.linalg.norm(node2_vector))
    probability = max(0.0,cos_sim)
    return min(1.0,probability)

In [14]:
def generate_negative_edges(g):
    n_edges = g.number_of_edges()
    n_nodes = g.number_of_nodes()
    non_edges = [e for e in nx.non_edges(g)]
    nneg = int(n_edges * 2)
    print(nneg)
    n_neighbors = [len(list(g.neighbors(v))) for v in list(g.nodes)]
    n_non_edges = n_nodes - 1 - np.array(n_neighbors)
    rnd = np.random.RandomState(seed=None)
    rnd_inx = rnd.choice(len(non_edges), nneg, replace=False)
    neg_edge_list = [non_edges[i] for i in rnd_inx]
    print("Len before pruning: " + str(len(neg_edge_list)))
    
    for edge in neg_edge_list:
        if get_cosine_similarity(edge) > .20:
            neg_edge_list.remove(edge)
    print("Len after pruning: " + str(len(neg_edge_list)))
    return neg_edge_list

In [17]:
training_pos_edges = graph.edges()
training_neg_edges = generate_negative_edges(graph)

print("len of training pos edges is " + str(len(training_pos_edges)) \
      + " and negative edges is " + str(len(training_neg_edges))) 
validation_pos_edges = validation_graph.edges()
validation_neg_edges = generate_negative_edges(validation_graph)
print("len of validation pos edges is " + str(len(validation_pos_edges)) + \
      " and negative edges is " + str(len(validation_neg_edges))) 


59810
Len before pruning: 59810
Len after pruning: 46932
len of training pos edges is 29905 and negative edges is 46932
2184
Len before pruning: 2184
Len after pruning: 1730
len of validation pos edges is 1092 and negative edges is 1730


In [147]:
# predicting the scores using the jaccard_coefficient
def make_local_features(graph, edge_list):
    predictions = []
    predictions.append(nx.jaccard_coefficient(graph, edge_list))
    predictions.append(nx.adamic_adar_index(graph, edge_list))
    predictions.append(nx.preferential_attachment(graph, edge_list))
    predictions.append(nx.resource_allocation_index(graph, edge_list))

    scores_local = {}
    for prediction_type in predictions:
        for u, v, p in prediction_type:
            if (u,v) in scores_local:
                data = list(scores_local.get((u,v), []))
                data = [*data, p]
                scores_local[(u,v)] = data
            else:
                scores_local[(u,v)] = [p]
    for edge in edge_list:
        node1 = edge[0]
        node2 = edge[1]
        node1_degree = graph.degree[node1]
        node2_degree = graph.degree[node2]
        total_degree = node1_degree + node2_degree
        fatures_score = calculate_node_similarity(node1, node2)
        
        data = list(scores_local.get((node1,node2), []))
        data = [*data, total_degree, fatures_score]
        scores_local[(node1,node2)] = data
        
                
    return scores_local
    
        
features_dict = make_local_features(graph, training_pos_edges)
local_features_pos_train = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_neg_edges)
local_features_neg_train = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_pos_edges)
local_features_pos_validation = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_neg_edges)
local_features_neg_validation = [x for x in features_dict.values()]

features_dict = make_local_features(graph, edge_tuples)
test_embedded_edges = [x for x in features_dict.values()]

In [148]:
pos_labels = np.ones(len(local_features_pos_train))
neg_labels = np.zeros(len(local_features_neg_train))
training_labels = np.concatenate([pos_labels, neg_labels])
training_features = np.concatenate([local_features_pos_train, local_features_neg_train])
print("Features: " + str(len(training_features)))
print("Labels: " + str(len(training_labels)))

pos_labels = np.ones(len(local_features_pos_validation))
neg_labels = np.zeros(len(local_features_neg_validation))
validation_labels = np.concatenate([pos_labels, neg_labels])
validation_features = np.concatenate([local_features_pos_validation, local_features_neg_validation])

Features: 108511
Labels: 108511


In [165]:
# Create a matrix of the embedded node features for the training graph
embedded_nodes = []
for i in range(0,graph.number_of_nodes()):
    node_embedding = model.wv.get_vector(str(i))
    #node_embedding = hope_graph_embeddings[i]
    embedded_nodes.append(node_embedding)
embedded_matrix = np.vstack(embedded_nodes)

In [166]:
# returns a list of embeddings (Hadmard Product of the two nodes) for the given edges
def get_edge_embeddings(edge_list):
    embedded_edges = []
    for edge in edge_list:
        node1 = edge[0]
        node2 = edge[1]
        node1_embedded = embedded_matrix[node1]
        node2_embedded = embedded_matrix[node2]
        edge_embedded = np.multiply(node1_embedded, node2_embedded)
        embedded_edges.append(edge_embedded)
    embedded_edges = np.array(embedded_edges)
    return embedded_edges

In [167]:
# Creating the embedded list of training features
positive_embedded_edges = get_edge_embeddings(training_pos_edges)
negative_embedded_edges = get_edge_embeddings(training_neg_edges)
training_features = np.concatenate([positive_embedded_edges, negative_embedded_edges])
print(len(training_features))

78855


In [168]:
# Creating the label vector to train the data
listofones = [1] * len(positive_embedded_edges)
listofzeros = [0] * len(negative_embedded_edges)
training_labels = listofones + listofzeros
print(len(training_labels))

78855


In [169]:
# Creating the embedded list of test and validation features
test_embedded_edges = get_edge_embeddings(edge_tuples)
validation_pos_embedded_edges = get_edge_embeddings(validation_pos_edges)
validation_neg_embedded_edges = get_edge_embeddings(validation_neg_edges)
validation_features = np.concatenate([validation_pos_embedded_edges, validation_neg_embedded_edges])
listofones = [1] * len(validation_pos_edges)
listofzeros = [0] * len(validation_neg_edges)
validation_labels = listofones + listofzeros

In [170]:
# fitting the data to a logistic regression model using the embedded egde data
from sklearn import metrics, model_selection, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

logistic_model = LogisticRegression(random_state=0)
logistic_model.fit(training_features, training_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [171]:
# fitting the data to a random forest model using the embedded egde data
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators=70, bootstrap = True,max_features = 'sqrt')
random_forest_model.fit(training_features, training_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=70,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [172]:
from sklearn.neural_network import MLPClassifier
neural_net = MLPClassifier(hidden_layer_sizes=(8,8,8,8),max_iter=400, activation = 'logistic')
neural_net.fit(training_features,training_labels)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8, 8, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_iter=400, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [173]:
test_preds = neural_net.predict_proba(test_embedded_edges)[:, 1]
print(test_preds)
validation_preds = neural_net.predict_proba(validation_features)[:, 1]
print(validation_preds)
auc = roc_auc_score(validation_labels, validation_preds)
print(auc)

[0.87672083 0.93356577 0.00276022 ... 0.04733783 0.97868531 0.00294644]
[0.62240186 0.95107441 0.99462699 ... 0.00338865 0.00282997 0.00261909]
0.9644288279731074


In [174]:
# predicting the classes
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

test_preds = logistic_model.predict_proba(test_embedded_edges)[:, 1]
print(test_preds)
validation_preds = logistic_model.predict_proba(validation_features)[:, 1]
print(validation_preds)
auc = roc_auc_score(validation_labels, validation_preds)
print(auc)

[7.86742629e-01 9.28613170e-01 2.93056902e-03 ... 2.04666761e-01
 9.91889841e-01 1.30297519e-04]
[0.20422338 0.98074781 0.9999817  ... 0.00150234 0.00683831 0.00121563]
0.9628482168034551


In [175]:
test_preds = random_forest_model.predict_proba(test_embedded_edges)[:, 1]
print(test_preds)
validation_preds = random_forest_model.predict_proba(validation_features)[:, 1]
print(validation_preds)
auc = roc_auc_score(validation_labels, validation_preds)
print(auc)

[0.52857143 0.62857143 0.         ... 0.21428571 0.75714286 0.01428571]
[0.35714286 0.74285714 0.72857143 ... 0.02857143 0.         0.        ]
0.9588883405808931


In [139]:
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Id", "Predicted"])
    for i in range(0, len(test_preds)):
        prob = test_preds[i]
        wr.writerow([i+1,float(prob)])
                     
print("output file written")

output file written
