In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import csv, random, json
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder

In [3]:
# loading the training data into a list
with open('train.txt', 'r') as f:
    lines = f.readlines()

lines = [line.replace(' ', '\t') for line in lines]

with open('train.txt', 'w') as f:
    f.writelines(lines)

with open('train.txt') as f:
    reader = csv.reader(f, delimiter="\t")
    d = list(reader)

training_file = []
for row in d:
    training_file.append([ int(x) for x in row ])


In [4]:
# loading the test data into a list
test_file = pd.read_csv('test-public.csv')

In [5]:
# extracting the features of the nodes
with open('nodes.json', 'r') as f:
    features = json.load(f)

In [6]:
# turning the test data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for i,j in test_file.iterrows():
    node_list_1.append(j[1])
    node_list_2.append(j[2])

test_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})
edge_tuples = []
for index, row in test_edge_list.iterrows():
    edge_tuples.append((row['node_1'],row['node_2']))

In [7]:
# turning the training data into a pandas dataframe list of edges
node_list_1 = []
node_list_2 = []
for row in training_file:
    for connection in range(0,len(row)):
        if connection == 0:
            continue
        else:
            node_list_1.append(row[0])
            node_list_2.append(row[connection])

training_edge_list = pd.DataFrame({'node_1': node_list_1, 'node_2': node_list_2})


In [8]:
# splitting up the list of positive edges in the graph
from sklearn.model_selection import train_test_split
y = np.ones(len(training_edge_list))
X_train, X_test, y_train, y_test = train_test_split(training_edge_list, y, test_size=0.20)
print(len(training_edge_list))
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

53872
43097
43097
10775
10775


In [9]:
# creating a graph from the list of edges in the training data
def create_graph(edge_list):
    graph_edges = nx.from_pandas_edgelist(edge_list, "node_1", "node_2")
    graph = nx.path_graph(4085)
    graph.add_edges_from(graph_edges.edges())
    n = graph.number_of_nodes()
    m = graph.number_of_edges()
    print("Number of nodes :", str(n))
    print("Number of edges :", str(m))
    print("Number of connected components :" + str(nx.number_connected_components(graph)))
    return graph

In [10]:
print("Training graph description: ")
graph = create_graph(X_train)
print("-------------------------------")

print("Validation graph description: ")
validation_graph = create_graph(X_test)
validation_graph.remove_edges_from(graph.edges())

Training graph description: 
Number of nodes : 4085
Number of edges : 29940
Number of connected components :1
-------------------------------
Validation graph description: 
Number of nodes : 4085
Number of edges : 13794
Number of connected components :1


In [11]:
# adding features to the nodes
def add_features(graph):
    for entry in features:
        id = entry['id']
        for key in entry:
            graph.nodes[id][key] = entry[key]
add_features(graph)
add_features(validation_graph)

In [12]:
def find_first_and_last(node):
    first = node["first"]  
    last = node["last"]
    return first, last

In [13]:
# extracts data from the nodes such as first paper, last paper, and the difference between them
def extract_activity_data(graph, node1, node2):
    node1 = graph.nodes[node1]
    node2 = graph.nodes[node2]
    F1, L1 = find_first_and_last(node1)
    F2, L2 = find_first_and_last(node2)
    num_papers1 = node1["num_papers"] 
    num_papers2 = node2["num_papers"] 
    years_active_1 = (F1 - L1) + 1
    years_active_2 = (F2 - L2) + 1
    
    vector = [abs(F1-F2), abs(L1-L2), num_papers1+num_papers2, years_active_1+years_active_2]
    return vector

In [14]:
# finds the number of venue and keyword entries the given node has
def total_venues_and_keywords(node_feature_dictionary):
    total_venues = 0
    total_keywords = 0
    for attribute in node_feature_dictionary:
        if attribute[0:5] == "venue":
            total_venues += 1
        if attribute[0:7] == "keyword":
            total_keywords += 1
        
    return total_venues, total_keywords

In [15]:
# finds the number of venues and keywords the two nodes have in common
def shared_venues_and_keywords(node1, node2):
    total_venues = 0
    total_keywords = 0
    for attribute in node1:
        if attribute in node2:
            if attribute[0:5] == "venue":
                total_venues += 1
            if attribute[0:7] == "keyword":
                total_keywords += 1
    return total_venues, total_keywords
            

In [16]:
# finds the number of papers the author writes per year active
def find_efficacy(node1, node2):
    num_papers1 = node1["num_papers"] 
    num_papers2 = node2["num_papers"] 
    F1, L1 = find_first_and_last(node1)
    F2, L2 = find_first_and_last(node2)
    years_active_1 = (F1 - L1) + 1
    years_active_2 = (F2 - L2) + 1
    efficacy1 = num_papers1 / (years_active_1+1)
    efficacy2 = num_papers2 / (years_active_2+1)
    return efficacy1 + efficacy2
    
    

In [17]:
# finds the ratio of venue to num. papers and keywords to num. papers
def find_ratios(node1, node2):
    total_venues1, total_keywords1 = total_venues_and_keywords(node1)
    total_venues2, total_keywords2 = total_venues_and_keywords(node2)
    num_papers1 = node1["num_papers"] 
    num_papers2 = node2["num_papers"] 
    
    venue_ratio_1 = total_venues1 / num_papers1
    venue_ratio_2 = total_venues2 / num_papers2
    
    keyword_ratio_1 = total_keywords1 / num_papers1
    keyword_ratio_2 = total_keywords2 / num_papers2
    
    return venue_ratio_1 + venue_ratio_2, keyword_ratio_1 + keyword_ratio_2

In [18]:
# computes the similarity of the node features from features.json
# feature vector contains [combo degree, last paper, years active, # of shared keywords, # of shared venues, # of papers]
def calculate_node_similarity(graph, node1, node2):
    activity_vector = extract_activity_data(graph, node1, node2)
    max_triangle = max(nx.triangles(graph,node1) , nx.triangles(graph,node2))
    min_coeff = min(nx.clustering(graph,node1) , nx.clustering(graph,node2))
    
    node1 = graph.nodes[node1]
    node2 = graph.nodes[node2]    
    shared_venues, shared_keywords = shared_venues_and_keywords(node1, node2)
    efficacy = find_efficacy(node1,node2)    
    
    feature_vector = [shared_venues, shared_keywords, min_coeff, max_triangle, efficacy]
    return activity_vector + feature_vector

v = calculate_node_similarity(graph, 0, 87)
print(v)


[3, 0, 11, 7, 1, 10, 0, 1]


In [19]:
# generating a Node2Vec model to learn the node embeddings
node2vec = Node2Vec(graph, dimensions=16, walk_length=80, num_walks=10, workers=4, p = 5, q = 1)
model = node2vec.fit(window=6, min_count=1, batch_words=4)

Computing transition probabilities: 100%|██████████| 4085/4085 [00:10<00:00, 388.83it/s]


In [101]:
# importing all of the gem modules
from gem.embedding.hope     import HOPE
from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding

In [157]:
# learning the embeddings uing the laplacian eigenmaps algorithm
le = LaplacianEigenmaps(d=24)
le_embeddings, t = le.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

Laplacian matrix recon. error (low rank): 66.740138


In [158]:
# learning the embeddings uing the local linear embedding algorithm
lli = LocallyLinearEmbedding(d=2)
lli_embeddings, t = le.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

Laplacian matrix recon. error (low rank): 66.740138


In [159]:
# generating a HOPE model to learn node embeddings
hope = HOPE(d=64, beta = .002)
hope_graph_embeddings, t = hope.learn_embedding(graph=graph, edge_f=None, is_weighted=False, no_python=True)

SVD error (low rank): 0.415458


In [20]:
# generating the similarities between each node in the graph through their embeddings
# take the dot product of the two vectors
# length of the vector = 1
# orthogonal = 0
scores = []
def get_cosine_similarity(edge):
    node1 = edge[0]
    node2 = edge[1]
    node1_vector = model.wv.get_vector(str(node1))
    node2_vector = model.wv.get_vector(str(node2))
    cos_sim = np.dot(node1_vector, node2_vector)/(np.linalg.norm(node1_vector)*np.linalg.norm(node2_vector))
    probability = max(0.0,cos_sim)
    return min(1.0,probability)

In [21]:
def generate_negative_edges(g):
    n_edges = g.number_of_edges()
    n_nodes = g.number_of_nodes()
    non_edges = [e for e in nx.non_edges(g)]
    nneg = int(n_edges * 1.5)
    n_neighbors = [len(list(g.neighbors(v))) for v in list(g.nodes)]
    n_non_edges = n_nodes - 1 - np.array(n_neighbors)
    rnd = np.random.RandomState(seed=None)
    rnd_inx = rnd.choice(len(non_edges), nneg, replace=False)
    neg_edge_list = [non_edges[i] for i in rnd_inx]
    print("Len before pruning: " + str(len(neg_edge_list)))
    
    for edge in neg_edge_list:
        if get_cosine_similarity(edge) > .12:
            neg_edge_list.remove(edge)
    print("Len after pruning: " + str(len(neg_edge_list)))
    return neg_edge_list

In [22]:
training_pos_edges = graph.edges()
training_neg_edges = generate_negative_edges(graph)

print("len of training pos edges is " + str(len(training_pos_edges)) \
      + " and negative edges is " + str(len(training_neg_edges))) 
validation_pos_edges = validation_graph.edges()
validation_neg_edges = generate_negative_edges(validation_graph)
print("len of validation pos edges is " + str(len(validation_pos_edges)) + \
      " and negative edges is " + str(len(validation_neg_edges))) 


Len before pruning: 44910
Len after pruning: 33607
len of training pos edges is 29940 and negative edges is 33607
Len before pruning: 1585
Len after pruning: 1175
len of validation pos edges is 1057 and negative edges is 1175


In [23]:
def make_local_features(graph, edge_list):
    predictions = []
    predictions.append(nx.jaccard_coefficient(graph, edge_list))
    predictions.append(nx.adamic_adar_index(graph, edge_list))

    scores_local = {}
    for prediction_type in predictions:
        for u, v, p in prediction_type:
            if (u,v) in scores_local:
                data = scores_local[(u,v)]
                data = data + [p]
                scores_local[(u,v)] = data
            else:
                scores_local[(u,v)] = [p]

    for edge in edge_list:
        node1 = edge[0]
        node2 = edge[1]
        features_scores = calculate_node_similarity(graph,node1,node2)
        data = scores_local[(node1,node2)]
        data = data + features_scores
        scores_local[(node1,node2)] = data

    return scores_local

    
features_dict = make_local_features(graph, training_pos_edges)
local_features_pos_train = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_neg_edges)
local_features_neg_train = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_pos_edges)
local_features_pos_validation = [x for x in features_dict.values()]

features_dict = make_local_features(graph, training_neg_edges)
local_features_neg_validation = [x for x in features_dict.values()]

features_dict = make_local_features(graph, edge_tuples)
test_embedded_edges = [x for x in features_dict.values()]

In [24]:
pos_labels = np.ones(len(local_features_pos_train))
neg_labels = np.zeros(len(local_features_neg_train))
training_labels = np.concatenate([pos_labels, neg_labels])
training_features = np.concatenate([local_features_pos_train, local_features_neg_train])
print("Features: " + str(len(training_features)))
print("Labels: " + str(len(training_labels)))

pos_labels = np.ones(len(local_features_pos_validation))
neg_labels = np.zeros(len(local_features_neg_validation))
validation_labels = np.concatenate([pos_labels, neg_labels])
validation_features = np.concatenate([local_features_pos_validation, local_features_neg_validation])

Features: 63547
Labels: 63547


In [25]:
print(training_features[5])

[ 0.125       0.86685447 72.          0.20168067  0.          0.
 21.         18.          3.          8.          0.25757576 17.        ]


In [26]:
# fitting the data to a logistic regression model using the embedded egde data
from sklearn import metrics, model_selection, pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

logistic_model = LogisticRegression(random_state=0)
logistic_model.fit(training_features, training_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
from sklearn.neural_network import MLPClassifier

neural_net = MLPClassifier(hidden_layer_sizes=(64,64),max_iter=350, activation = 'logistic')
neural_net.fit(training_features + validation_features, training_labels + validation_labels)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(64, 64), learning_rate='constant',
              learning_rate_init=0.001, max_iter=250, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [34]:
# predicting the classes using the neural net
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

test_preds = neural_net.predict_proba(test_embedded_edges)[:, 1]
validation_preds = neural_net.predict_proba(validation_features)[:, 1]
auc = roc_auc_score(validation_labels, validation_preds)
print(auc)

0.9241400367511787


In [32]:
# predicting the classes using the logistic model

test_preds = logistic_model.predict_proba(test_embedded_edges)[:, 1]
validation_preds = logistic_model.predict_proba(validation_features)[:, 1]
auc = roc_auc_score(validation_labels, validation_preds)
print(logistic_model.coef_)
print(auc)

[[ 6.98995661e-01  6.45852267e+00 -6.43635665e-04  1.07283734e+00
  -4.88829987e-02 -2.77073584e-02 -2.18337398e-03 -1.95062283e-02
   2.87594510e-01  2.12071536e-02 -6.45458095e-01 -6.56016389e-05]]
0.9249893782864327


In [35]:
# writing the prediction probs to the output file
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(["Id", "Predicted"])
    for i in range(0, len(test_preds)):
        prob = test_preds[i]
        wr.writerow([i+1,float(prob)])
                     
print("output file written")

output file written
