In [1]:
"""
Michael E. Ramsey
CSCI 5352
Date Created: 11/01/18
Last Edited: 12/3/18

This is a python script to analyze the citation network, hepPH, presented in CSCI 5352.
In this file, we compute features for each edge pair and a randomly sampled set of the 
non-edge pairs. 

The output of this script is a csv, containing several "features" of each edge and non-edge pair.
This .csv file will be used to create machine learning models for edge prediction.

You can get the data at the following link:
https://icon.colorado.edu/
"""

# Get necessary libraries
import sys
from os import listdir
from os.path import isfile, join
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import scipy
from scipy import linalg
from scipy.sparse import csr_matrix
from random import randint
import itertools

  from ._conv import register_converters as _register_converters


In [2]:
"""
Extract and load the data
"""

# Location of the edge table: directory prefix plus csv file name
filepath = "hepPH_Edges/"
filename = "hepPH_edges.csv"

# Read the csv and discard its 'Unnamed: 0' column in one chained step
data = pd.read_csv(filepath + filename).drop(columns = ['Unnamed: 0'])

In [3]:
"""
View the data
"""
# Preview the first five rows of the edge table as the cell's output
data.head(n=5)

Unnamed: 0,FromNodeID,ToNodeID
0,9903282,9211202
1,9601388,9211202
2,9905409,9211202
3,9702367,9211202
4,9912370,9211202


In [4]:
"""
Extract node and edge list
"""
# Extract the edge list; going through a set collapses duplicated rows
edge_list = list(set(zip(data['FromNodeID'], data['ToNodeID'])))

# Extract node list: every id that appears as a source or as a target
node_list = list(set(data['FromNodeID']) | set(data['ToNodeID']))

# Create list of not_edges - draw slightly more pairs than there are edges,
# scaled by the proportion of edges that exist in the network, because some
# of the random draws are discarded below
total_val = int(np.round(len(edge_list)*(1+len(edge_list)/(len(node_list)-1)**2)))

# Seeded generator so the sampled non-edges (and therefore the output csv) are
# reproducible from run to run; the seed matches the one used for the split.
not_edge_rng = np.random.RandomState(345)

# One row per candidate pair, each entry an index into node_list
# (the upper bound of RandomState.randint is exclusive)
sampled_idx = not_edge_rng.randint(0, len(node_list), size = (total_val, 2)).tolist()

# Self-pairs (u, u) are skipped: they are not meaningful negative examples
not_edges = [(node_list[i], node_list[j]) for i, j in sampled_idx if i != j]

# The network is built with nx.Graph (undirected), so a sampled (v, u) is an
# existing link whenever (u, v) is in the edge list; remove both orientations.
known_edges = set(edge_list) | {(v, u) for u, v in edge_list}
not_edges = list(set(not_edges) - known_edges)

# Create label vectors: 1 for every edge, 0 for every sampled non-edge
y_edges = np.ones((len(edge_list),1))
y_not_edges = np.zeros((len(not_edges),1))

In [5]:
"""
Perform train/test split
We use 30/70 (test_size = .7); no validation set is created in this cell
"""
# Edges and non-edges are split separately with identical settings:
# 70% of each goes to the test side, and the seed is fixed.
split_options = dict(test_size = .7, random_state = 345)

edge_train, edge_test, y_train, y_test = train_test_split(
    edge_list, y_edges, **split_options)
not_edge_train, not_edge_test, y_not_train, y_not_test = train_test_split(
    not_edges, y_not_edges, **split_options)

In [6]:
"""
Generate the network
"""
# Undirected graph holding every node, but only the training edges
G = nx.Graph()
G.add_nodes_from(node_list)
G.add_edges_from(edge_train)

# Compute clustering coefficient for each node (dict keyed by node id)
cluster_coeff = nx.clustering(G)

# Adjacency matrix together with its second and third powers
A = nx.adjacency_matrix(G)
Asquared = A @ A
Acubed = Asquared @ A


In [None]:
"""
Concatenate lists
"""
# Stack the pairs in a fixed order: train edges, train non-edges, test edges,
# test non-edges. `y` and `label` are built in the same order so that row i of
# every array below describes edge_list[i].
edge_list = edge_train + not_edge_train + edge_test + not_edge_test
y = np.concatenate((y_train, y_not_train, y_test, y_not_test))
label = ['Tr']*(len(edge_train)+len(not_edge_train)) + ['T']*(len(edge_test)+len(not_edge_test))

"""
Feature Construction
"""
# Initialize numpy arrays to store features (one row per pair)
n_pairs = len(edge_list)
#shortest_path = np.zeros((n_pairs,1))
common_neighbors = np.zeros((n_pairs,1))
pref_attach = np.zeros((n_pairs,1))
neighbor_sum = np.zeros((n_pairs,1))
local_cluster_sum = np.zeros((n_pairs,1))
local_cluster_prod = np.zeros((n_pairs,1))
jaccard_coeff = np.zeros((n_pairs,1))
adamic_adar = np.zeros((n_pairs,1))
sorensen = np.zeros((n_pairs,1))
cosine_sim = np.zeros((n_pairs,1))
hub_prom = np.zeros((n_pairs,1))
hub_depr = np.zeros((n_pairs,1))
lhn = np.zeros((n_pairs,1))
resource_all = np.zeros((n_pairs,1))
local_path001 = np.zeros((n_pairs,1))
local_path01 = np.zeros((n_pairs,1))
local_path1 = np.zeros((n_pairs,1))
#commute_time = np.zeros((n_pairs,1))
#cosine_sim_time = np.zeros((n_pairs,1))
#rooted_page001 = np.zeros((n_pairs,1))
#rooted_page01 = np.zeros((n_pairs,1))
#rooted_page1 = np.zeros((n_pairs,1))

# Position of every node in node_list, built once. The loop previously called
# node_list.index (a linear scan) twice per pair; node_list has no duplicates,
# so the dict returns exactly the same positions.
node_index = {node: position for position, node in enumerate(node_list)}

# Loop through edges to extract features
for edge, pair in enumerate(edge_list):
    node_u, node_v = pair

    # Neighbour counts of both endpoints, computed once and reused below
    deg_u = len(G[node_u])
    deg_v = len(G[node_v])

    # Shortest path
    #if nx.has_path(G, edge_list[edge][0], edge_list[edge][1]) == True:
    #    shortest_path[edge] = len(nx.shortest_path(G, edge_list[edge][0], edge_list[edge][1]))-1
    #else:
    #    shortest_path[edge] = 1000

    # Common neighbors (the list is reused for resource allocation)
    shared = list(nx.common_neighbors(G, node_u, node_v))
    common_neighbors[edge] = len(shared)

    # Preferential attachment
    pref_attach[edge] = deg_u*deg_v

    # Neighbor sum
    neighbor_sum[edge] = deg_u + deg_v

    # Jaccard coefficient
    temp = nx.jaccard_coefficient(G,[pair])
    jaccard_coeff[edge] = list(temp)[0][2]

    # NOTE: the ratios below divide numpy values, so a zero denominator (an
    # endpoint without neighbours in G) gives NaN/inf plus a RuntimeWarning
    # rather than an exception. That behaviour is kept on purpose; handle the
    # NaNs when the csv is consumed.

    # Sorensen Index
    sorensen[edge] = common_neighbors[edge]/neighbor_sum[edge]

    # Cosine Similarity
    cosine_sim[edge] = common_neighbors[edge]/np.sqrt(pref_attach[edge])

    # Hub Promoted
    hub_prom[edge] = common_neighbors[edge]/min(deg_u, deg_v)

    # Hub Depressed
    hub_depr[edge] = common_neighbors[edge]/max(deg_u, deg_v)

    # LHN
    lhn[edge] = common_neighbors[edge]/pref_attach[edge]

    # Adamic/Adar (1000 is the sentinel used when networkx divides by zero)
    temp = nx.adamic_adar_index(G,[pair])
    try: adamic_adar[edge] = list(temp)[0][2]
    except ZeroDivisionError: adamic_adar[edge] = 1000

    # Resource Allocation: sum of 1/degree over the common neighbors
    shared_degrees = dict(G.degree(shared))
    resource_all[edge] = sum(1/d for d in shared_degrees.values())

    # Clustering coefficient
    local_cluster_sum[edge] = cluster_coeff[node_u] + cluster_coeff[node_v]
    local_cluster_prod[edge] = cluster_coeff[node_u] * cluster_coeff[node_v]

    # Local Path: read each sparse entry once, then weight it three ways
    node1 = node_index[node_u]
    node2 = node_index[node_v]
    paths_len2 = Asquared[node1,node2]
    paths_len3 = Acubed[node1,node2]
    local_path001[edge] = paths_len2 + .001*paths_len3
    local_path01[edge] = paths_len2 + .01*paths_len3
    local_path1[edge] = paths_len2 + .1*paths_len3

    # Commute Time
    #temp = (Lstar[edge_list[edge][0]-1, edge_list[edge][0]-1] + Lstar[edge_list[edge][1]-1, edge_list[edge][1]-1] - 2*Lstar[edge_list[edge][0]-1, edge_list[edge][1]-1])
    #commute_time[edge] = len(edge_list)*temp

    # Cosine similarity
    #temp = np.sqrt(Lstar[edge_list[edge][0]-1, edge_list[edge][0]-1]*Lstar[edge_list[edge][1]-1, edge_list[edge][1]-1])
    #cosine_sim_time[edge] = Lstar[edge_list[edge][0]-1, edge_list[edge][1]-1] / temp

    # Rooted pagerank
    #rooted_page001[edge] = RPR001[edge_list[edge][0]-1,edge_list[edge][1]-1]
    #rooted_page01[edge] = RPR01[edge_list[edge][0]-1,edge_list[edge][1]-1]
    #rooted_page1[edge] = RPR1[edge_list[edge][0]-1,edge_list[edge][1]-1]

    ####### Print statement for tracking
    if edge%10000 == 0:
        print(str(np.round(edge/n_pairs*100,1)) + '%' , end = " ")

"""
Create data frame with all of the features
"""
# Initialize data frame to store features
feature_df = pd.DataFrame.from_records(edge_list, columns = ['Node_1', 'Node_2'])

# Create features for data frame. Column order is unchanged; jaccard_coeff was
# computed above but never stored, so it is appended after the existing
# features (just before 'edge' and 'label') to keep their positions stable.
feature_columns = [
    #('shortest_path', shortest_path),
    ('common_neighbors', common_neighbors),
    ('pref_attach', pref_attach),
    ('neighbor_sum', neighbor_sum),
    ('sorensen', sorensen),
    ('cosine_sim', cosine_sim),
    ('hub_prom', hub_prom),
    ('hub_depr', hub_depr),
    ('lhn', lhn),
    ('adamic_adar', adamic_adar),
    ('resource_all', resource_all),
    ('local_cluster_sum', local_cluster_sum),
    ('local_cluster_prod', local_cluster_prod),
    ('local_path001', local_path001),
    ('local_path01', local_path01),
    ('local_path1', local_path1),
    #('commute_time', commute_time),
    #('cosine_sim_time', cosine_sim_time),
    #('rooted_page001', rooted_page001),
    #('rooted_page01', rooted_page01),
    #('rooted_page1', rooted_page1),
    ('jaccard_coeff', jaccard_coeff),
    ('edge', y),
]
for column_name, values in feature_columns:
    # Arrays are (n_pairs, 1); flatten so each becomes a plain 1-D column
    feature_df[column_name] = values.ravel()
feature_df['label'] = label


0.0% 1.4% 2.9% 4.3% 5.8% 7.2% 8.6% 10.1% 11.5% 13.0% 14.4% 



15.8% 17.3% 18.7% 20.2% 21.6% 23.0% 24.5% 25.9% 27.4% 28.8% 30.2% 31.7% 33.1% 34.6% 36.0% 37.4% 38.9% 40.3% 41.8% 43.2% 44.6% 46.1% 47.5% 49.0% 50.4% 51.8% 53.3% 54.7% 56.2% 57.6% 59.0% 60.5% 61.9% 63.4% 64.8% 66.2% 67.7% 69.1% 70.6% 72.0% 73.4% 74.9% 76.3% 

In [None]:
"""
Save the feature data frame to a csv file
"""
# Write the feature table to disk; the file name is the stem plus '.csv'
output_stem = 'hepPH_features_30'
feature_df.to_csv(output_stem + '.csv')
