In [1]:
"""
Michael E. Ramsey
CSCI 5352
Date Created: 11/01/18
Last Edited: 12/3/18

This is a python script to analyze the enron email network presented in CSCI 5352.
In this file, we compute features for each edge pair and a randomly sampled set of the 
non-edge pairs. 

The output of this script is a CSV file containing several "features" of each edge and non-edge pair.
This .csv file will be used to create machine learning models for edge prediction.
"""

# Get necessary libraries
import sys
from os import listdir
from os.path import isfile, join
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import scipy
from scipy import linalg
from scipy.sparse import csr_matrix
from random import randint
import itertools

  from ._conv import register_converters as _register_converters


In [2]:
"""
Extract all filenames for the enron pairs
"""

# Directory that holds the edge-list files (one file per network)
filepath = "Enron_Networks-Months/"

# Keep regular files only (sub-directories are skipped).
# listdir() returns entries in arbitrary, platform-dependent order, but the
# data-frame construction further down assigns month/year by list position,
# so the names are sorted to make that order deterministic across machines.
# NOTE(review): this assumes the file names sort lexicographically into
# chronological order -- TODO confirm against the actual file names.
files = sorted(f for f in listdir(filepath) if isfile(join(filepath, f)))
len(files)

40

In [3]:
"""
Extract the edges for all networks
"""
# network_list[k] holds the edges of the k-th file as a list of integer tuples
network_list = []
for filename in files:

    edge_list = []
    # The context manager closes the file even when a line fails to parse
    with open(filepath + filename) as edge_file:
        for line in edge_file:
            # split() without arguments splits on any run of whitespace,
            # so tab- and space-separated files are both handled
            fields = line.split()
            # A blank line would otherwise become an empty tuple, which
            # cannot be loaded into the two-column edge frame later on
            if not fields:
                continue
            edge_list.append(tuple(map(int, fields)))
    network_list.append(edge_list)
        

In [4]:
"""
Construct data frame for edges
"""
# Column layout of the combined edge table
EDGE_COLUMNS = ['Node_1', 'Node_2', 'Month', 'Year', 'Counter']

# The first network is stamped May 1999; every following network is stamped
# with the next calendar month. Counter is the 1-based position in network_list.
month = 5
year = 1999
counter = 1

# Collect one frame per network and concatenate once at the end.
# DataFrame.append inside the loop re-copied the whole table on every
# iteration and is no longer available in pandas >= 2.0.
monthly_frames = []
for network in network_list:
    temp = pd.DataFrame.from_records(network, columns = ['Node_1', 'Node_2'])
    temp['Month'] = month
    temp['Year'] = year
    temp['Counter'] = counter
    monthly_frames.append(temp)

    # Advance the calendar, rolling over from December to January
    month += 1
    counter += 1
    if month == 13:
        month = 1
        year += 1

if monthly_frames:
    # ignore_index gives every row a unique label instead of repeating 0..n per month
    enron_df = pd.concat(monthly_frames, ignore_index = True)
else:
    # pd.concat raises on an empty list; fall back to an empty, correctly shaped table
    enron_df = pd.DataFrame(columns = EDGE_COLUMNS)

In [5]:
"""
Chronological split at November 2002: earlier months are meant for training, November 2002 onward for testing
"""
# First month (inclusive) of the test period
TEST_START_YEAR = 2002
TEST_START_MONTH = 11

# Number of nodes in the network; node ids run from 1 to N_NODES
N_NODES = 150

# Seed for the non-edge sampling so that the generated CSV is reproducible
SEED = 5352

# Label training data and testing data.
# (Year, Month) is compared chronologically. Testing Month on its own, as in
# "(Year <= 2002) & (Month < 11)", leaves November/December of every earlier
# year without a label, which silently drops those rows from both sets.
in_test = (enron_df['Year'] > TEST_START_YEAR) | (
    (enron_df['Year'] == TEST_START_YEAR) & (enron_df['Month'] >= TEST_START_MONTH))
enron_df['label'] = np.where(in_test, 'T', 'Tr')

# Extract training and testing data
train = enron_df.loc[enron_df['label'] == 'Tr']
test = enron_df.loc[enron_df['label'] == 'T']

# Distinct (Node_1, Node_2) pairs; sorted so the row order is stable between runs.
# edge_list (all months) is reassigned in the concatenation cell further down.
edge_list = sorted(set(zip(enron_df['Node_1'], enron_df['Node_2'])))
edge_train = sorted(set(zip(train['Node_1'], train['Node_2'])))
# Test edges are only the pairs that do not already occur in the training period
edge_test = sorted(set(zip(test['Node_1'], test['Node_2'])) - set(edge_train))

# Get the list of nodes
node_list = list(range(1, N_NODES + 1))

rng = np.random.RandomState(SEED)

def _sample_pairs(n_draws):
    """Draw n_draws ordered node pairs uniformly with replacement.

    Returns the distinct pairs as a set of (int, int) tuples. A node paired
    with itself is discarded because it is not a candidate edge.
    """
    # RandomState.randint excludes the upper bound, hence the + 1
    draws = rng.randint(1, len(node_list) + 1, size=(n_draws, 2))
    return {(a, b) for a, b in draws.tolist() if a != b}

# Create list of not_edges - the number of draws is inflated in proportion to
# the number of edges, since draws that hit a known edge are discarded.
# NOTE(review): pairs are compared as ordered tuples, while the graph built
# from edge_train is undirected, so (b, a) can be kept as a non-edge when only
# (a, b) is listed as an edge -- confirm whether the files list both directions.
known_edges = set(edge_train) | set(edge_test)
total_val = int(np.round(len(edge_train)*(1+len(edge_train)/(len(node_list)-1)**2)))
not_edge_train = sorted(_sample_pairs(total_val) - known_edges)
total_val2 = int(np.round(len(edge_test)*(1+len(edge_test)/(len(node_list)-1)**2)))
# Test non-edges must also be disjoint from the training non-edges
not_edge_test = sorted(_sample_pairs(total_val2) - known_edges - set(not_edge_train))

# Create label vectors: 1 for an edge, 0 for a non-edge
y_train = np.ones((len(edge_train),1))
y_test = np.ones((len(edge_test),1))
y_not_train = np.zeros((len(not_edge_train),1))
y_not_test = np.zeros((len(not_edge_test),1))

In [6]:
"""
Concatenate lists
"""
# Row counts of the two partitions (edges plus sampled non-edges)
n_train_rows = len(edge_train) + len(not_edge_train)
n_test_rows = len(edge_test) + len(not_edge_test)

# Fixed stacking order: train edges, train non-edges, test edges, test non-edges.
# The pair list, the 0/1 target and the partition tag all follow that order.
edge_list = [*edge_train, *not_edge_train, *edge_test, *not_edge_test]
y = np.concatenate((y_train, y_not_train, y_test, y_not_test))
label = ['Tr'] * n_train_rows + ['T'] * n_test_rows

In [7]:
"""
Generate the network
"""
# Undirected graph over all nodes, containing the training edges only
G = nx.Graph()
G.add_nodes_from(node_list)
G.add_edges_from(edge_train)

# Compute clustering coefficient for each node (dict keyed by node id)
cluster_coeff = nx.clustering(G)

# Construct the adjacency matrix and its second and third powers.
# nodelist pins row/column k to node_list[k]; the feature loop looks up
# entries with (node - 1), so the ordering must not depend on the order in
# which nodes happened to be inserted into G.
A = nx.adjacency_matrix(G, nodelist=node_list)
Asquared = A.dot(A)
Acubed = Asquared.dot(A)

# --- Disabled: inputs for commute-time and rooted-PageRank features ---

# Construct row sum diagonal matrix
#D = (A.todense().sum(axis = 1)).flatten()
#D = csr_matrix(np.diag(np.array(D)[0]))
#Dinv = np.diag(1/np.diag(D.todense()))

# Compute pseudo-inverse
#L = D-A
#Lstar = np.linalg.pinv(L.todense())

# Compute rooted pagerank
#RPR001 = (1-.001)*np.linalg.inv(np.identity(A.shape[0]) - .001*(Dinv.dot(A.todense())))
#RPR01 = (1-.01)*np.linalg.inv(np.identity(A.shape[0]) - .01*(Dinv.dot(A.todense())))
#RPR1 = (1-.1)*np.linalg.inv(np.identity(A.shape[0]) - .1*(Dinv.dot(A.todense())))


In [8]:
"""
Feature Construction
"""
# Every feature is stored in its own (len(edge_list), 1) array; row k belongs
# to the pair edge_list[k]. All features are computed on the training graph G.
n_pairs = len(edge_list)

def _feature_column():
    """Return a fresh zero-filled (n_pairs, 1) array for one feature."""
    return np.zeros((n_pairs, 1))

def _ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero.

    Pairs that involve a node without neighbors used to produce 0/0 = NaN
    (plus a RuntimeWarning); such a pair has no common neighbors, so 0.0 is
    used instead.
    """
    return numerator / denominator if denominator else 0.0

# Initialize numpy arrays to store features
shortest_path = _feature_column()
common_neighbors = _feature_column()
pref_attach = _feature_column()
neighbor_sum = _feature_column()
local_cluster_sum = _feature_column()
local_cluster_prod = _feature_column()
jaccard_coeff = _feature_column()
adamic_adar = _feature_column()
# NOTE(review): the same_* arrays are never filled in the loop below and stay
# all-zero; kept only so the names remain defined.
same_gender = _feature_column()
same_status = _feature_column()
same_major = _feature_column()
same_dorm = _feature_column()
same_year = _feature_column()
sorensen = _feature_column()
cosine_sim = _feature_column()
hub_prom = _feature_column()
hub_depr = _feature_column()
lhn = _feature_column()
resource_all = _feature_column()
local_path001 = _feature_column()
local_path01 = _feature_column()
local_path1 = _feature_column()
#commute_time = np.zeros((len(edge_list),1))
#cosine_sim_time = np.zeros((len(edge_list),1))
#rooted_page001 = np.zeros((len(edge_list),1))
#rooted_page01 = np.zeros((len(edge_list),1))
#rooted_page1 = np.zeros((len(edge_list),1))

# Loop through edges to extract features
for edge in range(n_pairs):
    u, v = edge_list[edge][0], edge_list[edge][1]

    # Neighbor counts, computed once per pair and reused below.
    # len(G[node]) counts neighbors exactly like iterating G.neighbors(node).
    deg_u = len(G[u])
    deg_v = len(G[v])

    # Shortest path (number of hops); 1000 is the sentinel for "not connected"
    try:
        shortest_path[edge] = nx.shortest_path_length(G, u, v)
    except nx.NetworkXNoPath:
        shortest_path[edge] = 1000

    # Common neighbors
    shared = list(nx.common_neighbors(G, u, v))
    n_shared = len(shared)
    common_neighbors[edge] = n_shared

    # Preferential attachment
    pref_attach[edge] = deg_u * deg_v

    # Neighbor sum
    neighbor_sum[edge] = deg_u + deg_v

    # Jaccard coefficient
    jaccard_coeff[edge] = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]

    # Sorensen Index
    # NOTE(review): the usual definition carries a factor 2 in the numerator;
    # left unscaled here so values match previously generated files.
    sorensen[edge] = _ratio(n_shared, deg_u + deg_v)

    # Cosine Similarity
    cosine_sim[edge] = _ratio(n_shared, np.sqrt(deg_u * deg_v))

    # Hub Promoted
    hub_prom[edge] = _ratio(n_shared, min(deg_u, deg_v))

    # Hub Depressed
    hub_depr[edge] = _ratio(n_shared, max(deg_u, deg_v))

    # LHN
    lhn[edge] = _ratio(n_shared, deg_u * deg_v)

    # Adamic/Adar sums 1/log(degree) over the common neighbors; a degree of 1
    # makes the logarithm zero. The sentinel 1000 is kept for that case.
    try:
        adamic_adar[edge] = list(nx.adamic_adar_index(G, [(u, v)]))[0][2]
    except ZeroDivisionError:
        adamic_adar[edge] = 1000

    # Resource Allocation: sum of 1/degree over the common neighbors
    shared_degrees = dict(G.degree(shared))
    resource_all[edge] = sum(1/degree for degree in shared_degrees.values())

    # Clustering coefficient
    local_cluster_sum[edge] = cluster_coeff[u] + cluster_coeff[v]
    local_cluster_prod[edge] = cluster_coeff[u] * cluster_coeff[v]

    # Local Path: (A^2)[u, v] + eps * (A^3)[u, v] for three values of eps.
    # Node ids start at 1, matrix indices at 0.
    paths_len2 = Asquared[u-1, v-1]
    paths_len3 = Acubed[u-1, v-1]
    local_path001[edge] = paths_len2 + .001*paths_len3
    local_path01[edge] = paths_len2 + .01*paths_len3
    local_path1[edge] = paths_len2 + .1*paths_len3

    # Commute Time
    #temp = (Lstar[edge_list[edge][0]-1, edge_list[edge][0]-1] + Lstar[edge_list[edge][1]-1, edge_list[edge][1]-1] - 2*Lstar[edge_list[edge][0]-1, edge_list[edge][1]-1])
    #commute_time[edge] = len(edge_list)*temp

    # Cosine similarity
    #temp = np.sqrt(Lstar[edge_list[edge][0]-1, edge_list[edge][0]-1]*Lstar[edge_list[edge][1]-1, edge_list[edge][1]-1])
    #cosine_sim_time[edge] = Lstar[edge_list[edge][0]-1, edge_list[edge][1]-1] / temp

    # Rooted pagerank
    #rooted_page001[edge] = RPR001[edge_list[edge][0]-1,edge_list[edge][1]-1]
    #rooted_page01[edge] = RPR01[edge_list[edge][0]-1,edge_list[edge][1]-1]
    #rooted_page1[edge] = RPR1[edge_list[edge][0]-1,edge_list[edge][1]-1]



In [9]:
"""
Create data frame with all of the features
"""
# Initialize data frame to store features: one row per node pair
feature_df = pd.DataFrame.from_records(edge_list, columns = ['Node_1', 'Node_2'])

# (column name, feature array) in output order.
# jaccard_coeff takes the place of a second, redundant 'hub_prom' assignment:
# the Jaccard coefficient was computed in the feature loop but never exported.
feature_columns = [
    ('shortest_path', shortest_path),
    ('common_neighbors', common_neighbors),
    ('pref_attach', pref_attach),
    ('neighbor_sum', neighbor_sum),
    ('sorensen', sorensen),
    ('cosine_sim', cosine_sim),
    ('hub_prom', hub_prom),
    ('hub_depr', hub_depr),
    ('jaccard_coeff', jaccard_coeff),
    ('lhn', lhn),
    ('adamic_adar', adamic_adar),
    ('resource_all', resource_all),
    ('local_cluster_sum', local_cluster_sum),
    ('local_cluster_prod', local_cluster_prod),
    ('local_path001', local_path001),
    ('local_path01', local_path01),
    ('local_path1', local_path1),
    #('commute_time', commute_time),
    #('cosine_sim_time', cosine_sim_time),
    #('rooted_page001', rooted_page001),
    #('rooted_page01', rooted_page01),
    #('rooted_page1', rooted_page1),
]

# Create features for data frame.
# The arrays have shape (n, 1); ravel() flattens them to one value per row.
for column_name, values in feature_columns:
    feature_df[column_name] = np.ravel(values)

# Target (1 = edge, 0 = non-edge) and partition tag ('Tr' = train, 'T' = test)
feature_df['edge'] = np.ravel(y)
feature_df['label'] = label

In [10]:
"""
Save the feature data frame to a CSV file
"""
# Write the feature table to a CSV file in the working directory
# (the row index is written as the first column, pandas' default)
output_csv = 'Enron_2002_10' + '.csv'
feature_df.to_csv(output_csv)