In [1]:
"""
Michael E. Ramsey
CSCI 5352
Date Created: 11/01/18
Last Edited: 11/20/18

This is a python script to analyze the Network of Facebook data presented in CSCI 5352.
In this file, we compute features for each edge pair and a randomly sampled set of the 
non-edge pairs. 

The output of this script is a csv, containing several "features" of each edge and non-edge pair.
This .csv file will be used to create machine learning models for edge prediction.
"""

# Get necessary libraries
import sys
from os import listdir
from os.path import isfile, join
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import scipy
from scipy import linalg
from scipy.sparse import csr_matrix

  from ._conv import register_converters as _register_converters


In [3]:
"""
Extract all filenames for facebook100 dataset
"""

# Directory holding the facebook100 text files
data_dir = "../Data/facebook100txt/"

# Suffixes of files that are not edge lists: anything ending in 'r.txt'
# plus the two readme files shipped with the dataset.
excluded_suffixes = ('r.txt', 'readme_aaron.txt', 'facebook100_readme_021011.txt')

# Keep regular '.txt' files that do not match any excluded suffix
files = [
    name
    for name in listdir(data_dir)
    if isfile(join(data_dir, name))
    and name.endswith('.txt')
    and not name.endswith(excluded_suffixes)
]

# Displayed as the cell output; the recorded run showed 100 files
len(files)

100

In [4]:
"""
Extract the nodes and edges.
"""

# Choose the dataset to import
filename = 'American75'

# Read one edge per line. str.split() with no arguments splits on any run of
# whitespace (tabs, spaces, trailing newline), so no manual stripping is needed
# and a final line without a trailing newline keeps its last digit.
edge_list = []
with open("../Data/facebook100txt/" + filename + '.txt') as edge_file:
    for line in edge_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of recording an empty tuple as an edge
            continue
        edge_list.append(tuple(map(int, fields)))

# Create a flattened list and find unique elements to create node list
flat_list = [node for pair in edge_list for node in pair]
node_list = set(flat_list)

In [14]:
"""
Perform train/valid/test split
1% of the edges are held out and that hold-out is split in half,
so the resulting split is 99/0.5/0.5 (train/valid/test).
"""
# Fraction of all edges removed from the training graph. With .01 the split is
# 99/0.5/0.5; a 95/2.5/2.5 split would need .05 here.
holdout_fraction = .01
split_seed = 345

# NOTE: edge_list is rebound to the training edges only, so re-running this
# cell without reloading the edges first shrinks the training set again.
edge_list, edge_test = train_test_split(
    edge_list, test_size=holdout_fraction, random_state=split_seed
)

# Split the held-out edges evenly into validation and test sets
edge_valid, edge_test = train_test_split(
    edge_test, test_size=.5, random_state=split_seed
)

2044

In [15]:
"""
Extract Node Attributes
"""
# The attribute table lives next to the edge list as '<dataset>_attr.txt'
attr_path = join("../Data/facebook100txt/", filename + '_attr.txt')
node_attr = pd.read_table(attr_path)

In [16]:
"""
Generate the network
"""
G = nx.Graph()
G.add_nodes_from(node_list)
G.add_edges_from(edge_list)

# Compute clustering coefficient for each node
cluster_coeff = nx.clustering(G)

# Later cells look up matrix entries with `node_id - 1`, so the rows/columns
# must follow node ids 1..n in ascending order. The graph's own node order
# comes from iterating a set, which is not guaranteed to be sorted, so the
# order is pinned explicitly and checked.
nodelist = sorted(G.nodes())
if nodelist != list(range(1, len(nodelist) + 1)):
    raise ValueError(
        "Node ids must be the contiguous integers 1..n: matrix entries are "
        "looked up with node_id - 1, which would be misaligned otherwise."
    )

# Construct adjacency matrices and multiples
A = nx.adjacency_matrix(G, nodelist=nodelist)
Asquared = A.dot(A)
Acubed = Asquared.dot(A)

# Construct the diagonal matrix of row sums directly in sparse form
# (no dense n x n intermediates)
degrees = np.asarray(A.sum(axis=1)).ravel()
diag_idx = np.arange(A.shape[0])
D = csr_matrix((degrees, (diag_idx, diag_idx)), shape=A.shape)
#Dinv = np.diag(1/np.diag(D.todense()))

# Compute pseudo-inverse of the graph Laplacian L = D - A
L = D - A
Lstar = np.linalg.pinv(L.toarray())

# Compute rooted pagerank (disabled)
#RPR001 = (1-.001)*np.linalg.inv(np.identity(A.shape[0]) - .001*(Dinv.dot(A.todense())))
#RPR01 = (1-.01)*np.linalg.inv(np.identity(A.shape[0]) - .01*(Dinv.dot(A.todense())))
#RPR1 = (1-.1)*np.linalg.inv(np.identity(A.shape[0]) - .1*(Dinv.dot(A.todense())))

In [None]:
"""
Feature Construction
"""

# Initialize numpy arrays to store features: one row per edge in edge_list
n_edges = len(edge_list)
feature_shape = (n_edges, 1)
shortest_path = np.zeros(feature_shape)
common_neighbors = np.zeros(feature_shape)
pref_attach = np.zeros(feature_shape)
neighbor_sum = np.zeros(feature_shape)
local_cluster_sum = np.zeros(feature_shape)
local_cluster_prod = np.zeros(feature_shape)
jaccard_coeff = np.zeros(feature_shape)
adamic_adar = np.zeros(feature_shape)
same_gender = np.zeros(feature_shape)
same_status = np.zeros(feature_shape)
same_major = np.zeros(feature_shape)
same_dorm = np.zeros(feature_shape)
same_year = np.zeros(feature_shape)
sorensen = np.zeros(feature_shape)
cosine_sim = np.zeros(feature_shape)
hub_prom = np.zeros(feature_shape)
hub_depr = np.zeros(feature_shape)
lhn = np.zeros(feature_shape)
resource_all = np.zeros(feature_shape)
local_path001 = np.zeros(feature_shape)
local_path01 = np.zeros(feature_shape)
local_path1 = np.zeros(feature_shape)
commute_time = np.zeros(feature_shape)
cosine_sim_time = np.zeros(feature_shape)
#rooted_page001 = np.zeros(feature_shape)
#rooted_page01 = np.zeros(feature_shape)
#rooted_page1 = np.zeros(feature_shape)

# Attribute column -> indicator array that is 1 when both endpoints share
# the same value in that column and 0 otherwise
same_attr_features = {
    'gender': same_gender,
    'status': same_status,
    'major': same_major,
    'dorm': same_dorm,
    'year': same_year,
}

# Loop through edges to extract features
for idx, (u, v) in enumerate(edge_list):

    # Matrices and the attribute table are addressed with node id - 1
    row_u, row_v = u - 1, v - 1

    # Number of neighbors of each endpoint, computed once and reused below
    deg_u = len(G[u])
    deg_v = len(G[v])

    # Shared neighbors, materialized once for both the count and resource allocation
    shared = list(nx.common_neighbors(G, u, v))
    n_shared = len(shared)

    # Shortest path
    # NOTE(review): every pair here is an edge of G, so this is constant for
    # distinct endpoints -- confirm that is intended.
    shortest_path[idx] = len(nx.shortest_path(G, u, v)) - 1

    # Common neighbors
    common_neighbors[idx] = n_shared

    # Preferential attachment
    pref_attach[idx] = deg_u * deg_v

    # Neighbor sum
    neighbor_sum[idx] = deg_u + deg_v

    # Jaccard coefficient
    jaccard_coeff[idx] = list(nx.jaccard_coefficient(G, [(u, v)]))[0][2]

    # Sorensen Index
    # NOTE: computed without the factor 2 of the textbook definition; kept
    # unscaled so existing feature values do not change.
    sorensen[idx] = n_shared / (deg_u + deg_v)

    # Cosine Similarity
    cosine_sim[idx] = n_shared / np.sqrt(deg_u * deg_v)

    # Hub Promoted
    hub_prom[idx] = n_shared / min(deg_u, deg_v)

    # Hub Depressed
    hub_depr[idx] = n_shared / max(deg_u, deg_v)

    # LHN
    lhn[idx] = n_shared / (deg_u * deg_v)

    # Adamic/Adar
    adamic_adar[idx] = list(nx.adamic_adar_index(G, [(u, v)]))[0][2]

    # Resource Allocation: sum of inverse degrees over the shared neighbors
    resource_all[idx] = sum(1 / degree for _, degree in G.degree(shared))

    # Clustering coefficient
    local_cluster_sum[idx] = cluster_coeff[u] + cluster_coeff[v]
    local_cluster_prod[idx] = cluster_coeff[u] * cluster_coeff[v]

    # Local Path: A^2 + eps * A^3 for three values of eps
    # (each sparse entry is read once instead of three times)
    paths_len2 = Asquared[row_u, row_v]
    paths_len3 = Acubed[row_u, row_v]
    local_path001[idx] = paths_len2 + .001 * paths_len3
    local_path01[idx] = paths_len2 + .01 * paths_len3
    local_path1[idx] = paths_len2 + .1 * paths_len3

    # Entries of the Laplacian pseudo-inverse used by the next two features
    l_uu = Lstar[row_u, row_u]
    l_vv = Lstar[row_v, row_v]
    l_uv = Lstar[row_u, row_v]

    # Commute Time (scaled by the number of edges in edge_list)
    commute_time[idx] = n_edges * (l_uu + l_vv - 2 * l_uv)

    # Cosine similarity based on the pseudo-inverse
    cosine_sim_time[idx] = l_uv / np.sqrt(l_uu * l_vv)

    # Rooted pagerank
    #rooted_page001[idx] = RPR001[row_u, row_v]
    #rooted_page01[idx] = RPR01[row_u, row_v]
    #rooted_page1[idx] = RPR1[row_u, row_v]

    ####### Meta Data Attributes

    # All "same_*" indicators use a plain equality test. same_gender was
    # previously (a + b) % 2, which is 0 whenever the two codes match, i.e.
    # the inverse of what the feature name says.
    # NOTE(review): if a code such as 0 marks a missing value, two missing
    # entries count as "same" -- confirm against the dataset readme.
    for column, indicator in same_attr_features.items():
        if node_attr.loc[row_u, column] == node_attr.loc[row_v, column]:
            indicator[idx] = 1
        else:
            indicator[idx] = 0

    ####### Print statement for tracking
    if idx % 10000 == 0:
        print(idx, end = " ")

0 10000 20000 30000 40000 

In [19]:
"""
Create data frame with all of the features
"""

# Initialize data frame to store features
feature_df = pd.DataFrame.from_records(edge_list, columns = ['Node_1', 'Node_2'])

# Column name -> feature array, in output column order.
# 'hub_prom' is listed once (it used to be assigned twice), and
# 'jaccard_coeff', which was computed but never exported, is appended last so
# that the positions of the existing columns do not move.
feature_columns = [
    ('shortest_path', shortest_path),
    ('common_neighbors', common_neighbors),
    ('pref_attach', pref_attach),
    ('neighbor_sum', neighbor_sum),
    ('sorensen', sorensen),
    ('cosine_sim', cosine_sim),
    ('hub_prom', hub_prom),
    ('hub_depr', hub_depr),
    ('lhn', lhn),
    ('adamic_adar', adamic_adar),
    ('resource_all', resource_all),
    ('local_cluster_sum', local_cluster_sum),
    ('local_cluster_prod', local_cluster_prod),
    ('same_gender', same_gender),
    ('same_status', same_status),
    ('same_major', same_major),
    ('same_dorm', same_dorm),
    ('same_year', same_year),
    ('local_path001', local_path001),
    ('local_path01', local_path01),
    ('local_path1', local_path1),
    ('commute_time', commute_time),
    ('cosine_sim_time', cosine_sim_time),
    ('jaccard_coeff', jaccard_coeff),
    #('rooted_page001', rooted_page001),
    #('rooted_page01', rooted_page01),
    #('rooted_page1', rooted_page1),
]

# Create features for data frame
for column_name, values in feature_columns:
    feature_df[column_name] = values

In [24]:
"""
Save features to a data frame
"""
# Feature data frame goes to '<dataset>.csv'
feature_df.to_csv(filename + '.csv')

# Held-out edges go to text files, one "node node" pair per line
for suffix, held_out_edges in (('_valid.txt', edge_valid), ('_test.txt', edge_test)):
    with open(filename + suffix, 'w') as f:
        f.writelines(
            "{} {}\n".format(item[0], item[1]) for item in held_out_edges
        )