### This dataset consists of 'circles' (or 'friends lists') from Facebook. Facebook data was collected from survey participants using this Facebook app. The dataset includes node features (profiles), circles, and ego networks.

http://snap.stanford.edu/data/egonets-Facebook.html

In [1]:
#setup
from graph_tool.all import * 
import numpy as np
import math
import time

file_path = "/home/jawa/Datasets/Email-Enron.txt"

In [2]:
# initialization
g = Graph(directed=True)

neighbours = {}
vertex_id = {}

#this time, we watn to add a edge in each direction for running pagerank and other centrality metrics

with open(file_path, encoding="UTF-8") as f:
    for line in f:
        if line.startswith("#"):
            continue
        l = line.replace("\n", "").split("\t")
        a = l[0]
        b = l[1]
        if a not in neighbours:
            neighbours[a] = []
        if b not in neighbours:
            neighbours[b] = []
        # see if we have been here before
        if a in neighbours[b]:
            continue
        if b in neighbours[a]:
            continue
        # add to our check lists
        neighbours[a].append(b)
        neighbours[b].append(a)
        # get the vertexes
        if a not in vertex_id:
            v_a = g.add_vertex()
            vertex_id[g.vertex_index[v_a]] = a
        else:
            v_a = g.vertex(vertex_id[a])
        if b not in vertex_id:
            v_b = g.add_vertex()
            vertex_id[g.vertex_index[v_b]] = b
        else:
            v_b = g.vertex(vertex_id[b])
        # add the edges
        g.add_edge(v_a, v_b)
        g.add_edge(v_b, v_a)
    

In [3]:
# degree stats
degrre_data = {}
degrre_data["out"] = []
degrre_data["in"] = []
counter = 0
for v in g.vertices():
        counter += 1
        degrre_data["out"].append(v.out_degree())
        degrre_data["in"].append(v.in_degree())
    
max_out = max(degrre_data["out"])
min_out = min(degrre_data["out"])
average_out = np.average(degrre_data["out"])
sd_out = np.std(degrre_data["out"])
print("Out degree stats -> max value = " + str(max_out) + " / min value = " + str(min_out) + " / average = " + str(average_out) + " / SV = " + str(sd_out))

Out degree stats -> max value = 1 / min value = 1 / average = 1.0 / SV = 0.0


In [4]:
# distance stats
#shortest_distance = graph_tool.topology.shortest_distance(g)
#global_avg = []
#for dist in shortest_distance:
#    global_avg.append(np.average(dist))
pseudo_diameter = graph_tool.topology.pseudo_diameter(g)

# there are non connect components in the graph, so we cant just calculate the avverage shortest distance
#print("Shortest Distance Average = " + str(np.average(global_avg)) + " / Diameter = " + str(pseudo_diameter))
print("Diameter = " + str(pseudo_diameter))

Diameter = (1.0, (<Vertex object with index '0' at 0x7fcd302c14f8>, <Vertex object with index '1' at 0x7fcd302c1840>))


In [5]:
# clustering stats
global_clustering = graph_tool.clustering.global_clustering(g)

print ("Clustering Average = " + str(np.average(global_clustering)) + " / " + "Clustering SV = " + str(np.std(global_clustering)))

Clustering Average = 0.0 / Clustering SV = 0.0


In [6]:
# connected componet stats
comp, hist = label_components(g)

labels = list(set(comp.a))
largest = label_largest_component(g)
l = GraphView(g, vfilt=largest)

print("Connected components = " + str(len(labels)) + " / Edges in Largest component = " + str(l.num_edges()) + " / Vertices in Largest component = " + str(l.num_vertices()))

Connected components = 183831 / Edges in Largest component = 2 / Vertices in Largest component = 2


In [7]:
start = time.time()
# vertex closeness stats
g_closeness = closeness(g)
c_list = []
for vc in g_closeness.a:
    if not math.isnan(vc):
        c_list.append(vc)

end = time.time()
print("Time Taken: " + str(end - start) +"s")
print("Closeness stats -> max value = " + str(max(c_list)) + " / min value = " + str(min(c_list)) + " / average = " + str(np.average(c_list)) + " / SV = " + str(np.std(c_list)))

closeness_dic   = {}
zero_closeness = 0
for i in range(0, len(g_closeness.a)):
    if (g.vertex(i).out_degree() != 0):
        closeness_dic[g.vertex_index[i]] = g_closeness.a[i]
    else:
        zero_closeness += 1

Time Taken: 265.1480493545532s
Closeness stats -> max value = 1.0 / min value = 1.0 / average = 1.0 / SV = 0.0


In [8]:
# betweenness stats
start=time.time()
g_betweenness, edge_betweenness = betweenness(g)
v_b = list(g_betweenness)
end = time.time()
print("Time Taken: " + str(end - start) +"s")
print("Betweenness stats -> max value = " + str(max(v_b)) + " / min value = " + str(min(v_b)) + " / average = " + str(np.average(v_b)) + " / SV = " + str(np.std(v_b)))

betweenness_dic   = {}
zero_betweenness = 0
for i in range(0, len(g_betweenness.a)):
    if (g.vertex(i).out_degree() != 0):
        betweenness_dic[g.vertex_index[i]] = g_betweenness.a[i]
    else:
        zero_betweenness += 1

Time Taken: 618.121927022934s
Betweenness stats -> max value = 0.0 / min value = 0.0 / average = 0.0 / SV = 0.0


In [9]:
start=time.time()
# degree centrality metrics
out_dic = {}
in_dic  = {}
zero_out = 0
zero_in  = 0
v_num = len(vertex_id.keys())
for v in g.vertices():
    if (v.out_degree() == 0):
        zero_out += 1
    else:
        out_dic[g.vertex_index[v]] = 1.0* v.out_degree() / (v_num - 1)
    if (v.in_degree() == 0):
        zero_in += 1
    else:
        in_dic[g.vertex_index[v]]  = v.in_degree() / (v_num - 1)
end = time.time()
print("Time Taken: " + str(end - start) +"s")

Time Taken: 6.157048225402832s


In [10]:
start=time.time()
# katz
g_katz = graph_tool.all.katz(g)
katz_dic = {}
for i in range(0, len(g_katz.a)):
    katz_dic[g.vertex_index[i]] = g_katz.a[i]
end = time.time()
print("Time Taken: " + str(end - start) +"s")

Time Taken: 26.518951416015625s


In [11]:
start=time.time()
# pagerank
g_pagerank = pagerank(g)
pagerank_dic = {}
for i in range(0, len(g_pagerank.a)):
    pagerank_dic[g.vertex_index[i]] = g_pagerank.a[i]
end = time.time()
print("Time Taken: " + str(end - start) +"s")

Time Taken: 26.58941674232483s


In [12]:
# print centrality metrics
def topTen(metric_name, dic, string=""):
    most = {}
    less = {}
    for i in range(0, 10):
        # +
        n = max(dic, key=dic.get)
        most[n] = dic[n]
        del dic[n]
        # -
        n = min(dic, key=dic.get)
        less[n] = dic[n]
        del dic[n]    
    print(metric_name)
    print(string)
    print("Most central:")
    print(sorted(most.items(), key=lambda x: x[1]))
    print("Least central:")
    print(sorted(less.items(), key=lambda x: x[1]))
    print("="*100)
    
topTen("Degree centrality (IN)", in_dic, "Total zeros: "+str(zero_in))
topTen("Degree centrality (OUT)", out_dic, "Total zeros: "+str(zero_out))
topTen("Betweenness", betweenness_dic, "Total zeros: "+str(zero_betweenness))
topTen("Closeness", closeness_dic, "Total zeros: "+str(zero_closeness))
topTen("Katz", katz_dic)
topTen("PageRank", pagerank_dic)

Degree centrality (IN)
Total zeros: 0
Most central:
[(0, 2.7198968615110115e-06), (16, 2.7198968615110115e-06), (2, 2.7198968615110115e-06), (4, 2.7198968615110115e-06), (6, 2.7198968615110115e-06), (8, 2.7198968615110115e-06), (10, 2.7198968615110115e-06), (12, 2.7198968615110115e-06), (18, 2.7198968615110115e-06), (14, 2.7198968615110115e-06)]
Least central:
[(19, 2.7198968615110115e-06), (1, 2.7198968615110115e-06), (3, 2.7198968615110115e-06), (17, 2.7198968615110115e-06), (5, 2.7198968615110115e-06), (7, 2.7198968615110115e-06), (9, 2.7198968615110115e-06), (11, 2.7198968615110115e-06), (13, 2.7198968615110115e-06), (15, 2.7198968615110115e-06)]
Degree centrality (OUT)
Total zeros: 0
Most central:
[(0, 2.7198968615110115e-06), (16, 2.7198968615110115e-06), (2, 2.7198968615110115e-06), (4, 2.7198968615110115e-06), (6, 2.7198968615110115e-06), (8, 2.7198968615110115e-06), (10, 2.7198968615110115e-06), (12, 2.7198968615110115e-06), (18, 2.7198968615110115e-06), (14, 2.719896861511011