### Directed graph (each unordered pair of nodes is saved once): CA-AstroPh.txt 
### Collaboration network of Arxiv Astro Physics category (there is an edge if authors coauthored at least one paper)
### Nodes: 18772 Edges: 396160

In [30]:
#setup
from graph_tool.all import *
import numpy as np

file_path = "/home/jawa/Datasets/CA-AstroPh.txt"

In [31]:
# initialization
g = Graph(directed=False)

neighbours = {}
vertex_id = {}

with open(file_path, encoding="UTF-8") as f:
    for line in f:
        if line.startswith("#"):
            continue
        l = line.replace("\n", "").split("\t")
        a = l[0]
        b = l[1]
        if a not in neighbours:
            neighbours[a] = []
        if b not in neighbours:
            neighbours[b] = []
        # see if we have been here before
        if a in neighbours[b]:
            continue
        if b in neighbours[a]:
            continue
        # add to our check lists
        neighbours[a].append(b)
        neighbours[b].append(a)
        # get the vertexes
        if a not in vertex_id:
            v_a = g.add_vertex()
            vertex_id[a] = g.vertex_index[v_a]
        else:
            v_a = g.vertex(vertex_id[a])
        if b not in vertex_id:
            v_b = g.add_vertex()
            vertex_id[b] = g.vertex_index[v_b]
        else:
            v_b = g.vertex(vertex_id[b])
        # add the edge
        g.add_edge(v_a, v_b)
    

In [32]:
# degree stats
degrre_data = {}
degrre_data["out"] = []
degrre_data["in"] = []
counter = 0
for v in g.vertices():
        counter += 1
        degrre_data["out"].append(v.out_degree())
        degrre_data["in"].append(v.in_degree())
    
max_out = max(degrre_data["out"])
min_out = min(degrre_data["out"])
average_out = np.average(degrre_data["out"])
sd_out = np.std(degrre_data["out"])
print("Out degree stats -> max value = " + str(max_out) + " / min value = " + str(min_out) + " / average = " + str(average_out) + " / SV = " + str(sd_out))

Out degree stats -> max value = 504 / min value = 1 / average = 21.1069678244 / SV = 30.5713415745


In [33]:
# distance stats
#shortest_distance = graph_tool.topology.shortest_distance(g)
#global_avg = []
#for dist in shortest_distance:
#    global_avg.append(np.average(dist))
pseudo_diameter = graph_tool.topology.pseudo_diameter(g)

# there are non connect components in the graph, so we cant just calculate the avverage shortest distance
#print("Shortest Distance Average = " + str(np.average(global_avg)) + " / Diameter = " + str(pseudo_diameter))
print("Diameter = " + str(pseudo_diameter))

Diameter = (14.0, (<Vertex object with index '18109' at 0x7f84ef5a7840>, <Vertex object with index '17810' at 0x7f84c8865de0>))


In [37]:
# clustering stats
global_clustering = graph_tool.clustering.global_clustering(g)

print ("Clustering Average = " + str(np.average(global_clustering)) + " / " + "Clustering SV = " + str(np.std(global_clustering)))

Clustering Average = 0.162088976764 / Clustering SV = 0.15575723869


In [40]:
# connected componet stats
comp, hist = label_components(g)

labels = list(set(comp.a))
largest = label_largest_component(g)
l = GraphView(g, vfilt=largest)

print("Connected components = " + str(len(labels)) + " / Edges in Largest component = " + str(l.num_edges()) + " / Vertices in Largest component = " + str(l.num_vertices()))

Connected components = 290 / Edges in Largest component = 197031 / Vertices in Largest component = 17903


In [41]:
# vertex closeness stats
vertex_closeness = closeness(g)
c_list = []
for vc in vertex_closeness.a:
    if not math.isnan(vc):
        c_list.append(vc)
        
print("Closeness stats -> max value = " + str(max(c_list)) + " / min value = " + str(min(c_list)) + " / average = " + str(np.average(c_list)) + " / SV = " + str(np.std(c_list)))

Closeness stats -> max value = 1.0 / min value = 0.101486411411 / average = 0.275854933257 / SV = 0.152575595454


In [None]:
# betweenness stats
vertex_betweenness, edge_betweenness = betweenness(g)
print("Betweenness stats -> max value = " + str(max(vertex_betweenness)) + " / min value = " + str(min(vertex_betweenness)) + " / average = " + str(np.average(vertex_betweenness)) + " / SV = " + str(vertex_betweenness))