In [142]:
from graph_tool.all import *
import numpy as np
import datetime

def D_example():
    D=Graph()
    
    v1 = D.add_vertex()
    v2 = D.add_vertex()
    v3 = D.add_vertex()
    v4 = D.add_vertex()
    v5 = D.add_vertex()
    v6 = D.add_vertex()
    v7 = D.add_vertex()
    v8 = D.add_vertex()
    
    e1 = D.add_edge( v1, v2 )
    e2 = D.add_edge( v1, v3 )
    e3 = D.add_edge( v1, v4 )
    e4 = D.add_edge( v1, v5 )
    e5 = D.add_edge( v6, v5 )
    e6 = D.add_edge( v7, v5 )
    e7 = D.add_edge( v5, v8 )

    prop_v = D.new_vertex_property( 'string' )
    prop_e = D.new_edge_property( 'string' )
    
    D.vertex_properties['name'] = prop_v
    D.edge_properties['c0'] = prop_e
    
    prop_v[v1] = '/John'
    prop_v[v2] = 'john@example.org'
    prop_v[v3] = 'john@doe.org'
    prop_v[v4] = '/Researcher'
    prop_v[v5] = '/Rome'
    prop_v[v6] = '/Giacomo'
    prop_v[v7] = '/Piero'
    prop_v[v8] = '"Roma"@it'
    
    prop_e[e1] = 'foaf:mbox'
    prop_e[e2] = 'foaf:mbox'
    prop_e[e3] = 'rdf:type'
    prop_e[e4] = 'ex:birthPlace'
    prop_e[e5] = 'ex:areaOfWork'
    prop_e[e6] = 'ex:areaOfWork'
    prop_e[e7] = 'foaf:name'
    
    return D

D=load_graph( '/Users/matthaeus/Projects/lodcc/dumps/rkb-explorer-newcastle/data.graph.gt.gz' )
#D=D_example()

In [171]:
# =========
# CAUTION
# please keep in mind that YOU CANNOT work with the vertice's and edge's index, 'cause it's a unique integer.
# you have to work with the vertice's and edge's label in all operations
# =========
#
# This notebook shows the implementation of the metrics in python
#

D.set_vertex_filter(None)
D.set_edge_filter(None)

prop_s = D.new_vertex_property( 'bool', val=False )
prop_o = D.new_vertex_property( 'bool', val=False )

D.vertex_properties['subject'] = prop_s
D.vertex_properties['object'] = prop_o

for v in D.vertices():
    if v.out_degree() > 0:
        prop_s[v] = True

    if v.in_degree() > 0:
        prop_o[v] = True

S = GraphView( D, vfilt=prop_s )
O = GraphView( D, vfilt=prop_o )

print( "Number of subjects: %s" % S.num_vertices() )
print( "Number of objects: %s" % O.num_vertices() )

Number of subjects: 5494
Number of objects: 15267


array([1, 0, 0, ..., 1, 0, 0], dtype=uint64)

In [185]:
# the number of triples in G in which s occurs as subject
l = np.array( [ s.out_degree() for s in D.vertices() if D.vp.subject[s] ] )
print( "(3) out-degree. max: %s, mean: %f" % ( l.max(), l.mean() ) )

# the number of triples in G, in which o occurs as object
l = np.array( [ o.in_degree() for o in D.vertices() if D.vp.object[o] ] )
print( "(7) in-degree. max: %s, mean: %f" % ( l.max(), l.mean() ) )


(3) out-degree. max: 570, mean: 10.566072
(7) in-degree. max: 4692, mean: 3.802319


In [186]:
# SUBJECT OUT-DEGREES

D.set_edge_filter(None)
D.set_edge_filter( prop_p_s )

# the number of triples of G, in which s occurs as subject and p as predicate
## e.g. l = ['/John_foaf:mbox', '/John_foaf:mbox', '/John_rdf:type', '/John_ex:birthPlace', '/Rome_foaf:name', '/Giacomo_ex:areaOfWork', '/Piero_ex:areaOfWork']
l = [ (D.vp.name[p.source()],D.ep.c0[p]) for p in D.edges() ]
_, counts = np.unique( l, return_counts=True, axis=0 )
print( "(4) partial out-degree. max: %s, mean: %f" % ( np.max( counts ), np.mean( counts ) ) )

# the number of different predicates (labels) of G with which s is related as a subject
## e.g. l = [ ['foaf:mbox', 'foaf:mbox', 'rdf:type', 'ex:birthPlace'], ['foaf:name'], ['ex:areaOfWork'], ['ex:areaOfWork'] ]
l = np.array( [ len( { D.ep.c0[p] for p in s.out_edges() } ) for s in D.vertices() if D.vp.subject[s] ] )
print( "(5) labelled out-degree. max: %s, mean: %f" % ( l.max(), l.mean() ) )

# the number of different objects of G with which s is related as a subject
## e.g. l = [ ['john@example.org', 'john@doe.org', '/Researcher', '/Rome'], ['"Roma"@it'], ['/Rome'], ['/Rome'] ]
l = np.array( [ len( { D.vp.name[p.target()] for p in s.out_edges() } ) for s in D.vertices() if D.vp.subject[s] ] )
print( "(6) direct out-degree. max: %s, mean: %f" % ( l.max(), l.mean() ) )

(4) partial out-degree. max: 190, mean: 1.868542
(5) labelled out-degree. max: 13, mean: 5.654714
(6) direct out-degree. max: 29, mean: 6.456680


In [187]:
# OBJECT IN-DEGREES

# the number of triples of G, in which s occurs as subject and p as predicate
## e.g. l = ['foaf:mbox_john@example.org', 'foaf:mbox_john@doe.org', 'rdf:type_/Researcher', 'ex:areaOfWork_/Rome', 'ex:areaOfWork_/Rome', 'ex:birthPlace_/Rome', 'foaf:name_"Roma"@it']
l = [ "%s_%s" % (D.ep.c0[p],D.vp.name[o]) for o in D.vertices() if D.vp.object[o] for p in o.in_edges() ]
unique, l = np.unique( l, return_counts=True )
print( "(8) partial in-degree. max: %s, mean: %s" % ( np.max( l ), np.mean( l ) ) )

# the number of different predicates (labels) of G with which s is related as a subject
## e.g. l = [ ['foaf:mbox'],['foaf:mbox'],['rdf:type'],['ex:areaOfWork', 'ex:areaOfWork', 'ex:birthPlace'],['foaf:name'] ]
l = np.array( [ len( { D.ep.c0[p] for p in o.in_edges() } ) 
               for o in D.vertices() 
               if D.vp.object[o] ] )
print( "(9) labelled in-degree. max: %s, mean: %s" % ( l.max(), l.mean() ) )

# the number of different objects of G with which s is related as a subject
## e.g. l = [ ['john@example.org'], ['john@doe.org'], ['/Researcher'], ['/Rome', '/Rome', '/Rome'], ['"Roma"@it'] ]
l = np.array( [ len( { D.vp.name[p.source()] for p in o.in_edges() } ) 
               for o in D.vertices() 
               if D.vp.object[o] ] )
print( "(10) direct in-degree. max: %s, mean: %s" % ( l.max(), l.mean() ) )

(8) partial in-degree. max: 4692, mean: 3.7116368286445014
(9) labelled in-degree. max: 4, mean: 1.024431780965481
(10) direct in-degree. max: 2017, mean: 2.323508220344534


In [188]:
# PREDICATE DEGREES

## the number of triples of graph G, in which p occurs as predicate
l = [ "%s" % D.ep.c0[p] for p in D.edges() ]
unique, l = np.unique( l, return_counts=True )
print( "(11) predicate degree. max: %s, mean: %s" % ( np.max(l), np.mean(l) ) )

## the number of different subjects of G with which p is related as a predicate in a triple of G
l = [ "%s_%s" % (D.vp.name[p.source()],D.ep.c0[p]) for p in D.edges() ]
unique, l = np.unique( l, return_counts=True )
print( "(12) predicate in-degree. max: %s, mean: %s" % ( np.max(l), np.mean(l) ) )

## the number of different objects of G with which p is related as a predicate in a triple of G
l = [ "%s_%s" % (D.ep.c0[p],D.vp.name[p.target()]) for p in D.edges() ]
unique, l = np.unique( l, return_counts=True )
print( "(13) predicate out-degree. max: %s, mean: %s" % ( np.max(l), np.mean(l) ) )

(11) predicate degree. max: 12191, mean: 1488.4615384615386
(12) predicate in-degree. max: 190, mean: 1.8685421830237874
(13) predicate out-degree. max: 4692, mean: 3.7116368286445014


In [189]:
# COMMON RATIOS

## the number of elements acting both as subject and objects among all subjects and objects
nom_intersection = len( { D.vp.name[s] for s in S.vertices() } & { D.vp.name[o] for o in O.vertices() } )
denom_union = len( { D.vp.name[s] for s in S.vertices() } | { D.vp.name[o] for o in O.vertices() } )
print( "(14) subject-object ratio: %s" % (float(nom_intersection) / denom_union) )

## the number of elements acting both as subject and predicates among all subjects and predicates
nom_intersection = len( { D.vp.name[s] for s in S.vertices() } & { D.ep.c0[p] for p in D.edges() } )
denom_union = len( { D.vp.name[s] for s in S.vertices() } | { D.ep.c0[p] for p in D.edges() } )
print( "(15) subject-predicate ratio: %s" % (float(nom_intersection) / denom_union) )

## the number of elements acting both as predicates and objects among all predicates and objects
nom_intersection = len( { D.ep.c0[p] for p in D.edges() } & { D.vp.name[o] for o in O.vertices() } )
denom_union = len( { D.ep.c0[p] for p in D.edges() } | { D.vp.name[o] for o in O.vertices() } )
print( "(16) predicate-object ratio: %s" % (float(nom_intersection) / denom_union) )

(14) subject-object ratio: 0.15313263719173517
(15) subject-predicate ratio: 0.0
(16) predicate-object ratio: 0.0


In [190]:
# SUBJECT-OBJECT DEGREES

## the maximum out-degree of the graph G restricted to subject–objects
l = np.array( [ v.out_degree() for v in D.vertices() if D.vp.subject[v] and D.vp.object[v] ] )
print( "(Eq.15) subject-object out-degree. max: %s" % l.max() )
print( "(Eq.16) subject-object out-degree. mean: %s" % l.mean() ) # i.e. (l.sum() / l.size)

(Eq.15) subject-object out-degree. max: 570
(Eq.16) subject-object out-degree. mean: 11.503083061298513


In [191]:
# PREDICATE LISTS

## We denote as L_G to the set of different predicate lists in G
L_G = [ np.unique( [ D.ep.c0[p] for p in s.out_edges() ] )
                 for s in D.vertices() 
                 if D.vp.subject[s] ]
L_G_max = np.max( list( map( lambda e: len(e), L_G ) ) )
# np.unique works only on sets with same dimension. thus, we need to fill up all arrays with np.nan
# to the size of L_G_max
L_G = list( map( lambda L_s: np.append( L_s, np.full( L_G_max-len(L_s), np.nan ) ), L_G ) )

## .. is defined as the ratio of repeated predicate lists from the total lists in the graph G
L_G, counts = np.unique( L_G, return_counts=True, axis=0 )
print( "(17) ratio of repeated predicate lists: %s" % (1 - ( len(L_G) / S.num_vertices() )) )

## .. is defined as the number of different subjects in G whose list of predicates is exactly Ls
print( "(18) predicate list-degree. max: %s, mean: %s" % ( np.max(counts), np.mean(counts) ) )

## .. is defined as the number of different predicate lists in LG in which the predicate appears
# TODO ZL
# print( "(19) lists per predicate degree. ")

(17) ratio of repeated predicate lists: 0.9767018565708045
(18) predicate list-degree. max: 2012, mean: 42.921875


In [193]:
# TYPED SUBJECTS and CLASSES

## number of all different classes
C_G = { D.vp.name[p.target()] 
          for p in D.edges() 
          if D.ep.c0[p] == 'ae98476863dc6ec5' }  # ae98476863dc6ec5 = http://www.w3.org/1999/02/22-rdf-syntax-ns#type
print( "Metric: number of different classes: %s" % ( len(C_G) ) )

## number of all different typed subjects
SC_G = { D.vp.name[p.source()] 
          for p in D.edges() 
          if D.ep.c0[p] == 'ae98476863dc6ec5' }  # ae98476863dc6ec5 = http://www.w3.org/1999/02/22-rdf-syntax-ns#type
print( "Metric: number of typed subjects: %s" % ( len(SC_G) ) )
print( "(20): ratio of typed subjects: %s" % ( len(SC_G) / S.num_vertices() ) )
print( "Metric: degree of predicate lists for typed subjects")


Metric: number of different classes: 18
Metric: number of typed subjects: 5494
(20): ratio of typed subjects: 1.0
Metric: degree of predicate lists for typed subjects
