In [26]:
import rdflib
from collections import defaultdict
from pprint import pprint

In [27]:
g = rdflib.Graph().parse('./data/subset_NamedRockUnit.nt', format='nt')

In [32]:
graph_temp = rdflib.Graph().parse('./data/Lexicon_NamedRockUnit.nt', format='nt')

In [43]:
[[str(x) for x in triple] for triple in graph_temp.triples((None, None, None))][:10]

[['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/SO',
  'http://data.bgs.ac.uk/ref/Lexicon/hasLithologyComponent',
  'http://data.bgs.ac.uk/id/Lexicon/LithologyComponent/SO_SANDU'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/BYF',
  'http://data.bgs.ac.uk/ref/Lexicon/hasUnitClass',
  'http://data.bgs.ac.uk/id/Lexicon/Class/LS'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/USTL',
  'http://data.bgs.ac.uk/ref/Lexicon/hasPredominantAge',
  'http://data.bgs.ac.uk/id/Geochronology/Division/CE'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/OKSG',
  'http://data.bgs.ac.uk/ref/Lexicon/hasThicknessDescription',
  'Probably to 4m'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/UDCS',
  'http://www.w3.org/2004/02/skos/core#altLabel',
  'UDCS'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/FDHN3',
  'http://data.bgs.ac.uk/ref/Lexicon/hasYoungestAge',
  'http://data.bgs.ac.uk/id/Geochronology/Division/S'],
 ['http://data.bgs.ac.uk/id/Lexicon/NamedRockUnit/SASS',
  'htt

In [28]:
for triple in g:
    print(*triple)

_:A1 _:P2 _:C
_:B1 _:P2 _:D
_:A2 _:P2 _:C
_:B2 _:P2 _:D


In [29]:
def get_instances(g: rdflib.Graph(), predicate_name: str):
    return [t[0] for t in g if str(t[1]).endswith(predicate_name)]
all_instances = get_instances(g, 'P2')
pprint(all_instances)

[rdflib.term.URIRef('_:A1'),
 rdflib.term.URIRef('_:B1'),
 rdflib.term.URIRef('_:A2'),
 rdflib.term.URIRef('_:B2')]


In [30]:
depth = 2
vertices = []
edges = []
search_front = []
labels = defaultdict(dict)
subgraph_vertices = defaultdict(lambda: defaultdict(list))
subgraph_edges = defaultdict(lambda: defaultdict(list))

# 1. Initialization
for instance in all_instances:
    vertices.append(instance)
    labels[instance][depth] = 'root'
# 2. Subgraph Extraction
for instance in all_instances:
    search_front.append(instance)
    for d in range(depth - 1, -1, -1):
        new_search_front = []
        for r in search_front:
            # taking all triples that have as subject r
            triples = [(subj, pred, obj) for (subj, pred, obj) in g if subj == r]
            for (subj, pred, obj) in triples:
                new_search_front.append(obj)
                if obj not in vertices:
                    vertices.append(obj)
                if obj not in subgraph_vertices[instance][d]:
                    subgraph_vertices[instance][d].append(obj)
                labels[obj][d] = obj
                if (subj, pred, obj) not in edges:
                    edges.append( (subj, pred, obj) )
                if (subj, pred, obj) not in subgraph_edges[instance][d]:
                    subgraph_edges[instance][d].append( (subj, pred, obj) )
                labels[ (subj, pred, obj) ][d] = pred
        search_front = new_search_front

In [31]:
print('vertices:')
pprint(vertices)
print('edges:')
pprint(edges)
print('labels:')
pprint(labels)
print('subgraph_vertices:')
pprint(subgraph_vertices)
print('subgraph_edges:')
pprint(subgraph_edges)

vertices:
[rdflib.term.URIRef('_:A1'),
 rdflib.term.URIRef('_:B1'),
 rdflib.term.URIRef('_:A2'),
 rdflib.term.URIRef('_:B2'),
 rdflib.term.URIRef('_:C'),
 rdflib.term.URIRef('_:D')]
edges:
[(rdflib.term.URIRef('_:A1'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:C')),
 (rdflib.term.URIRef('_:B1'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:D')),
 (rdflib.term.URIRef('_:A2'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:C')),
 (rdflib.term.URIRef('_:B2'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:D'))]
labels:
defaultdict(<class 'dict'>,
            {rdflib.term.URIRef('_:A1'): {2: 'root'},
             rdflib.term.URIRef('_:A2'): {2: 'root'},
             rdflib.term.URIRef('_:B1'): {2: 'root'},
             rdflib.term.URIRef('_:B2'): {2: 'root'},
             rdflib.term.URIRef('_:C'): {1: rdflib.term.URIRef('_:C')},
             rdflib.term.URIRef('_:D'): {1: rdflib.term.URIRef('_:D')},
             (rdflib.term.URIRef('_:A1'), rdflib.term.U

In [48]:
V = []
E = []
depth = 1
labels = dict(dict())
search_front = []
V_sub = dict()
E_sub = dict()


# 1. Initialization
for i in instances:
    V.append(i)
    labels[i][depth] = 'e'
# 2. Subgraph Extraction
for i in instances:
    search_front.append(i)
    for i in range(depth - 1, -1, -1):
        new_search_front = []
        for r in search_front:
            triples = [(s, p, o) for (s, p, o) in g if s == r]
            for (s, p, o) in triples:
                new_search_front.append(o)
                if o not in V:
                    V.append(o)
                labels[o].append(depth)
                if o not in V_sub.keys():
                    V_sub[o] = depth
                if (s, p, o) not in E:
                    E.append((s, p, o))
                labels[(s, p, o)].append(depth)
                if (s, p, o) not in E_sub:
                    E_sub[(s, p, o)] = depth
        search_front = new_search_front


print('V:')
pprint(V)
print('E:')
pprint(E)
print('V_sub:')
pprint(V_sub)
print('E_sub:')
pprint(E_sub)
print('labels:')
pprint(labels)

V:
[rdflib.term.URIRef('_:B1'),
 rdflib.term.URIRef('_:A2'),
 rdflib.term.URIRef('_:A1'),
 rdflib.term.URIRef('_:B2'),
 rdflib.term.URIRef('_:D'),
 rdflib.term.URIRef('_:C')]
E:
[(rdflib.term.URIRef('_:B1'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:D')),
 (rdflib.term.URIRef('_:A2'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:C')),
 (rdflib.term.URIRef('_:A1'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:C')),
 (rdflib.term.URIRef('_:B2'),
  rdflib.term.URIRef('_:P2'),
  rdflib.term.URIRef('_:D'))]
V_sub:
{rdflib.term.URIRef('_:C'): 1, rdflib.term.URIRef('_:D'): 1}
E_sub:
{(rdflib.term.URIRef('_:A1'), rdflib.term.URIRef('_:P2'), rdflib.term.URIRef('_:C')): 1,
 (rdflib.term.URIRef('_:A2'), rdflib.term.URIRef('_:P2'), rdflib.term.URIRef('_:C')): 1,
 (rdflib.term.URIRef('_:B1'), rdflib.term.URIRef('_:P2'), rdflib.term.URIRef('_:D')): 1,
 (rdflib.term.URIRef('_:B2'), rdflib.term.URIRef('_:P2'), rdflib.term.URIRef('_:D')): 1}
labels:
defaultdict(<class 'lis