Permalink
Browse files

init NTriples parsing

  • Loading branch information...
1 parent 52a4b77 commit 413ee4299d1984c46733040e9606b6583f7b0d94 @mhausenblas committed Mar 30, 2012
Showing with 269 additions and 12 deletions.
  1. +1 −1 gk-launch.sh
  2. +1 −1 gk-shutdown.sh
  3. +24 −10 graphkeeper.py
  4. +237 −0 ntriples.py
  5. +6 −0 test/dataset0.nt
View
@@ -1,7 +1,7 @@
#!/bin/sh
#
# Public Domain
-# see
+# see https://github.com/mhausenblas/graphkeeper
PATH_TO_ZK_BIN=/Users/michael/Documents/dev/zookeeper-3.3.5/bin
View
@@ -1,7 +1,7 @@
#!/bin/sh
#
# Public Domain
-# see
+# see https://github.com/mhausenblas/graphkeeper
PATH_TO_ZK_BIN=/Users/michael/Documents/dev/zookeeper-3.3.5/bin
View
@@ -1,4 +1,15 @@
-import zookeeper, threading, sys
+"""
+GraphKeeper
+Michael Hausenblas, DERI
+Public Domain
+Documentation:
+ https://github.com/mhausenblas/graphkeeper
+
+Command line usage:
+ python graphkeeper.py <URI> - parses data from URI as N-Triples and loads it into the store
+"""
+import zookeeper, threading, ntriples, sys
+from pprint import pprint
ZOO_OPEN_ACL_UNSAFE = {"perms":0x1f, "scheme":"world", "id" :"anyone"};
@@ -49,12 +60,15 @@ def put_ng(self, ng, val):
zookeeper.create(self.handle, ng, val, [ZOO_OPEN_ACL_UNSAFE], zookeeper.SEQUENCE)
if __name__ == '__main__':
- gk = GraphKeeper()
-
- print 'Set up of GraphKeeper ...'
- gk.set_up()
-
- gk.put_ng('/ng', '<http://data.example.org/person/tim> dc:publisher "Tim" .')
-
- (data, stat) = gk.get_ng('/ng-0')
- print data
+ if len(sys.argv) == 2:
+ gk = GraphKeeper()
+ sink = ntriples.parseURI(sys.argv[1])
+
+ # print 'Set up of GraphKeeper ...'
+ # gk.set_up()
+ #
+ # gk.put_ng('/ng', '<http://data.example.org/person/tim> dc:publisher "Tim" .')
+ #
+ # (data, stat) = gk.get_ng('/ng-0')
+ # print data
+ else: print __doc__
View
@@ -0,0 +1,237 @@
+"""
+N-Triples Parser
+Copyright 2004, Sean B. Palmer, inamidst.com
+Licensed under GPL 2, W3C, BSD, MIT, or EFL 2
+Documentation:
+ http://inamidst.com/proj/rdf/ntriples-doc
+
+Command line usage:
+ ./ntriples.py <URI> - parses URI as N-Triples
+ ./ntriples.py --help - prints out this help message
+# @@ fully empty document?
+"""
+
+import re
+
+# Pattern fragments (plain strings, combined and compiled below).
+# uriref: an angle-bracketed reference containing at least one ':'; the
+# single group captures the text between the brackets.
+uriref = r'<([^:]+:[^\s"<>]+)>'
+# literal: a double-quoted string that may contain backslash escapes; the
+# group captures the still-escaped contents.
+literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
+# litinfo: optional literal suffix, either '@' plus a language tag or '^^'
+# plus a uriref (the datatype).
+litinfo = r'(?:@([a-z]+(?:-[a-z0-9]+)*)|\^\^' + uriref + r')?'
+
+# r_line: one line's content followed by a CRLF, CR, or LF terminator.
+r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
+# r_wspace: optional run of spaces/tabs; r_wspaces: mandatory run.
+r_wspace = re.compile(r'[ \t]*')
+r_wspaces = re.compile(r'[ \t]+')
+# r_tail: the '.' that closes a statement, with optional spaces/tabs around it.
+r_tail = re.compile(r'[ \t]*\.[ \t]*')
+r_uriref = re.compile(uriref)
+# r_nodeid: '_:' followed by a letter and then letters or digits; the group
+# captures the name after '_:'.
+r_nodeid = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)')
+# r_literal groups: (contents, language tag, datatype URI).
+r_literal = re.compile(literal + litinfo)
+
+# Size passed to file.read() each time the parser refills its buffer.
+bufsiz = 2048
+# When False, unquote() and uriquote() are redefined further down as
+# cheaper, non-validating versions.
+validate = False
+
+class Node(unicode): pass
+
+class URI(Node): pass
+class bNode(Node): pass
+class Literal(Node):
+ def __new__(cls, lit, lang=None, dtype=None):
+ n = str(lang) + ' ' + str(dtype) + ' ' + lit
+ return unicode.__new__(cls, n)
+
+class Sink(object):
+ def __init__(self):
+ self.length = 0
+
+ def triple(self, s, p, o):
+ self.length += 1
+ print (s, p, o)
+
+class ParseError(Exception): pass
+
+# Single-character escapes: the character after the backslash mapped to the
+# character it stands for.
+quot = {'t': '\t', 'n': '\n', 'r': '\r', '"': '"', '\\': '\\'}
+# Run of printable ASCII (0x20-0x7E) excluding '"' (0x22) and '\' (0x5C),
+# i.e. text that needs no unescaping.
+r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
+# One of the single-character escapes listed in quot.
+r_quot = re.compile(r'\\(t|n|r|"|\\)')
+# \uXXXX or \UXXXXXXXX with upper-case hex digits; exactly one of the two
+# groups is set on a match.
+r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')
+
+def unquote(s):
+ """Unquote an N-Triples string."""
+ result = []
+ while s:
+ m = r_safe.match(s)
+ if m:
+ s = s[m.end():]
+ result.append(m.group(1))
+ continue
+
+ m = r_quot.match(s)
+ if m:
+ s = s[2:]
+ result.append(quot[m.group(1)])
+ continue
+
+ m = r_uniquot.match(s)
+ if m:
+ s = s[m.end():]
+ u, U = m.groups()
+ codepoint = int(u or U, 16)
+ if codepoint > 0x10FFFF:
+ raise ParseError("Disallowed codepoint: %08X" % codepoint)
+ result.append(unichr(codepoint))
+ elif s.startswith('\\'):
+ raise ParseError("Illegal escape at: %s..." % s[:10])
+ else: raise ParseError("Illegal literal character: %r" % s[0])
+ return unicode(''.join(result))
+
+if not validate:
+ def unquote(s):
+ return s.decode('unicode-escape')
+
+r_hibyte = re.compile(r'([\x80-\xFF])')
+
+def uriquote(uri):
+ return r_hibyte.sub(lambda m: '%%%02X' % ord(m.group(1)), uri)
+if not validate:
+ def uriquote(uri):
+ return uri
+
+class NTriplesParser(object):
+ """An N-Triples Parser.
+ Usage:
+ p = NTriplesParser(sink=MySink())
+ sink = p.parse(f) # file; use parsestring for a string
+ """
+
+ def __init__(self, sink=None):
+ if sink is not None:
+ self.sink = sink
+ else: self.sink = Sink()
+
+ def parse(self, f):
+ """Parse f as an N-Triples file."""
+ if not hasattr(f, 'read'):
+ raise ParseError("Item to parse must be a file-like object.")
+
+ self.file = f
+ self.buffer = ''
+ while True:
+ self.line = self.readline()
+ if self.line is None: break
+ try: self.parseline()
+ except ParseError:
+ raise ParseError("Invalid line: %r" % self.line)
+ return self.sink
+
+ def parsestring(self, s):
+ """Parse s as an N-Triples string."""
+ if not isinstance(s, basestring):
+ raise ParseError("Item to parse must be a string instance.")
+ from cStringIO import StringIO
+ f = StringIO()
+ f.write(s)
+ f.seek(0)
+ self.parse(f)
+
+ def readline(self):
+ """Read an N-Triples line from buffered input."""
+ # N-Triples lines end in either CRLF, CR, or LF
+ # Therefore, we can't just use f.readline()
+ if not self.buffer:
+ buffer = self.file.read(bufsiz)
+ if not buffer: return None
+ self.buffer = buffer
+
+ while True:
+ m = r_line.match(self.buffer)
+ if m: # the more likely prospect
+ self.buffer = self.buffer[m.end():]
+ return m.group(1)
+ else:
+ buffer = self.file.read(bufsiz)
+ if not buffer:
+ raise ParseError("EOF in line")
+ self.buffer += buffer
+
+ def parseline(self):
+ self.eat(r_wspace)
+ if (not self.line) or self.line.startswith('#'):
+ return # The line is empty or a comment
+
+ subject = self.subject()
+ self.eat(r_wspaces)
+
+ predicate = self.predicate()
+ self.eat(r_wspaces)
+
+ object = self.object()
+ self.eat(r_tail)
+
+ if self.line:
+ raise ParseError("Trailing garbage")
+ self.sink.triple(subject, predicate, object)
+
+ def peek(self, token):
+ return self.line.startswith(token)
+
+ def eat(self, pattern):
+ m = pattern.match(self.line)
+ if not m: # @@ Why can't we get the original pattern?
+ raise ParseError("Failed to eat %s" % pattern)
+ self.line = self.line[m.end():]
+ return m
+
+ def subject(self):
+ # @@ Consider using dictionary cases
+ subj = self.uriref() or self.nodeid()
+ if not subj:
+ raise ParseError("Subject must be uriref or nodeID")
+ return subj
+
+ def predicate(self):
+ pred = self.uriref()
+ if not pred:
+ raise ParseError("Predicate must be uriref")
+ return pred
+
+ def object(self):
+ objt = self.uriref() or self.nodeid() or self.literal()
+ if not objt:
+ raise ParseError("Unrecognised object type")
+ return objt
+
+ def uriref(self):
+ if self.peek('<'):
+ uri = self.eat(r_uriref).group(1)
+ uri = unquote(uri)
+ uri = uriquote(uri)
+ return URI(uri)
+ return False
+
+ def nodeid(self):
+ if self.peek('_'):
+ return bNode(self.eat(r_nodeid).group(1))
+ return False
+
+ def literal(self):
+ if self.peek('"'):
+ lit, lang, dtype = self.eat(r_literal).groups()
+ if lang and dtype:
+ raise ParseError("Can't have both a language and a datatype")
+ lit = unquote(lit)
+ return Literal(lit, lang, dtype)
+ return False
+
+def parseURI(uri):
+ import urllib
+ parser = NTriplesParser()
+ u = urllib.urlopen(uri)
+ sink = parser.parse(u)
+ u.close()
+ return sink
+
+def main():
+ import sys
+ if len(sys.argv) == 2:
+ parseURI(sys.argv[1])
+ else: print __doc__
+
+if __name__=="__main__":
+ main()
View
@@ -0,0 +1,6 @@
+<http://dbpedia.org/resource/Galway> <http://www.w3.org/2000/01/rdf-schema#label> "Galway" .
+<http://dbpedia.org/resource/Galway> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/class/yago/CitiesInTheRepuBlicOfIreland> .
+<http://dbpedia.org/resource/Galway> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/class/yago/UniversityTowns> .
+<http://dbpedia.org/resource/Galway> <http://www.georss.org/georss/point> "53.27194444444444 -9.04888888888889"@en .
+<http://dbpedia.org/resource/Galway> <http://www.w3.org/2000/01/rdf-schema#comment> "Galway or City of Galway (Cathair na Gaillimhe) is a city in County Galway, Republic of Ireland. It is the fifth largest and the fastest-growing city in Ireland. It is also the third largest city within the Republic (although this is disputed by Limerick) and the only city in the Province of Connacht. Located on the west coast of Ireland, it sits on the River Corrib between Lough Corrib and Galway Bay. The population of Galway's urban area is 75,414 according to the 2011 census." .
+<http://dbpedia.org/resource/Galway> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Place> .

0 comments on commit 413ee42

Please sign in to comment.