pietercolpaert · pietercolpaert · Apr 14, 2017 · Apr 13, 2017 · Apr 13, 2017 · Apr 13, 2017
diff --git a/bin/validator.php b/bin/validator.php
@@ -11,7 +11,8 @@
 $errored = false;
 $finished = false;
 $tripleCount = 0;
-while (!$finished) {
+$line = true;
+while (!$finished && $line) {
     try {
         $line = fgets(STDIN);
         if ($line)

diff --git a/src/N3Lexer.php b/src/N3Lexer.php
@@ -5,18 +5,10 @@
 // **N3Lexer** tokenizes N3 documents.
 class N3Lexer
 {
-    //private $fromCharCode = String.fromCharCode; //TODO
-
     // Regular expression and replacement string to escape N3 strings.
     // Note how we catch invalid unicode sequences separately (they will trigger an error).
-    private $escapeSequence = '/\\[uU]|\\\(.)/';
-    private $escapeReplacements = [
-      '\\' => '\\', "'"=> "'", '"' => '"',
-      'n' => '\n', 'r' => '\r', 't' => '\t', 'f' => '\f', 'b' => '\b',
-      '_' => '_', '~' => '~', '.' => '.', '-' => '-', '!' => '!', '$' => '$', '&' => '&',
-      '(' => '(', ')' => ')', '*' => '*', '+' => '+', ',' => ',', ';' => ';', '=' => '=',
-      '/' => '/', '?' => '?', '#' => '#', '@' => '@', '%' => '%'
-    ];
+    private $escapeSequence = '/\\\\u([a-fA-F0-9]{4})|\\\\U([a-fA-F0-9]{8})|\\\\[uU]|\\\\(.)/';
+    private $escapeReplacements;
     private $illegalIriChars = '/[\x00-\x20<>\\"\{\}\|\^\`]/';
 
     private $input;
@@ -26,6 +18,13 @@ class N3Lexer
 
     public function __construct($options = []) {
         $this->initTokenize();
+        $this->escapeReplacements = [
+            '\\' => '\\', "'"=> "'", '"' => '"',
+            'n' => "\n", 'r' => "\r", 't' => "\t", 'f' => "\f", 'b' => chr(8),
+            '_' => '_', '~' => '~', '.' => '.', '-' => '-', '!' => '!', '$' => '$', '&' => '&',
+            '(' => '(', ')' => ')', '*' => '*', '+' => '+', ',' => ',', ';' => ';', '=' => '=',
+            '/' => '/', '?' => '?', '#' => '#', '@' => '@', '%' => '%'
+        ];
         // In line mode (N-Triples or N-Quads), only simple features may be parsed
         if ($options["lineMode"]) {
             // Don't tokenize special literals
@@ -53,29 +52,33 @@ public function __construct($options = []) {
     }
 
     // ## Regular expressions
-    private $iri ='/^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/'; // IRI with escape sequences; needs sanity check after unescaping
-    private $unescapedIri =  '/^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/'; // IRI without escape sequences; no unescaping
-    private $unescapedString= '/^"[^"\\\]+"(?=[^"\\\])/'; // non-empty string without escape sequences 
-    private $singleQuotedString= '/^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^\'[^\'\\]*(?:\\.[^\'\\]*)*\'(?=[^\'\\])/';
-    private $tripleQuotedString = '/^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^\'\'(\'[^\'\\]*(?:(?:\\.|\'(?!\'\'))[^\'\\]*)*\')\'\'/';
-    private $langcode =  '/^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i';
-    private $prefix = '/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)?:(?=[#\s<])/';
-
-    private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
+    //_iri:        /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/, // IRI with escape sequences; needs sanity check after unescaping
+    private $iri ='/^<((?:[^ <>{}\\\\]|\\\\[uU])+)>[ \\t]*/'; // IRI with escape sequences; needs sanity check after unescaping
+    //      _unescapedIri:    /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/, // IRI without escape sequences; no unescaping
+    private $unescapedIri =  '/^<([^\\x00-\\x20<>\\\\"\\{\\}\\|\\^\\`]*)>[ \\t]*/'; // IRI without escape sequences; no unescaping
+    //  _unescapedString:      /^"[^"\\]+"(?=[^"\\])/, // non-empty string without escape sequences
+    private $unescapedString= '/^"[^\\\\"]+"(?=[^\\\\"])/'; // non-empty string without escape sequences
+    //  _singleQuotedString:      /^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^'[^'\\]*(?:\\.[^'\\]*)*'(?=[^'\\])/,
+    private $singleQuotedString= '/^"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"(?=[^"\\\\])|^\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'(?=[^\'\\\\])/';
+    //  _tripleQuotedString:       /^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^''('[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*')''/,
+    private $tripleQuotedString = '/^""("[^\\\\"]*(?:(?:\\\\.|"(?!""))[^\\\\"]*)*")""|^\'\'(\'[^\\\\\']*(?:(?:\\\\.|\'(?!\'\'))[^\\\\\']*)*\')\'\'/';
+    private $langcode =  '/^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\\-])/i';
+    private $prefix = '/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)?:(?=[#\\s<])/';
 
-    //private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
-    private $variable = '/^\?(?:(?:[A-Z_a-z\xc0-\xd6\xd8-\xf6])(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)(?=[.,;!\^\s#()\[\]\{\}"\'<])/';
+    private $prefixed = "/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)?:((?:(?:[0-:A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])(?:(?:[\\.\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])*(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~]))?)?)(?:[ \\t]+|(?=\.?[,;!\\^\\s#()\\[\\]\\{\\}\"'<]))/";
+    //OLD VERSION private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
+    private $variable = '/^\\?(?:(?:[A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?=[.,;!\\^\\s#()\\[\\]\\{\\}"\'<])/';
 
-    private $blank = '/^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)(?:[ \t]+|(?=\.?[,;:\s#()\[\]\{\}"\'<]))/';
-    private $number = "/^[\-+]?(?:\d+\.?\d*([eE](?:[\-\+])?\d+)|\d*\.?\d+)(?=[.,;:\s#()\[\]\{\}\"'<])/";
-    private $boolean = '/^(?:true|false)(?=[.,;\s#()\[\]\{\}"\'<])/';
-    private $keyword = '/^@[a-z]+(?=[\s#<])/i';
-    private $sparqlKeyword= '/^(?:PREFIX|BASE|GRAPH)(?=[\s#<])/i';
-    private $shortPredicates= '/^a(?=\s+|<)/';
-    private $newline= '/^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/';
-    private $comment= '/#([^\n\r]*)/';
-    private $whitespace= '/^[ \t]+/';
-    private $endOfFile= '/^(?:#[^\n\r]*)?$/';
+    private $blank = '/^_:((?:[0-9A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?:[ \\t]+|(?=\\.?[,;:\\s#()\\[\\]\\{\\}"\'<]))/';
+    private $number = "/^[\\-+]?(?:\\d+\\.?\\d*([eE](?:[\\-\\+])?\\d+)|\\d*\\.?\\d+)(?=[.,;:\\s#()\\[\\]\\{\\}\"'<])/";
+    private $boolean = '/^(?:true|false)(?=[.,;\\s#()\\[\\]\\{\\}"\'<])/';
+    private $keyword = '/^@[a-z]+(?=[\\s#<])/i';
+    private $sparqlKeyword= '/^(?:PREFIX|BASE|GRAPH)(?=[\\s#<])/i';
+    private $shortPredicates= '/^a(?=\\s+|<)/';
+    private $newline= '/^[ \\t]*(?:#[^\\n\\r]*)?(?:\\r\\n|\\n|\\r)[ \\t]*/';
+    private $comment= '/#([^\\n\\r]*)/';
+    private $whitespace= '/^[ \\t]+/';
+    private $endOfFile= '/^(?:#[^\\n\\r]*)?$/';
 
     // ## Private methods
     // ### `_tokenizeToEnd` tokenizes as for as possible, emitting tokens through the callback
@@ -166,7 +169,7 @@ private function tokenizeToEnd($callback, $inputFinished) {
                     // Try to find a full IRI with escape sequences
                     else if (preg_match($this->iri, $input, $match)) {
                         $unescaped = $this->unescape($match[1]);
-                        if ($unescaped === null || preg_match($illegalIriChars,$unescaped))
+                        if ($unescaped === null || preg_match($this->illegalIriChars,$unescaped))
                             return $reportSyntaxError($this);
                         $type = 'IRI';
                         $value = $unescaped;
@@ -177,7 +180,6 @@ private function tokenizeToEnd($callback, $inputFinished) {
                         $matchLength = 2;
                         $value = 'http://www.w3.org/2000/10/swap/log#implies';
                     }
-
                     break;
                 case '_':
                     // Try to find a blank node. Since it can contain (but not end with) a dot,
@@ -198,24 +200,24 @@ private function tokenizeToEnd($callback, $inputFinished) {
                         $type = 'literal';
                         $value = $match[0];
                     }
-                // Try to find any other literal wrapped in a pair of single or double quotes
+                    // Try to find any other literal wrapped in a pair of single or double quotes
                     else if (preg_match($this->singleQuotedString, $input, $match)) {
                         $unescaped = $this->unescape($match[0]);
                         if ($unescaped === null)
                             return $reportSyntaxError($this);
                         $type = 'literal';
-                        $value = preg_replace('/^'|'$/g', '"',$unescaped);
+                        $value = preg_replace('/^\'|\'$/', '"',$unescaped);
                     }
                     // Try to find a literal wrapped in three pairs of single or double quotes
                     else if (preg_match($this->tripleQuotedString, $input, $match)) {
                         $unescaped = isset($match[1])?$match[1]:$match[2];
                         // Count the newlines and advance line counter
-                        $this->line .= strlen(preg_split('/\r\n|\r|\n/',$unescaped)) - 1;
+                        $this->line += sizeof(preg_split('/\r\n|\r|\n/',$unescaped)) - 1;
                         $unescaped = $this->unescape($unescaped);
                         if ($unescaped === null)
                             return $reportSyntaxError($this);
                         $type = 'literal';
-                        $value = preg_replace("/^'|'$/g", '"',$unescaped);
+                        $value = preg_replace("/^'|'$/", '"',$unescaped);
                     }
                 break;
 
@@ -382,7 +384,11 @@ private function tokenizeToEnd($callback, $inputFinished) {
 
     // ### `_unescape` replaces N3 escape codes by their corresponding characters
     private function unescape($item) {
-        return preg_replace_callback($this->escapeSequence, function ($sequence, $unicode4, $unicode8, $escapedChar) {
+        return preg_replace_callback($this->escapeSequence, function ($match) {
+            $sequence = $match[0];
+            $unicode4 = isset($match[1])?$match[1]:null;
+            $unicode8 = isset($match[2])?$match[2]:null;
+            $escapedChar = isset($match[3])?$match[3]:null;
             $charCode;
             if ($unicode4) {
                 $charCode = intval($unicode4, 16);
@@ -391,14 +397,11 @@ private function unescape($item) {
             else if ($unicode8) {
                 $charCode = intval($unicode8, 16);
                 return mb_convert_encoding('&#' . intval($charCode) . ';', 'UTF-8', 'HTML-ENTITIES');
-                //if ($charCode <= 0xFFFF) return fromCharCode($charCode);
-                //return fromCharCode(0xD800 . (($charCode -= 0x10000) / 0x400), 0xDC00 . ($charCode & 0x3FF));
             }
             else {
-                $replacement = $this->escapeReplacements[$escapedChar];
-                if (!$replacement)
+                if (!isset($this->escapeReplacements[$escapedChar]))
                     throw new \Exception();
-                return $replacement;
+                return $this->escapeReplacements[$escapedChar];
             }
         },$item);
     }

diff --git a/src/TriGParser.php b/src/TriGParser.php
@@ -837,7 +837,10 @@ private function initReaders ()
 
         // ### `_error` emits an error message through the callback
         $this->error = function ($message, $token) {
-            call_user_func($this->callback, new \Exception($message . ' on line ' . $token['line'] . '.'),null);
+            $exception = new \Exception($message . ' on line ' . $token['line'] . '.'); // message carries the token's line number
+            if (!$this->callback) // no callback registered: raise the error to the caller instead
+                throw $exception;
+            call_user_func($this->callback, $exception, null); // callback receives (error, null)
         };
 
         // ### `_resolveIRI` resolves a relative IRI token against the base path,
@@ -994,7 +997,10 @@ public function parse($input, $tripleCallback = null, $prefixCallback = null, $f
                 }
             }
         } catch (\Exception $e) {
-            call_user_func($this->callback, $e, null);
+            if ($this->callback)
+                call_user_func($this->callback, $e, null);
+            else
+                throw $e;
             $this->callback = function () {};
         }
     }

diff --git a/src/TriGWriter.php b/src/TriGWriter.php
@@ -6,14 +6,13 @@
 class TriGWriter
 {
     // Matches a literal as represented in memory by the N3 library
-    CONST LITERALMATCHER = '/^"(.*)"(?:\^\^(.+)|@([\-a-z]+))?$/is';
+    CONST LITERALMATCHER = '/^"(.*)"(?:\\^\\^(.+)|@([\\-a-z]+))?$/is';
     // rdf:type predicate (for 'a' abbreviation)
     CONST RDF_PREFIX = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
     CONST RDF_TYPE   = self::RDF_PREFIX . 'type';
 
     // Characters in literals that require escaping
-    CONST ESCAPE = "/[\"\\\t\n\r\f]/u"; #/u';
-    CONST ESCAPEALL = "/[\"\\\t\n\r\b\f]/u";
+    CONST ESCAPE =    "/[\"\\\\\\t\\n\\r\\b\\f]/";
     //HHVM does not allow this to be a constant
     private $ESCAPEREPLACEMENTS;
 
@@ -29,7 +28,7 @@ public function __construct($options = [])
     {
         $this->ESCAPEREPLACEMENTS = [
             '\\' => '\\\\', '"' => '\\"', "\t" => "\\t",
-            "\n" => '\\n', "\r" => "\\r", "\b"=> "\\b", "\f"=> "\\f"
+            "\n" => '\\n', "\r" => "\\r", chr(8) => "\\b", "\f"=> "\\f"
         ];
         $this->initWriter ();
         /* Initialize writer, depending on the format*/
@@ -44,29 +43,15 @@ public function __construct($options = [])
             $this->writeTriple = $this->writeTripleLine;
         }
 
-        // TODO: I think we could do without this...
-        /*$this->characterReplacer = function ($character) {
+        $this->characterReplacer = function ($character) {
             // Replace a single character by its escaped version
             $character = $character[0];
-            if (strlen($character) > 0 && isset(self::ESCAPEREPLACEMENTS[$character[0]])) {
-                return self::ESCAPEREPLACEMENTS[$character[0]];
+            if (strlen($character) > 0 && isset($this->ESCAPEREPLACEMENTS[$character[0]])) {
+                return $this->ESCAPEREPLACEMENTS[$character[0]];
             } else {
-                // Replace a single character with its 4-bit unicode escape sequence
-                $result = "";
-                if (strlen($character) === 1) {
-                    //TODO
-                    //$result = $character.charCodeAt(0).toString(16);
-                    //$result = \'\\u0000\'.substr(0, 6 - strlen($result)) + $result;
-                }
-                // Replace a surrogate pair with its 8-bit unicode escape sequence
-                else {
-                    //$result = (($character.charCodeAt(0) - 0xD800) * 0x400 +
-                    //$character.charCodeAt(1) + 0x2400).toString(16);
-                    //$result = \'\\U00000000\'.substr(0, 10 - strlen($result)) + $result;
-                }
-                return $result;
+                return $character; // no replacement defined: keep the matched text as-is (previously returned an undefined $result, which dropped the character)
             }
-            };*/
+        };
     }
 
     private function initWriter () 
@@ -163,8 +148,8 @@ private function encodeIriOrBlankNode ($entity) {
             return $entity;
         }
         // Escape special characters
-        //if (preg_match(self::ESCAPE, $entity))
-        //    $entity = preg_replace_callback(self::ESCAPEALL, $this->characterReplacer,$entity);
+        if (preg_match(self::ESCAPE, $entity))
+            $entity = preg_replace_callback(self::ESCAPE, $this->characterReplacer,$entity);
 
         // Try to represent the IRI as prefixed name
         preg_match($this->prefixRegex, $entity, $prefixMatch);
@@ -181,8 +166,15 @@ private function encodeIriOrBlankNode ($entity) {
 
     // ### `_encodeLiteral` represents a literal
     private function encodeLiteral ($value, $type = null, $language = null) {
+        //TODO: change back to a single quote and escape all the other things
         // Escape special characters - TODO: unicode characters?
-        if (preg_match('/[\t\n\r\f]/',$value)) {
+
+        // Escape special characters
+        if (preg_match(self::ESCAPE, $value))
+            $value = preg_replace_callback(self::ESCAPE, $this->characterReplacer,$value);
+
+
+        /*if (preg_match('/[\\t\\n\\r\\f]/',$value)) {
 
             $value = str_replace(array('\\', '"""'), array('\\\\', '\\"""'), $value);
 
@@ -194,10 +186,11 @@ private function encodeLiteral ($value, $type = null, $language = null) {
             }
             // enclose between 3 double quotes
             $value = '"""' . $value . '"""';
-        } else {
+            } else {*/
             // enclose in double quotes, while escaping back slashes
-            $value = '"' . str_replace(array('\\', '"'), array('\\\\', '\\"'), $value) . '"';
-        }
+//            $value = '"' . str_replace(array('\\', '"'), array('\\\\', '\\"'), $value) . '"';
+//        }
+        $value = '"' . $value . '"';
 
         // Write the literal, possibly with type or language
         if (isset($language))