Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed escaping unicode characters and newlines, tabs, etc. #6

Merged
merged 7 commits into from
Apr 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion bin/validator.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@
$errored = false;
$finished = false;
$tripleCount = 0;
while (!$finished) {
$line = true;
while (!$finished && $line) {
try {
$line = fgets(STDIN);
if ($line)
Expand Down
89 changes: 46 additions & 43 deletions src/N3Lexer.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,10 @@
// **N3Lexer** tokenizes N3 documents.
class N3Lexer
{
//private $fromCharCode = String.fromCharCode; //TODO

// Regular expression and replacement string to escape N3 strings.
// Note how we catch invalid unicode sequences separately (they will trigger an error).
private $escapeSequence = '/\\[uU]|\\\(.)/';
private $escapeReplacements = [
'\\' => '\\', "'"=> "'", '"' => '"',
'n' => '\n', 'r' => '\r', 't' => '\t', 'f' => '\f', 'b' => '\b',
'_' => '_', '~' => '~', '.' => '.', '-' => '-', '!' => '!', '$' => '$', '&' => '&',
'(' => '(', ')' => ')', '*' => '*', '+' => '+', ',' => ',', ';' => ';', '=' => '=',
'/' => '/', '?' => '?', '#' => '#', '@' => '@', '%' => '%'
];
private $escapeSequence = '/\\\\u([a-fA-F0-9]{4})|\\\\U([a-fA-F0-9]{8})|\\\\[uU]|\\\\(.)/';
private $escapeReplacements;
private $illegalIriChars = '/[\x00-\x20<>\\"\{\}\|\^\`]/';

private $input;
Expand All @@ -26,6 +18,13 @@ class N3Lexer

public function __construct($options = []) {
$this->initTokenize();
$this->escapeReplacements = [
'\\' => '\\', "'"=> "'", '"' => '"',
'n' => "\n", 'r' => "\r", 't' => "\t", 'f' => "\f", 'b' => chr(8),
'_' => '_', '~' => '~', '.' => '.', '-' => '-', '!' => '!', '$' => '$', '&' => '&',
'(' => '(', ')' => ')', '*' => '*', '+' => '+', ',' => ',', ';' => ';', '=' => '=',
'/' => '/', '?' => '?', '#' => '#', '@' => '@', '%' => '%'
];
// In line mode (N-Triples or N-Quads), only simple features may be parsed
if ($options["lineMode"]) {
// Don't tokenize special literals
Expand Down Expand Up @@ -53,29 +52,33 @@ public function __construct($options = []) {
}

// ## Regular expressions
private $iri ='/^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/'; // IRI with escape sequences; needs sanity check after unescaping
private $unescapedIri = '/^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/'; // IRI without escape sequences; no unescaping
private $unescapedString= '/^"[^"\\\]+"(?=[^"\\\])/'; // non-empty string without escape sequences
private $singleQuotedString= '/^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^\'[^\'\\]*(?:\\.[^\'\\]*)*\'(?=[^\'\\])/';
private $tripleQuotedString = '/^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^\'\'(\'[^\'\\]*(?:(?:\\.|\'(?!\'\'))[^\'\\]*)*\')\'\'/';
private $langcode = '/^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\-])/i';
private $prefix = '/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)?:(?=[#\s<])/';

private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
//_iri: /^<((?:[^ <>{}\\]|\\[uU])+)>[ \t]*/, // IRI with escape sequences; needs sanity check after unescaping
private $iri ='/^<((?:[^ <>{}\\\\]|\\\\[uU])+)>[ \\t]*/'; // IRI with escape sequences; needs sanity check after unescaping
// _unescapedIri: /^<([^\x00-\x20<>\\"\{\}\|\^\`]*)>[ \t]*/, // IRI without escape sequences; no unescaping
private $unescapedIri = '/^<([^\\x00-\\x20<>\\\\"\\{\\}\\|\\^\\`]*)>[ \\t]*/'; // IRI without escape sequences; no unescaping
// _unescapedString: /^"[^"\\]+"(?=[^"\\])/, // non-empty string without escape sequences
private $unescapedString= '/^"[^\\\\"]+"(?=[^\\\\"])/'; // non-empty string without escape sequences
// _singleQuotedString: /^"[^"\\]*(?:\\.[^"\\]*)*"(?=[^"\\])|^'[^'\\]*(?:\\.[^'\\]*)*'(?=[^'\\])/,
private $singleQuotedString= '/^"[^"\\\\]*(?:\\\\.[^"\\\\]*)*"(?=[^"\\\\])|^\'[^\'\\\\]*(?:\\\\.[^\'\\\\]*)*\'(?=[^\'\\\\])/';
// _tripleQuotedString: /^""("[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*")""|^''('[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*')''/,
private $tripleQuotedString = '/^""("[^\\\\"]*(?:(?:\\\\.|"(?!""))[^\\\\"]*)*")""|^\'\'(\'[^\\\\\']*(?:(?:\\\\.|\'(?!\'\'))[^\\\\\']*)*\')\'\'/';
private $langcode = '/^@([a-z]+(?:-[a-z0-9]+)*)(?=[^a-z0-9\\-])/i';
private $prefix = '/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)?:(?=[#\\s<])/';

//private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
private $variable = '/^\?(?:(?:[A-Z_a-z\xc0-\xd6\xd8-\xf6])(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)(?=[.,;!\^\s#()\[\]\{\}"\'<])/';
private $prefixed = "/^((?:[A-Za-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)?:((?:(?:[0-:A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])(?:(?:[\\.\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~])*(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6]|%[0-9a-fA-F]{2}|\\\\[!#-\\/;=?\\-@_~]))?)?)(?:[ \\t]+|(?=\.?[,;!\\^\\s#()\\[\\]\\{\\}\"'<]))/";
//OLD VERSION private $prefixed = "/^((?:[A-Za-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff])*)?:((?:(?:[0-:A-Z_a-z\xc0-\xd6\xd8-\xf6]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])(?:(?:[\.\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~])*(?:[\-0-:A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6\xf8-\u037d\u037f-\u1fff\u200c\u200d\u203f\u2040\u2070-\u218f\u2c00-\u2fef\u3001-\ud7ff\uf900-\ufdcf\ufdf0-\ufffd]|[\ud800-\udb7f][\udc00-\udfff]|%[0-9a-fA-F]{2}|\\[!#-\/;=?\-@_~]))?)?)(?:[ \t]+|(?=\.?[,;!\^\s#()\[\]\{\}\"'<]))/";
private $variable = '/^\\?(?:(?:[A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:[\\-0-:A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?=[.,;!\\^\\s#()\\[\\]\\{\\}"\'<])/';

private $blank = '/^_:((?:[0-9A-Z_a-z\xc0-\xd6\xd8-\xf6])(?:\.?[\-0-9A-Z_a-z\xb7\xc0-\xd6\xd8-\xf6])*)(?:[ \t]+|(?=\.?[,;:\s#()\[\]\{\}"\'<]))/';
private $number = "/^[\-+]?(?:\d+\.?\d*([eE](?:[\-\+])?\d+)|\d*\.?\d+)(?=[.,;:\s#()\[\]\{\}\"'<])/";
private $boolean = '/^(?:true|false)(?=[.,;\s#()\[\]\{\}"\'<])/';
private $keyword = '/^@[a-z]+(?=[\s#<])/i';
private $sparqlKeyword= '/^(?:PREFIX|BASE|GRAPH)(?=[\s#<])/i';
private $shortPredicates= '/^a(?=\s+|<)/';
private $newline= '/^[ \t]*(?:#[^\n\r]*)?(?:\r\n|\n|\r)[ \t]*/';
private $comment= '/#([^\n\r]*)/';
private $whitespace= '/^[ \t]+/';
private $endOfFile= '/^(?:#[^\n\r]*)?$/';
private $blank = '/^_:((?:[0-9A-Z_a-z\\xc0-\\xd6\\xd8-\\xf6])(?:\\.?[\\-0-9A-Z_a-z\\xb7\\xc0-\\xd6\\xd8-\\xf6])*)(?:[ \\t]+|(?=\\.?[,;:\\s#()\\[\\]\\{\\}"\'<]))/';
private $number = "/^[\\-+]?(?:\\d+\\.?\\d*([eE](?:[\\-\\+])?\\d+)|\\d*\\.?\\d+)(?=[.,;:\\s#()\\[\\]\\{\\}\"'<])/";
private $boolean = '/^(?:true|false)(?=[.,;\\s#()\\[\\]\\{\\}"\'<])/';
private $keyword = '/^@[a-z]+(?=[\\s#<])/i';
private $sparqlKeyword= '/^(?:PREFIX|BASE|GRAPH)(?=[\\s#<])/i';
private $shortPredicates= '/^a(?=\\s+|<)/';
private $newline= '/^[ \\t]*(?:#[^\\n\\r]*)?(?:\\r\\n|\\n|\\r)[ \\t]*/';
private $comment= '/#([^\\n\\r]*)/';
private $whitespace= '/^[ \\t]+/';
private $endOfFile= '/^(?:#[^\\n\\r]*)?$/';

// ## Private methods
// ### `_tokenizeToEnd` tokenizes as far as possible, emitting tokens through the callback
Expand Down Expand Up @@ -166,7 +169,7 @@ private function tokenizeToEnd($callback, $inputFinished) {
// Try to find a full IRI with escape sequences
else if (preg_match($this->iri, $input, $match)) {
$unescaped = $this->unescape($match[1]);
if ($unescaped === null || preg_match($illegalIriChars,$unescaped))
if ($unescaped === null || preg_match($this->illegalIriChars,$unescaped))
return $reportSyntaxError($this);
$type = 'IRI';
$value = $unescaped;
Expand All @@ -177,7 +180,6 @@ private function tokenizeToEnd($callback, $inputFinished) {
$matchLength = 2;
$value = 'http://www.w3.org/2000/10/swap/log#implies';
}

break;
case '_':
// Try to find a blank node. Since it can contain (but not end with) a dot,
Expand All @@ -198,24 +200,24 @@ private function tokenizeToEnd($callback, $inputFinished) {
$type = 'literal';
$value = $match[0];
}
// Try to find any other literal wrapped in a pair of single or double quotes
// Try to find any other literal wrapped in a pair of single or double quotes
else if (preg_match($this->singleQuotedString, $input, $match)) {
$unescaped = $this->unescape($match[0]);
if ($unescaped === null)
return $reportSyntaxError($this);
$type = 'literal';
$value = preg_replace('/^'|'$/g', '"',$unescaped);
$value = preg_replace('/^\'|\'$/', '"',$unescaped);
}
// Try to find a literal wrapped in three pairs of single or double quotes
else if (preg_match($this->tripleQuotedString, $input, $match)) {
$unescaped = isset($match[1])?$match[1]:$match[2];
// Count the newlines and advance line counter
$this->line .= strlen(preg_split('/\r\n|\r|\n/',$unescaped)) - 1;
$this->line += sizeof(preg_split('/\r\n|\r|\n/',$unescaped)) - 1;
$unescaped = $this->unescape($unescaped);
if ($unescaped === null)
return $reportSyntaxError($this);
$type = 'literal';
$value = preg_replace("/^'|'$/g", '"',$unescaped);
$value = preg_replace("/^'|'$/", '"',$unescaped);
}
break;

Expand Down Expand Up @@ -382,7 +384,11 @@ private function tokenizeToEnd($callback, $inputFinished) {

// ### `_unescape` replaces N3 escape codes by their corresponding characters
private function unescape($item) {
return preg_replace_callback($this->escapeSequence, function ($sequence, $unicode4, $unicode8, $escapedChar) {
return preg_replace_callback($this->escapeSequence, function ($match) {
$sequence = $match[0];
$unicode4 = isset($match[1])?$match[1]:null;
$unicode8 = isset($match[2])?$match[2]:null;
$escapedChar = isset($match[3])?$match[3]:null;
$charCode;
if ($unicode4) {
$charCode = intval($unicode4, 16);
Expand All @@ -391,14 +397,11 @@ private function unescape($item) {
else if ($unicode8) {
$charCode = intval($unicode8, 16);
return mb_convert_encoding('&#' . intval($charCode) . ';', 'UTF-8', 'HTML-ENTITIES');
//if ($charCode <= 0xFFFF) return fromCharCode($charCode);
//return fromCharCode(0xD800 . (($charCode -= 0x10000) / 0x400), 0xDC00 . ($charCode & 0x3FF));
}
else {
$replacement = $this->escapeReplacements[$escapedChar];
if (!$replacement)
if (!isset($this->escapeReplacements[$escapedChar]))
throw new \Exception();
return $replacement;
return $this->escapeReplacements[$escapedChar];
}
},$item);
}
Expand Down
10 changes: 8 additions & 2 deletions src/TriGParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -837,7 +837,10 @@ private function initReaders ()

// ### `_error` emits an error message through the callback
$this->error = function ($message, $token) {
call_user_func($this->callback, new \Exception($message . ' on line ' . $token['line'] . '.'),null);
if ($this->callback)
call_user_func($this->callback, new \Exception($message . ' on line ' . $token['line'] . '.'),null);
else
throw new \Exception($message . ' on line ' . $token['line'] . '.');
};

// ### `_resolveIRI` resolves a relative IRI token against the base path,
Expand Down Expand Up @@ -994,7 +997,10 @@ public function parse($input, $tripleCallback = null, $prefixCallback = null, $f
}
}
} catch (\Exception $e) {
call_user_func($this->callback, $e, null);
if ($this->callback)
call_user_func($this->callback, $e, null);
else
throw $e;
$this->callback = function () {};
}
}
Expand Down
51 changes: 22 additions & 29 deletions src/TriGWriter.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
class TriGWriter
{
// Matches a literal as represented in memory by the N3 library
CONST LITERALMATCHER = '/^"(.*)"(?:\^\^(.+)|@([\-a-z]+))?$/is';
CONST LITERALMATCHER = '/^"(.*)"(?:\\^\\^(.+)|@([\\-a-z]+))?$/is';
// rdf:type predicate (for 'a' abbreviation)
CONST RDF_PREFIX = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
CONST RDF_TYPE = self::RDF_PREFIX . 'type';

// Characters in literals that require escaping
CONST ESCAPE = "/[\"\\\t\n\r\f]/u"; #/u';
CONST ESCAPEALL = "/[\"\\\t\n\r\b\f]/u";
CONST ESCAPE = "/[\"\\\\\\t\\n\\r\\b\\f]/";
//HHVM does not allow this to be a constant
private $ESCAPEREPLACEMENTS;

Expand All @@ -29,7 +28,7 @@ public function __construct($options = [])
{
$this->ESCAPEREPLACEMENTS = [
'\\' => '\\\\', '"' => '\\"', "\t" => "\\t",
"\n" => '\\n', "\r" => "\\r", "\b"=> "\\b", "\f"=> "\\f"
"\n" => '\\n', "\r" => "\\r", chr(8) => "\\b", "\f"=> "\\f"
];
$this->initWriter ();
/* Initialize writer, depending on the format*/
Expand All @@ -44,29 +43,15 @@ public function __construct($options = [])
$this->writeTriple = $this->writeTripleLine;
}

// TODO: I think we could do without this...
/*$this->characterReplacer = function ($character) {
$this->characterReplacer = function ($character) {
// Replace a single character by its escaped version
$character = $character[0];
if (strlen($character) > 0 && isset(self::ESCAPEREPLACEMENTS[$character[0]])) {
return self::ESCAPEREPLACEMENTS[$character[0]];
if (strlen($character) > 0 && isset($this->ESCAPEREPLACEMENTS[$character[0]])) {
return $this->ESCAPEREPLACEMENTS[$character[0]];
} else {
// Replace a single character with its 4-digit unicode escape sequence
$result = "";
if (strlen($character) === 1) {
//TODO
//$result = $character.charCodeAt(0).toString(16);
//$result = \'\\u0000\'.substr(0, 6 - strlen($result)) + $result;
}
// Replace a surrogate pair with its 8-digit unicode escape sequence
else {
//$result = (($character.charCodeAt(0) - 0xD800) * 0x400 +
//$character.charCodeAt(1) + 0x2400).toString(16);
//$result = \'\\U00000000\'.substr(0, 10 - strlen($result)) + $result;
}
return $result;
return $result; //no escaping necessary, should not happen, or something is wrong in our regex
}
};*/
};
}

private function initWriter ()
Expand Down Expand Up @@ -163,8 +148,8 @@ private function encodeIriOrBlankNode ($entity) {
return $entity;
}
// Escape special characters
//if (preg_match(self::ESCAPE, $entity))
// $entity = preg_replace_callback(self::ESCAPEALL, $this->characterReplacer,$entity);
if (preg_match(self::ESCAPE, $entity))
$entity = preg_replace_callback(self::ESCAPE, $this->characterReplacer,$entity);

// Try to represent the IRI as prefixed name
preg_match($this->prefixRegex, $entity, $prefixMatch);
Expand All @@ -181,8 +166,15 @@ private function encodeIriOrBlankNode ($entity) {

// ### `_encodeLiteral` represents a literal
private function encodeLiteral ($value, $type = null, $language = null) {
//TODO: change back to a single quote and escape all the other things
// Escape special characters - TODO: unicode characters?
if (preg_match('/[\t\n\r\f]/',$value)) {

// Escape special characters
if (preg_match(self::ESCAPE, $value))
$value = preg_replace_callback(self::ESCAPE, $this->characterReplacer,$value);


/*if (preg_match('/[\\t\\n\\r\\f]/',$value)) {

$value = str_replace(array('\\', '"""'), array('\\\\', '\\"""'), $value);

Expand All @@ -194,10 +186,11 @@ private function encodeLiteral ($value, $type = null, $language = null) {
}
// enclose between 3 double quotes
$value = '"""' . $value . '"""';
} else {
} else {*/
// enclose in double quotes, while escaping back slashes
$value = '"' . str_replace(array('\\', '"'), array('\\\\', '\\"'), $value) . '"';
}
// $value = '"' . str_replace(array('\\', '"'), array('\\\\', '\\"'), $value) . '"';
// }
$value = '"' . $value . '"';

// Write the literal, possibly with type or language
if (isset($language))
Expand Down
Loading