Skip to content

Commit

Permalink
Handle legacy named character references correctly
Browse files Browse the repository at this point in the history
Fixes #67.
  • Loading branch information
mathiasbynens committed Sep 23, 2018
1 parent 24377e2 commit 903c6b5
Show file tree
Hide file tree
Showing 8 changed files with 157 additions and 106 deletions.
87 changes: 45 additions & 42 deletions he.js

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions package.json
Expand Up @@ -42,6 +42,7 @@
"devDependencies": { "devDependencies": {
"codecov.io": "^0.1.6", "codecov.io": "^0.1.6",
"grunt": "^0.4.5", "grunt": "^0.4.5",
"grunt-cli": "^1.3.1",
"grunt-shell": "^1.1.1", "grunt-shell": "^1.1.1",
"grunt-template": "^0.2.3", "grunt-template": "^0.2.3",
"istanbul": "^0.4.2", "istanbul": "^0.4.2",
Expand All @@ -50,6 +51,7 @@
"qunit-extras": "^1.4.5", "qunit-extras": "^1.4.5",
"qunitjs": "~1.11.0", "qunitjs": "~1.11.0",
"regenerate": "^1.2.1", "regenerate": "^1.2.1",
"regexgen": "^1.3.0",
"requirejs": "^2.1.22", "requirejs": "^2.1.22",
"sort-object": "^3.0.2" "sort-object": "^3.0.2"
} }
Expand Down
3 changes: 2 additions & 1 deletion scripts/export-data.js
Expand Up @@ -24,8 +24,9 @@ module.exports = {
'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)', 'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)',
'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'), 'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'),
'regexLegacyReferenceSource': require('./legacy-reference-regex.js'), 'regexLegacyReferenceSource': require('./legacy-reference-regex.js'),
'regexNamedReferenceSource': '&([0-9a-zA-Z]+);', 'regexNamedReferenceSource': require('./named-reference-regex.js'),
'stringInvalidCodePoints': require('./invalid-code-points-string.js'), 'stringInvalidCodePoints': require('./invalid-code-points-string.js'),
'regexAmbiguousAmpersand': '&([0-9a-zA-Z]+)',
'testDataMap': formatJSON('entities'), 'testDataMap': formatJSON('entities'),
'version': require('../package.json').version 'version': require('../package.json').version
}; };
2 changes: 1 addition & 1 deletion scripts/legacy-reference-regex.js
Expand Up @@ -2,6 +2,6 @@


const legacyReferences = require('../data/decode-legacy-named-references.json'); const legacyReferences = require('../data/decode-legacy-named-references.json');
const regexLegacyReference = '&(' + legacyReferences.join('|') + const regexLegacyReference = '&(' + legacyReferences.join('|') +
')([=a-zA-Z0-9])?'; ')(?!;)([=a-zA-Z0-9]?)';


module.exports = regexLegacyReference; module.exports = regexLegacyReference;
24 changes: 24 additions & 0 deletions scripts/named-reference-regex.js
@@ -0,0 +1,24 @@
'use strict';

// Build-time helper: exports the *source* (a string, not a RegExp) of a
// pattern matching any semicolon-terminated named character reference,
// e.g. `&copy;`. The reference name, without `&` and `;`, is the pattern's
// only capturing group.

// All reference names from the decode map, sorted by descending length.
const namedReferences = Object.keys(
	require('../data/decode-map.json')
).sort((a, b) => b.length - a.length);

// const Trie = require('regexgen').Trie;
// const trie = new Trie();
// trie.addAll(namedReferences);
// const pattern = trie.toString();
// console.log(pattern);
// → 12 KB instead of the 16 KB of the current output.
// However, the current output gzips better, and has better
// run-time performance.

// Verify all references consist of characters that don’t need escaping
// within regular expressions. (If this is not the case, then we can’t
// simply do a `join('|')`.)
// Note: `console.assert` is deliberately not used here. In Node.js it only
// logs to stderr and lets execution continue, which would silently export a
// broken pattern. Throwing aborts the build instead.
const unsafeReferences = namedReferences.filter((reference) => {
	return !/^[a-zA-Z0-9]+$/.test(reference);
});
if (unsafeReferences.length > 0) {
	throw new Error(
		'Named references must be alphanumeric to be joined without ' +
		'escaping; offending entries: ' + unsafeReferences.join(', ')
	);
}

const regexNamedReference = '&(' + namedReferences.join('|') + ');';

module.exports = regexNamedReference;
93 changes: 49 additions & 44 deletions src/he.js
Expand Up @@ -52,13 +52,15 @@
var regexInvalidEntity = /&#(?:[xX][^a-fA-F0-9]|[^0-9xX])/; var regexInvalidEntity = /&#(?:[xX][^a-fA-F0-9]|[^0-9xX])/;
var regexInvalidRawCodePoint = /<%= regexInvalidRawCodePoints %>/; var regexInvalidRawCodePoint = /<%= regexInvalidRawCodePoints %>/;
var regexDecode = /<%= var regexDecode = /<%=
regexNamedReferenceSource
%>|<%=
regexLegacyReferenceSource
%>|<%=
regexDecimalEscapeSource regexDecimalEscapeSource
%>|<%= %>|<%=
regexHexadecimalEscapeSource regexHexadecimalEscapeSource
%>|<%= %>|<%=
regexNamedReferenceSource regexAmbiguousAmpersand
%>|<%=
regexLegacyReferenceSource
%>/g; %>/g;
var decodeMap = <%= decodeMap %>; var decodeMap = <%= decodeMap %>;
var decodeMapLegacy = <%= decodeMapLegacy %>; var decodeMapLegacy = <%= decodeMapLegacy %>;
Expand Down Expand Up @@ -237,69 +239,72 @@
if (strict && regexInvalidEntity.test(html)) { if (strict && regexInvalidEntity.test(html)) {
parseError('malformed character reference'); parseError('malformed character reference');
} }
return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7) { return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7, $8) {
var codePoint; var codePoint;
var semicolon; var semicolon;
var decDigits; var decDigits;
var hexDigits; var hexDigits;
var reference; var reference;
var next; var next;

if ($1) { if ($1) {
reference = $1;
// Note: there is no need to check `has(decodeMap, reference)`.
return decodeMap[reference];
}

if ($2) {
// Decode named character references without trailing `;`, e.g. `&amp`.
// This is only a parse error if it gets converted to `&`, or if it is
// followed by `=` in an attribute context.
reference = $2;
next = $3;
if (next && options.isAttributeValue) {
if (strict && next == '=') {
parseError('`&` did not start a character reference');
}
return $0;
} else {
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
// Note: there is no need to check `has(decodeMapLegacy, reference)`.
return decodeMapLegacy[reference] + (next || '');
}
}
if ($4) {
// Decode decimal escapes, e.g. `&#119558;`. // Decode decimal escapes, e.g. `&#119558;`.
decDigits = $1; decDigits = $4;
semicolon = $2; semicolon = $5;
if (strict && !semicolon) { if (strict && !semicolon) {
parseError('character reference was not terminated by a semicolon'); parseError('character reference was not terminated by a semicolon');
} }
codePoint = parseInt(decDigits, 10); codePoint = parseInt(decDigits, 10);
return codePointToSymbol(codePoint, strict); return codePointToSymbol(codePoint, strict);
} }
if ($3) {
if ($6) {
// Decode hexadecimal escapes, e.g. `&#x1D306;`. // Decode hexadecimal escapes, e.g. `&#x1D306;`.
hexDigits = $3; hexDigits = $6;
semicolon = $4; semicolon = $7;
if (strict && !semicolon) { if (strict && !semicolon) {
parseError('character reference was not terminated by a semicolon'); parseError('character reference was not terminated by a semicolon');
} }
codePoint = parseInt(hexDigits, 16); codePoint = parseInt(hexDigits, 16);
return codePointToSymbol(codePoint, strict); return codePointToSymbol(codePoint, strict);
} }
if ($5) {
// Decode named character references with trailing `;`, e.g. `&copy;`. // If we’re still here, `if ($7)` is implied; it’s an ambiguous
reference = $5; // ampersand for sure. https://mths.be/notes/ambiguous-ampersands
if (has(decodeMap, reference)) { if (strict) {
return decodeMap[reference]; parseError(
} else { 'named character reference was not terminated by a semicolon'
// Ambiguous ampersand. https://mths.be/notes/ambiguous-ampersands );
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
return $0;
}
}
// If we’re still here, it’s a legacy reference for sure. No need for an
// extra `if` check.
// Decode named character references without trailing `;`, e.g. `&amp`
// This is only a parse error if it gets converted to `&`, or if it is
// followed by `=` in an attribute context.
reference = $6;
next = $7;
if (next && options.isAttributeValue) {
if (strict && next == '=') {
parseError('`&` did not start a character reference');
}
return $0;
} else {
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
// Note: there is no need to check `has(decodeMapLegacy, reference)`.
return decodeMapLegacy[reference] + (next || '');
} }
return $0;
}); });
}; };
// Expose default options (so they can be overridden globally). // Expose default options (so they can be overridden globally).
Expand Down
26 changes: 17 additions & 9 deletions tests/tests.js
Expand Up @@ -5979,6 +5979,10 @@
{ {
'decoded': 'a\u200Cb', 'decoded': 'a\u200Cb',
'encoded': 'a&zwnj;b' 'encoded': 'a&zwnj;b'
},
{
'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
} }
]; ];


Expand Down Expand Up @@ -6041,6 +6045,11 @@
'\u2209 \xACi \xACin \xA9123', '\u2209 \xACi \xACin \xA9123',
'Legacy named references (without a trailing semicolon)' 'Legacy named references (without a trailing semicolon)'
); );
equal(
he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
'&xxx; &xxx &thorn; &thorn &curren;t &current',
'Legacy named references'
);
equal( equal(
he.decode('a&#x1D306;b&#X0000000000001d306;c'), he.decode('a&#x1D306;b&#X0000000000001d306;c'),
'a\uD834\uDF06b\uD834\uDF06c', 'a\uD834\uDF06b\uD834\uDF06c',
Expand Down Expand Up @@ -6213,15 +6222,14 @@
'Parse error: `I\'m ¬it; I tell you`' 'Parse error: `I\'m ¬it; I tell you`'
); );
he.decode.options.strict = false; he.decode.options.strict = false;
raises( // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
function() { equal(
he.decode('I\'m &notit; I tell you', { he.decode('I\'m &notit; I tell you', {
'strict': true, 'strict': true,
'isAttributeValue': true 'isAttributeValue': true
}); }),
}, 'I\'m &notit; I tell you',
Error, 'No parse error: `I\'m &notit; I tell you` as attribute value'
'Parse error: `I\'m &notit; I tell you` as attribute value'
); );
equal( equal(
he.decode('I\'m &notit; I tell you', { he.decode('I\'m &notit; I tell you', {
Expand Down
26 changes: 17 additions & 9 deletions tests/tests.src.js
Expand Up @@ -5979,6 +5979,10 @@
{ {
'decoded': 'a\u200Cb', 'decoded': 'a\u200Cb',
'encoded': 'a&zwnj;b' 'encoded': 'a&zwnj;b'
},
{
'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
} }
]; ];


Expand Down Expand Up @@ -6041,6 +6045,11 @@
'\u2209 \xACi \xACin \xA9123', '\u2209 \xACi \xACin \xA9123',
'Legacy named references (without a trailing semicolon)' 'Legacy named references (without a trailing semicolon)'
); );
equal(
he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
'&xxx; &xxx &thorn; &thorn &curren;t &current',
'Legacy named references'
);
equal( equal(
he.decode('a&#x1D306;b&#X0000000000001d306;c'), he.decode('a&#x1D306;b&#X0000000000001d306;c'),
'a\uD834\uDF06b\uD834\uDF06c', 'a\uD834\uDF06b\uD834\uDF06c',
Expand Down Expand Up @@ -6213,15 +6222,14 @@
'Parse error: `I\'m ¬it; I tell you`' 'Parse error: `I\'m ¬it; I tell you`'
); );
he.decode.options.strict = false; he.decode.options.strict = false;
raises( // https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
function() { equal(
he.decode('I\'m &notit; I tell you', { he.decode('I\'m &notit; I tell you', {
'strict': true, 'strict': true,
'isAttributeValue': true 'isAttributeValue': true
}); }),
}, 'I\'m &notit; I tell you',
Error, 'No parse error: `I\'m &notit; I tell you` as attribute value'
'Parse error: `I\'m &notit; I tell you` as attribute value'
); );
equal( equal(
he.decode('I\'m &notit; I tell you', { he.decode('I\'m &notit; I tell you', {
Expand Down

0 comments on commit 903c6b5

Please sign in to comment.