Permalink
Browse files

Handle legacy named character references correctly

Fixes #67.
  • Loading branch information...
mathiasbynens committed Sep 23, 2018
1 parent 24377e2 commit 903c6b52f82259298dbaecbea04d26232bf61370
Showing with 157 additions and 106 deletions.
  1. +45 −42 he.js
  2. +2 −0 package.json
  3. +2 −1 scripts/export-data.js
  4. +1 −1 scripts/legacy-reference-regex.js
  5. +24 −0 scripts/named-reference-regex.js
  6. +49 −44 src/he.js
  7. +17 −9 tests/tests.js
  8. +17 −9 tests/tests.src.js
87 he.js

Large diffs are not rendered by default.

Oops, something went wrong.
@@ -42,6 +42,7 @@
"devDependencies": {
"codecov.io": "^0.1.6",
"grunt": "^0.4.5",
"grunt-cli": "^1.3.1",
"grunt-shell": "^1.1.1",
"grunt-template": "^0.2.3",
"istanbul": "^0.4.2",
@@ -50,6 +51,7 @@
"qunit-extras": "^1.4.5",
"qunitjs": "~1.11.0",
"regenerate": "^1.2.1",
"regexgen": "^1.3.0",
"requirejs": "^2.1.22",
"sort-object": "^3.0.2"
}
@@ -24,8 +24,9 @@ module.exports = {
'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)',
'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'),
'regexLegacyReferenceSource': require('./legacy-reference-regex.js'),
'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
'regexNamedReferenceSource': require('./named-reference-regex.js'),
'stringInvalidCodePoints': require('./invalid-code-points-string.js'),
'regexAmbiguousAmpersand': '&([0-9a-zA-Z]+)',
'testDataMap': formatJSON('entities'),
'version': require('../package.json').version
};
@@ -2,6 +2,6 @@

const legacyReferences = require('../data/decode-legacy-named-references.json');
const regexLegacyReference = '&(' + legacyReferences.join('|') +
')([=a-zA-Z0-9])?';
')(?!;)([=a-zA-Z0-9]?)';

module.exports = regexLegacyReference;
@@ -0,0 +1,24 @@
'use strict';

// Build-time helper that exports the *source* of a regular expression
// matching any semicolon-terminated named character reference, e.g. `&copy;`.
// The reference name (without the leading `&` and the trailing `;`) is
// captured in the pattern's only group.

// All reference names from the decode map, longest names first.
const namedReferences = Object.keys(
	require('../data/decode-map.json')
).sort((a, b) => b.length - a.length);

// Alternative that was considered and rejected:
// const Trie = require('regexgen').Trie;
// const trie = new Trie();
// trie.addAll(namedReferences);
// const pattern = trie.toString();
// console.log(pattern);
// → 12 KB instead of the 16 KB of the current output.
// However, the current output gzips better, and has better
// run-time performance.

// Verify all references consist of characters that don’t need escaping
// within regular expressions. (If this is not the case, then we can’t
// simply do a `join('|')`.)
// Note: `console.assert` is deliberately not used here. In Node.js it only
// prints a message and lets execution continue, which would silently export
// a broken pattern. Fail the build loudly instead.
const unsafeReferences = namedReferences.filter((reference) => {
	return !/^[a-zA-Z0-9]+$/.test(reference);
});
if (unsafeReferences.length > 0) {
	throw new Error(
		'Named references must be escaped before being joined into a ' +
		'regular expression: ' + unsafeReferences.join(', ')
	);
}

const regexNamedReference = '&(' + namedReferences.join('|') + ');';

module.exports = regexNamedReference;
@@ -52,13 +52,15 @@
var regexInvalidEntity = /&#(?:[xX][^a-fA-F0-9]|[^0-9xX])/;
var regexInvalidRawCodePoint = /<%= regexInvalidRawCodePoints %>/;
var regexDecode = /<%=
regexNamedReferenceSource
%>|<%=
regexLegacyReferenceSource
%>|<%=
regexDecimalEscapeSource
%>|<%=
regexHexadecimalEscapeSource
%>|<%=
regexNamedReferenceSource
%>|<%=
regexLegacyReferenceSource
regexAmbiguousAmpersand
%>/g;
var decodeMap = <%= decodeMap %>;
var decodeMapLegacy = <%= decodeMapLegacy %>;
@@ -237,69 +239,72 @@
if (strict && regexInvalidEntity.test(html)) {
parseError('malformed character reference');
}
return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7) {
return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7, $8) {
var codePoint;
var semicolon;
var decDigits;
var hexDigits;
var reference;
var next;

if ($1) {
reference = $1;
// Note: there is no need to check `has(decodeMap, reference)`.
return decodeMap[reference];
}

if ($2) {
// Decode named character references without trailing `;`, e.g. `&amp`.
// This is only a parse error if it gets converted to `&`, or if it is
// followed by `=` in an attribute context.
reference = $2;
next = $3;
if (next && options.isAttributeValue) {
if (strict && next == '=') {
parseError('`&` did not start a character reference');
}
return $0;
} else {
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
// Note: there is no need to check `has(decodeMapLegacy, reference)`.
return decodeMapLegacy[reference] + (next || '');
}
}

if ($4) {
// Decode decimal escapes, e.g. `&#119558;`.
decDigits = $1;
semicolon = $2;
decDigits = $4;
semicolon = $5;
if (strict && !semicolon) {
parseError('character reference was not terminated by a semicolon');
}
codePoint = parseInt(decDigits, 10);
return codePointToSymbol(codePoint, strict);
}
if ($3) {

if ($6) {
// Decode hexadecimal escapes, e.g. `&#x1D306;`.
hexDigits = $3;
semicolon = $4;
hexDigits = $6;
semicolon = $7;
if (strict && !semicolon) {
parseError('character reference was not terminated by a semicolon');
}
codePoint = parseInt(hexDigits, 16);
return codePointToSymbol(codePoint, strict);
}
if ($5) {
// Decode named character references with trailing `;`, e.g. `&copy;`.
reference = $5;
if (has(decodeMap, reference)) {
return decodeMap[reference];
} else {
// Ambiguous ampersand. https://mths.be/notes/ambiguous-ampersands
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
return $0;
}
}
// If we’re still here, it’s a legacy reference for sure. No need for an
// extra `if` check.
// Decode named character references without trailing `;`, e.g. `&amp`
// This is only a parse error if it gets converted to `&`, or if it is
// followed by `=` in an attribute context.
reference = $6;
next = $7;
if (next && options.isAttributeValue) {
if (strict && next == '=') {
parseError('`&` did not start a character reference');
}
return $0;
} else {
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
// Note: there is no need to check `has(decodeMapLegacy, reference)`.
return decodeMapLegacy[reference] + (next || '');

// If we’re still here, `if ($8)` is implied; it’s an ambiguous
// ampersand for sure. https://mths.be/notes/ambiguous-ampersands
if (strict) {
parseError(
'named character reference was not terminated by a semicolon'
);
}
return $0;
});
};
// Expose default options (so they can be overridden globally).
@@ -5979,6 +5979,10 @@
{
'decoded': 'a\u200Cb',
'encoded': 'a&zwnj;b'
},
{
'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
}
];

@@ -6041,6 +6045,11 @@
'\u2209 \xACi \xACin \xA9123',
'Legacy named references (without a trailing semicolon)'
);
equal(
he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
'&xxx; &xxx &thorn; &thorn &curren;t &current',
'Legacy named references'
);
equal(
he.decode('a&#x1D306;b&#X0000000000001d306;c'),
'a\uD834\uDF06b\uD834\uDF06c',
@@ -6213,15 +6222,14 @@
'Parse error: `I\'m ¬it; I tell you`'
);
he.decode.options.strict = false;
raises(
function() {
he.decode('I\'m &notit; I tell you', {
'strict': true,
'isAttributeValue': true
});
},
Error,
'Parse error: `I\'m &notit; I tell you` as attribute value'
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
equal(
he.decode('I\'m &notit; I tell you', {
'strict': true,
'isAttributeValue': true
}),
'I\'m &notit; I tell you',
'No parse error: `I\'m &notit; I tell you` as attribute value'
);
equal(
he.decode('I\'m &notit; I tell you', {
@@ -5979,6 +5979,10 @@
{
'decoded': 'a\u200Cb',
'encoded': 'a&zwnj;b'
},
{
'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
}
];

@@ -6041,6 +6045,11 @@
'\u2209 \xACi \xACin \xA9123',
'Legacy named references (without a trailing semicolon)'
);
equal(
he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
'&xxx; &xxx &thorn; &thorn &curren;t &current',
'Legacy named references'
);
equal(
he.decode('a&#x1D306;b&#X0000000000001d306;c'),
'a\uD834\uDF06b\uD834\uDF06c',
@@ -6213,15 +6222,14 @@
'Parse error: `I\'m ¬it; I tell you`'
);
he.decode.options.strict = false;
raises(
function() {
he.decode('I\'m &notit; I tell you', {
'strict': true,
'isAttributeValue': true
});
},
Error,
'Parse error: `I\'m &notit; I tell you` as attribute value'
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
equal(
he.decode('I\'m &notit; I tell you', {
'strict': true,
'isAttributeValue': true
}),
'I\'m &notit; I tell you',
'No parse error: `I\'m &notit; I tell you` as attribute value'
);
equal(
he.decode('I\'m &notit; I tell you', {

0 comments on commit 903c6b5

Please sign in to comment.