Handle legacy named character references correctly

Fixes #67.
mathiasbynens · Sep 23, 2018 · 903c6b5 · 903c6b5
1 parent 24377e2
commit 903c6b5
Show file tree

Hide file tree

Showing 8 changed files with 157 additions and 106 deletions.
diff --git a/he.js b/he.js
diff --git a/package.json b/package.json
@@ -42,6 +42,7 @@
   "devDependencies": {
     "codecov.io": "^0.1.6",
     "grunt": "^0.4.5",
+    "grunt-cli": "^1.3.1",
     "grunt-shell": "^1.1.1",
     "grunt-template": "^0.2.3",
     "istanbul": "^0.4.2",
@@ -50,6 +51,7 @@
     "qunit-extras": "^1.4.5",
     "qunitjs": "~1.11.0",
     "regenerate": "^1.2.1",
+    "regexgen": "^1.3.0",
     "requirejs": "^2.1.22",
     "sort-object": "^3.0.2"
   }

diff --git a/scripts/export-data.js b/scripts/export-data.js
@@ -24,8 +24,9 @@ module.exports = {
 	'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)',
 	'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'),
 	'regexLegacyReferenceSource': require('./legacy-reference-regex.js'),
-	'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
+	'regexNamedReferenceSource': require('./named-reference-regex.js'),
 	'stringInvalidCodePoints': require('./invalid-code-points-string.js'),
+	'regexAmbiguousAmpersand': '&([0-9a-zA-Z]+)',
 	'testDataMap': formatJSON('entities'),
 	'version': require('../package.json').version
 };
diff --git a/scripts/legacy-reference-regex.js b/scripts/legacy-reference-regex.js
@@ -2,6 +2,6 @@
 
 const legacyReferences = require('../data/decode-legacy-named-references.json');
 const regexLegacyReference = '&(' + legacyReferences.join('|') +
-	')([=a-zA-Z0-9])?';
+	')(?!;)([=a-zA-Z0-9]?)';
 
 module.exports = regexLegacyReference;
diff --git a/scripts/named-reference-regex.js b/scripts/named-reference-regex.js
@@ -0,0 +1,24 @@
+'use strict';
+
+const namedReferences = Object.keys(
+	require('../data/decode-map.json')
+).sort((a, b) => b.length - a.length);
+
+// const Trie = require('regexgen').Trie;
+// const trie = new Trie();
+// trie.addAll(namedReferences);
+// const pattern = trie.toString();
+// console.log(pattern);
+// → 12 KB instead of the 16 KB of the current output.
+// However, the current output gzips better, and has better
+// run-time performance.
+
+// Verify all references consist of characters that don’t need escaping
+// within regular expressions; otherwise a plain `join('|')` is unsafe.
+// Throw rather than `console.assert`, which only logs on Node.js ≥ 10.
+if (!namedReferences.every((reference) => /^[a-zA-Z0-9]+$/.test(reference))) {
+	throw new Error('Named reference contains a character that needs escaping');
+}
+const regexNamedReference = '&(' + namedReferences.join('|') + ');';
+
+module.exports = regexNamedReference;
diff --git a/src/he.js b/src/he.js
@@ -52,13 +52,15 @@
 	var regexInvalidEntity = /&#(?:[xX][^a-fA-F0-9]|[^0-9xX])/;
 	var regexInvalidRawCodePoint = /<%= regexInvalidRawCodePoints %>/;
 	var regexDecode = /<%=
+		regexNamedReferenceSource
+	%>|<%=
+		regexLegacyReferenceSource
+	%>|<%=
 		regexDecimalEscapeSource
 	%>|<%=
 		regexHexadecimalEscapeSource
 	%>|<%=
-		regexNamedReferenceSource
+		regexAmbiguousAmpersand
-	%>|<%=
-		regexLegacyReferenceSource
 	%>/g;
 	var decodeMap = <%= decodeMap %>;
 	var decodeMapLegacy = <%= decodeMapLegacy %>;
@@ -237,69 +239,72 @@
 		if (strict && regexInvalidEntity.test(html)) {
 			parseError('malformed character reference');
 		}
-		return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7) {
+		return html.replace(regexDecode, function($0, $1, $2, $3, $4, $5, $6, $7, $8) {
 			var codePoint;
 			var semicolon;
 			var decDigits;
 			var hexDigits;
 			var reference;
 			var next;
+
 			if ($1) {
+				reference = $1;
+				// Note: there is no need to check `has(decodeMap, reference)`.
+				return decodeMap[reference];
+			}
+
+			if ($2) {
+				// Decode named character references without trailing `;`, e.g. `&amp`.
+				// This is only a parse error if it gets converted to `&`, or if it is
+				// followed by `=` in an attribute context.
+				reference = $2;
+				next = $3;
+				if (next && options.isAttributeValue) {
+					if (strict && next == '=') {
+						parseError('`&` did not start a character reference');
+					}
+					return $0;
+				} else {
+					if (strict) {
+						parseError(
+							'named character reference was not terminated by a semicolon'
+						);
+					}
+					// Note: there is no need to check `has(decodeMapLegacy, reference)`.
+					return decodeMapLegacy[reference] + (next || '');
+				}
+			}
+
+			if ($4) {
 				// Decode decimal escapes, e.g. `&#119558;`.
-				decDigits = $1;
+				decDigits = $4;
-				semicolon = $2;
+				semicolon = $5;
 				if (strict && !semicolon) {
 					parseError('character reference was not terminated by a semicolon');
 				}
 				codePoint = parseInt(decDigits, 10);
 				return codePointToSymbol(codePoint, strict);
 			}
-			if ($3) {
+
+			if ($6) {
 				// Decode hexadecimal escapes, e.g. `&#x1D306;`.
-				hexDigits = $3;
+				hexDigits = $6;
-				semicolon = $4;
+				semicolon = $7;
 				if (strict && !semicolon) {
 					parseError('character reference was not terminated by a semicolon');
 				}
 				codePoint = parseInt(hexDigits, 16);
 				return codePointToSymbol(codePoint, strict);
 			}
-			if ($5) {
+
-				// Decode named character references with trailing `;`, e.g. `&copy;`.
+			// If we’re still here, `if ($8)` is implied; it’s an ambiguous
-				reference = $5;
+			// ampersand for sure. https://mths.be/notes/ambiguous-ampersands
-				if (has(decodeMap, reference)) {
+			if (strict) {
-					return decodeMap[reference];
+				parseError(
-				} else {
+					'named character reference was not terminated by a semicolon'
-					// Ambiguous ampersand. https://mths.be/notes/ambiguous-ampersands
+				);
-					if (strict) {
-						parseError(
-							'named character reference was not terminated by a semicolon'
-						);
-					}
-					return $0;
-				}
-			}
-			// If we’re still here, it’s a legacy reference for sure. No need for an
-			// extra `if` check.
-			// Decode named character references without trailing `;`, e.g. `&amp`
-			// This is only a parse error if it gets converted to `&`, or if it is
-			// followed by `=` in an attribute context.
-			reference = $6;
-			next = $7;
-			if (next && options.isAttributeValue) {
-				if (strict && next == '=') {
-					parseError('`&` did not start a character reference');
-				}
-				return $0;
-			} else {
-				if (strict) {
-					parseError(
-						'named character reference was not terminated by a semicolon'
-					);
-				}
-				// Note: there is no need to check `has(decodeMapLegacy, reference)`.
-				return decodeMapLegacy[reference] + (next || '');
 			}
+			return $0;
 		});
 	};
 	// Expose default options (so they can be overridden globally).

diff --git a/tests/tests.js b/tests/tests.js
@@ -5979,6 +5979,10 @@
 		{
 			'decoded': 'a\u200Cb',
 			'encoded': 'a&zwnj;b'
+		},
+		{
+			'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
+			'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
 		}
 	];
 
@@ -6041,6 +6045,11 @@
 			'\u2209 \xACi \xACin \xA9123',
 			'Legacy named references (without a trailing semicolon)'
 		);
+		equal(
+			he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
+			'&xxx; &xxx &thorn; &thorn &curren;t &current',
+			'Legacy named references'
+		);
 		equal(
 			he.decode('a&#x1D306;b&#X0000000000001d306;c'),
 			'a\uD834\uDF06b\uD834\uDF06c',
@@ -6213,15 +6222,14 @@
 			'Parse error: `I\'m ¬it; I tell you`'
 		);
 		he.decode.options.strict = false;
-		raises(
+		// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
-			function() {
+		equal(
-				he.decode('I\'m &notit; I tell you', {
+			he.decode('I\'m &notit; I tell you', {
-					'strict': true,
+				'strict': true,
-					'isAttributeValue': true
+				'isAttributeValue': true
-				});
+			}),
-			},
+			'I\'m &notit; I tell you',
-			Error,
+			'No parse error: `I\'m &notit; I tell you` as attribute value'
-			'Parse error: `I\'m &notit; I tell you` as attribute value'
 		);
 		equal(
 			he.decode('I\'m &notit; I tell you', {

diff --git a/tests/tests.src.js b/tests/tests.src.js
@@ -5979,6 +5979,10 @@
 		{
 			'decoded': 'a\u200Cb',
 			'encoded': 'a&zwnj;b'
+		},
+		{
+			'decoded': '&xxx; &xxx &thorn; &thorn &curren;t &current',
+			'encoded': '&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current'
 		}
 	];
 
@@ -6041,6 +6045,11 @@
 			'\u2209 \xACi \xACin \xA9123',
 			'Legacy named references (without a trailing semicolon)'
 		);
+		equal(
+			he.decode('&amp;xxx; &amp;xxx &ampthorn; &ampthorn &ampcurren;t &ampcurrent'),
+			'&xxx; &xxx &thorn; &thorn &curren;t &current',
+			'Legacy named references'
+		);
 		equal(
 			he.decode('a&#x1D306;b&#X0000000000001d306;c'),
 			'a\uD834\uDF06b\uD834\uDF06c',
@@ -6213,15 +6222,14 @@
 			'Parse error: `I\'m ¬it; I tell you`'
 		);
 		he.decode.options.strict = false;
-		raises(
+		// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
-			function() {
+		equal(
-				he.decode('I\'m &notit; I tell you', {
+			he.decode('I\'m &notit; I tell you', {
-					'strict': true,
+				'strict': true,
-					'isAttributeValue': true
+				'isAttributeValue': true
-				});
+			}),
-			},
+			'I\'m &notit; I tell you',
-			Error,
+			'No parse error: `I\'m &notit; I tell you` as attribute value'
-			'Parse error: `I\'m &notit; I tell you` as attribute value'
 		);
 		equal(
 			he.decode('I\'m &notit; I tell you', {