Skip to content

Commit

Permalink
Simplify regexInvalidRawCodePoint
Browse files Browse the repository at this point in the history
By using a negative lookahead, the regular expression that matches lone surrogates becomes a bit more compact.

This also fixes an encoding issue caused by a typo that was introduced in 74fd7cd.
  • Loading branch information
mathiasbynens committed May 24, 2014
1 parent 65f5b49 commit 107b566
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 49 deletions.
2 changes: 1 addition & 1 deletion he.js

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions scripts/export-data.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ var formatJSON = function(fileName) {
};

module.exports = {
'regexEncodeNonAscii': require('./encode-non-ascii-regex.js'),
'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'),
'decodeMap': formatJSON('decode-map'),
'decodeMapLegacy': formatJSON('decode-map-legacy'),
'decodeMapOverrides': formatJSON('decode-map-overrides'),
'encodeMap': formatJSON('encode-map'),
'invalidReferenceCodePoints': formatJSON('invalid-character-reference-code-points'),
'regexAsciiWhitelist': require('./ascii-whitelist-regex.js'),
'regexAstralSymbol': require('./astral-symbol-regex.js'),
'regexBmpWhitelist': require('./bmp-whitelist-regex.js'),
'regexDecimalEscapeSource': '&#([0-9]+)(;?)',
'regexEncodeNonAscii': require('./encode-non-ascii-regex.js'),
'regexHexadecimalEscapeSource': '&#[xX]([a-fA-F0-9]+)(;?)',
'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
'regexInvalidRawCodePoints': require('./invalid-code-points-regex.js'),
'regexLegacyReferenceSource': require('./legacy-reference-regex.js'),
'regexLoneSurrogate': '[\\uD800-\\uDBFF](?:[^\\uDC00-\\uDFFF]|$)|(?:[^\\uD800-\uDBFF]|^)[\\uDC00-\\uDFFF]',
'regexAsciiWhitelist': require('./ascii-whitelist-regex.js'),
'regexBmpWhitelist': require('./bmp-whitelist-regex.js'),
'encodeMap': formatJSON('encode-map'),
'decodeMapOverrides': formatJSON('decode-map-overrides'),
'decodeMap': formatJSON('decode-map'),
'decodeMapLegacy': formatJSON('decode-map-legacy'),
'testDataMap': formatJSON('entities'),
'invalidReferenceCodePoints': formatJSON('invalid-character-reference-code-points'),
'regexLoneSurrogate': '[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF]',
'regexNamedReferenceSource': '&([0-9a-zA-Z]+);',
'stringInvalidCodePoints': require('./invalid-code-points-string.js'),
'testDataMap': formatJSON('entities'),
'version': require('../package.json').version
};
105 changes: 87 additions & 18 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -6427,77 +6427,146 @@
'a&b123;+©>⃒<⃒
fja',
'All kinds of symbols when `encodeEverything: true, useNamedReferences: true`'
);
equal(
he.encode('foo\uD800bar'),
'foo&#xD800;bar',
'Lone high surrogate'
);
raises(
function() {
he.encode('foo\uD800bar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uD800bar'),
'&#xD800;bar',
'Lone high surrogate at the start of a string'
);
raises(
function() {
he.encode('\uD800bar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800'),
'foo&#xD800;',
'Lone high surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uD800', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDBFFbar'),
'foo&#xDBFF;bar',
'Lone high surrogate'
);
raises(
function() {
he.encode('foo\uDBFFbar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uDBFFbar'),
'&#xDBFF;bar',
'Lone high surrogate at the start of a string'
);
raises(
function() {
he.encode('\uDBFFbar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDBFF'),
'foo&#xDBFF;',
'Lone high surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uDBFF', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDC00bar'),
'foo&#xDC00;bar',
'Lone high surrogate'
'Lone low surrogate'
);
raises(
function() {
he.encode('foo\uDC00bar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
'Lone low surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uDC00bar'),
'&#xDC00;bar',
'Lone high surrogate at the start of a string'
'Lone low surrogate at the start of a string'
);
raises(
function() {
he.encode('\uDC00bar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
'Lone low surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDC00'),
'foo&#xDC00;',
'Lone high surrogate at the end of a string'
'Lone low surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uDC00', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
'Lone low surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800bar'),
'foo&#xD800;bar',
he.encode('foo\uDFFFbar'),
'foo&#xDFFF;bar',
'Lone low surrogate'
);
raises(
function() {
he.encode('foo\uD800bar', { 'strict': true });
he.encode('foo\uDFFFbar', { 'strict': true });
},
Error,
'Lone low surrogate triggers parse error when `strict: true`'
);



equal(
he.encode('\uD800bar'),
'&#xD800;bar',
he.encode('\uDFFFbar'),
'&#xDFFF;bar',
'Lone low surrogate at the start of a string'
);
raises(
function() {
he.encode('\uD800bar', { 'strict': true });
he.encode('\uDFFFbar', { 'strict': true });
},
Error,
'Lone low surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800'),
'foo&#xD800;',
he.encode('foo\uDFFF'),
'foo&#xDFFF;',
'Lone low surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uD800', { 'strict': true });
he.encode('foo\uDFFF', { 'strict': true });
},
Error,
'Lone low surrogate at the end of a string triggers parse error when `strict: true`'
Expand Down
105 changes: 87 additions & 18 deletions tests/tests.src.js
Original file line number Diff line number Diff line change
Expand Up @@ -6427,77 +6427,146 @@
'a&b123;+©>⃒<⃒
fja',
'All kinds of symbols when `encodeEverything: true, useNamedReferences: true`'
);
equal(
he.encode('foo\uD800bar'),
'foo&#xD800;bar',
'Lone high surrogate'
);
raises(
function() {
he.encode('foo\uD800bar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uD800bar'),
'&#xD800;bar',
'Lone high surrogate at the start of a string'
);
raises(
function() {
he.encode('\uD800bar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800'),
'foo&#xD800;',
'Lone high surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uD800', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDBFFbar'),
'foo&#xDBFF;bar',
'Lone high surrogate'
);
raises(
function() {
he.encode('foo\uDBFFbar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uDBFFbar'),
'&#xDBFF;bar',
'Lone high surrogate at the start of a string'
);
raises(
function() {
he.encode('\uDBFFbar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDBFF'),
'foo&#xDBFF;',
'Lone high surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uDBFF', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDC00bar'),
'foo&#xDC00;bar',
'Lone high surrogate'
'Lone low surrogate'
);
raises(
function() {
he.encode('foo\uDC00bar', { 'strict': true });
},
Error,
'Lone high surrogate triggers parse error when `strict: true`'
'Lone low surrogate triggers parse error when `strict: true`'
);
equal(
he.encode('\uDC00bar'),
'&#xDC00;bar',
'Lone high surrogate at the start of a string'
'Lone low surrogate at the start of a string'
);
raises(
function() {
he.encode('\uDC00bar', { 'strict': true });
},
Error,
'Lone high surrogate at the start of a string triggers parse error when `strict: true`'
'Lone low surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uDC00'),
'foo&#xDC00;',
'Lone high surrogate at the end of a string'
'Lone low surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uDC00', { 'strict': true });
},
Error,
'Lone high surrogate at the end of a string triggers parse error when `strict: true`'
'Lone low surrogate at the end of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800bar'),
'foo&#xD800;bar',
he.encode('foo\uDFFFbar'),
'foo&#xDFFF;bar',
'Lone low surrogate'
);
raises(
function() {
he.encode('foo\uD800bar', { 'strict': true });
he.encode('foo\uDFFFbar', { 'strict': true });
},
Error,
'Lone low surrogate triggers parse error when `strict: true`'
);



equal(
he.encode('\uD800bar'),
'&#xD800;bar',
he.encode('\uDFFFbar'),
'&#xDFFF;bar',
'Lone low surrogate at the start of a string'
);
raises(
function() {
he.encode('\uD800bar', { 'strict': true });
he.encode('\uDFFFbar', { 'strict': true });
},
Error,
'Lone low surrogate at the start of a string triggers parse error when `strict: true`'
);
equal(
he.encode('foo\uD800'),
'foo&#xD800;',
he.encode('foo\uDFFF'),
'foo&#xDFFF;',
'Lone low surrogate at the end of a string'
);
raises(
function() {
he.encode('foo\uD800', { 'strict': true });
he.encode('foo\uDFFF', { 'strict': true });
},
Error,
'Lone low surrogate at the end of a string triggers parse error when `strict: true`'
Expand Down

0 comments on commit 107b566

Please sign in to comment.