diff --git a/rewrite-pattern.js b/rewrite-pattern.js index 898dda0..cb523b5 100644 --- a/rewrite-pattern.js +++ b/rewrite-pattern.js @@ -21,6 +21,11 @@ function flatMap(array, callback) { return result; } +function regenerateContainsAstral(regenerateData) { + const data = regenerateData.data; + return data.length >= 1 && data[data.length - 1] >= 0x10000; +} + const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g; // Prepare a Regenerate set containing all code points, used for negative @@ -330,7 +335,7 @@ const buildHandler = (action) => { } // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. - /* istanbul ignore next */ + /* node:coverage ignore next */ default: throw new Error(`Unknown set action: ${ characterClassItem.kind }`); } @@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => { break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. - /* istanbul ignore next */ + /* node:coverage ignore next */ default: throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`); } @@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => { case 'characterClassEscape': handlePositive.regSet(data, getCharacterClassEscapeSet( item.value, - config.flags.unicode, + config.flags.unicode || config.flags.unicodeSets, config.flags.ignoreCase )); break; @@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => { break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. 
- /* istanbul ignore next */ + /* node:coverage ignore next */ default: throw new Error(`Unknown term type: ${ item.type }`); } @@ -488,13 +493,15 @@ const processCharacterClass = ( const negative = characterClassItem.negative; const { singleChars, transformed, longStrings } = computed; if (transformed) { - const setStr = singleChars.toString(regenerateOptions); + // If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings + const bmpOnly = regenerateContainsAstral(singleChars); + const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly })); if (negative) { if (config.useUnicodeFlag) { update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`) } else { - if (config.flags.unicode) { + if (config.flags.unicode || config.flags.unicodeSets) { if (config.flags.ignoreCase) { const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET); // Assumption: singleChars do not contain lone surrogates. @@ -518,10 +525,9 @@ const processCharacterClass = ( ); } else { // Generate negative set directly when case folding is not involved. - update( - characterClassItem, - UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions) - ); + const negativeSet = UNICODE_SET.clone().remove(singleChars); + const bmpOnly = regenerateContainsAstral(negativeSet); + update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly })); } } else { update(characterClassItem, `(?!${setStr})[\\s\\S]`); @@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => { break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. 
- /* istanbul ignore next */ + /* node:coverage ignore next */ default: throw new Error(`Unknown term type: ${ item.type }`); } @@ -835,7 +841,7 @@ const rewritePattern = (pattern, flags, options) => { const regenerateOptions = { 'hasUnicodeFlag': config.useUnicodeFlag, - 'bmpOnly': !config.flags.unicode + 'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets }; const groups = { diff --git a/tests/fixtures/character-class.js b/tests/fixtures/character-class.js index 8ef1cd1..50cc050 100644 --- a/tests/fixtures/character-class.js +++ b/tests/fixtures/character-class.js @@ -44,25 +44,33 @@ const characterClassFixtures = [ { pattern: '[^K]', // LATIN CAPITAL LETTER K flags: 'u', - expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])', + matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["K"], + expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', options: { unicodeFlag: 'transform' } }, { pattern: '[^k]', // LATIN SMALL LETTER K flags: 'u', - expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])', + matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["k"], + expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', options: { unicodeFlag: 'transform' } }, { pattern: '[^\u212a]', // KELVIN SIGN flags: 'u', - expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])', + matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["\u212a"], + expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', options: { unicodeFlag: 'transform' } }, { pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K flags: 'u', - expected: 
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])', + matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"], + nonMatches: ["\u{1D50E}"], + expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])', options: { unicodeFlag: 'transform' } }, { diff --git a/tests/fixtures/unicode-set.js b/tests/fixtures/unicode-set.js index be6c727..bcd8f36 100644 --- a/tests/fixtures/unicode-set.js +++ b/tests/fixtures/unicode-set.js @@ -105,15 +105,21 @@ const unicodeSetFixtures = [ }, { pattern: '[^[a-z][f-h]]', - expected: '(?:(?![a-z])[\\s\\S])', + matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["a", "z"], + expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', options: TRANSFORM_U }, { pattern: '[[^a-z][f-h]]', + matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["a", "z"], expected: '[\\0-`f-h\\{-\\u{10FFFF}]' }, { pattern: '[[^a-z][f-h]]', + matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"], + nonMatches: ["a", "z"], expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', options: TRANSFORM_U }, @@ -336,6 +342,13 @@ const unicodeSetFixtures = [ { pattern: '[\\p{ASCII}&&\\p{Control}]', expected: '[\\0-\\x1F\\x7F]', + }, + { + pattern: '.', + flags: 'sv', + matches: ['\n'], + options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' }, + expected: '[\\s\\S]' } ]; diff --git a/tests/fixtures/unicode.js b/tests/fixtures/unicode.js index 2411e8b..2312ca5 100644 --- a/tests/fixtures/unicode.js +++ b/tests/fixtures/unicode.js @@ -44,7 +44,7 @@ const unicodeFixtures = [ { 'pattern': '[\\s\\S]', 'flags': FLAGS_WITH_UNICODE, - 'transpiled': 
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + 'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' }, { 'pattern': '\\d', @@ -68,8 +68,9 @@ const unicodeFixtures = [ }, { 'pattern': '[\\d\\D]', + 'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"], 'flags': FLAGS_WITH_UNICODE, - 'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + 'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' }, { 'pattern': '\\w', @@ -100,8 +101,9 @@ const unicodeFixtures = [ }, { 'pattern': '[\\w\\W]', + 'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"], 'flags': FLAGS_WITH_UNICODE, - 'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + 'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' }, { 'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]', @@ -180,11 +182,14 @@ const unicodeFixtures = [ }, { 'pattern': '[^a]', + 'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'], + 'nonMatches': ['a'], 'flags': FLAGS_WITH_UNICODE_WITHOUT_I, - 'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + 'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' }, { 'pattern': '[^a]', + 'nonMatches': ['a', 'A'], 'flags': FLAGS_WITH_UNICODE_WITH_I, 'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' }, diff --git a/tests/tests.js b/tests/tests.js index 25b730b..37431ab 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -17,6 +17,30 @@ const { characterClassFixtures } = require("./fixtures/character-class.js"); const { unicodeSetFixtures } = 
require("./fixtures/unicode-set.js"); const { modifiersFixtures } = require("./fixtures/modifiers.js"); +/** For node 6 compat */ +assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) }); +assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) }); + +/** + * Compute output regex flags from input flags and transform options + * + * @param {string} inputFlags + * @param {*} options + */ +function getOutputFlags(inputFlags, options) { + let result = inputFlags; + if (options.unicodeSetsFlag === "transform") { + result = result.replace("v", "u"); + } + if (options.unicodeFlag === "transform") { + result = result.replace("u", ""); + } + if (options.dotAllFlag === "transform") { + result = result.replace("s", ""); + } + return result; +} + describe('rewritePattern { unicodeFlag }', () => { const options = { 'unicodeFlag': 'transform' @@ -95,11 +119,11 @@ describe('unicodePropertyEscapes', () => { ); assert.equal( rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features), - '(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + '(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])' ); assert.equal( rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features), - '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + '(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])' ); assert.equal( rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features), @@ -107,7 +131,7 @@ describe('unicodePropertyEscapes', () => { ); assert.equal( rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features), - 
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])' + '(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])' ); assert.equal( rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features), @@ -219,10 +243,10 @@ describe('unicodePropertyEscapes', () => { '[\\u{14400}-\\u{14646}]' ); assert.equal( - rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', { + rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', { 'unicodePropertyEscapes': 'transform', }), - '[\\u{14400}-\\u{14646}]' + '[\\0-\\u{143FF}\\u{14647}-\\u{10FFFF}]' ); }); it('should not transpile unicode property when unicodePropertyEscapes is not enabled', () => { @@ -391,6 +415,14 @@ describe('character classes', () => { if (transpiled != '(?:' + expected + ')') { assert.strictEqual(transpiled, expected); } + for (const match of fixture.matches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options)); + assert.match(match, transpiledRegex); + } + for (const nonMatch of fixture.nonMatches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options)); + assert.doesNotMatch(nonMatch, transpiledRegex); + } }); } }); @@ -398,6 +430,35 @@ describe('character classes', () => { describe('unicodeSets (v) flag', () => { + // Re-use the unicode fixtures but replacing the input pattern's `u` flag with `v` flag + for (const fixture of unicodeFixtures) { + if (fixture.flags.includes("u")) { + for (let flag of fixture.flags) { + flag = flag.replace("u", "v"); + const { pattern, transpiled: expected } = fixture; + const inputRE = `/${pattern}/${flag}`; + it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => { + const options = { + unicodeSetsFlag: "transform", + unicodeFlag: "transform", + }; + const transpiled = 
rewritePattern(pattern, flag, options); + if (transpiled != "(?:" + expected + ")") { + assert.strictEqual(transpiled, expected); + } + for (const match of fixture.matches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options)); + assert.match(match, transpiledRegex); + } + for (const nonMatch of fixture.nonMatches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options)); + assert.doesNotMatch(nonMatch, transpiledRegex); + } + }); + } + } + } + if (IS_NODE_6) return; for (const fixture of unicodeSetFixtures) { @@ -421,12 +482,20 @@ describe('unicodeSets (v) flag', () => { }, throws); }); } else { + const transpiled = rewritePattern(pattern, flags, options); it(`rewrites \`${inputRE}\` correctly ${transformUnicodeFlag ? 'without ' : ''}using the u flag`, () => { - const transpiled = rewritePattern(pattern, flags, options); if (transpiled != '(?:' + expected + ')') { assert.strictEqual(transpiled, expected); } }); + for (const match of fixture.matches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options)); + assert.match(match, transpiledRegex); + } + for (const nonMatch of fixture.nonMatches || []) { + const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options)); + assert.doesNotMatch(nonMatch, transpiledRegex); + } } }