mathiasbynens · nicolo-ribaudo · Sep 23, 2023 · Sep 19, 2023 · Sep 19, 2023 · Sep 19, 2023
diff --git a/rewrite-pattern.js b/rewrite-pattern.js
@@ -21,6 +21,11 @@ function flatMap(array, callback) {
 	return result;
 }
 
+function regenerateContainsAstral(regenerateData) {
+	const data = regenerateData.data;
+	return data.length >= 1 && data[data.length - 1] >= 0x10000;
+}
+
 const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;
 
 // Prepare a Regenerate set containing all code points, used for negative
@@ -330,7 +335,7 @@ const buildHandler = (action) => {
 		}
 		// The `default` clause is only here as a safeguard; it should never be
 		// reached. Code coverage tools should ignore it.
-		/* istanbul ignore next */
+		/* node:coverage ignore next */
 		default:
 			throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
 	}
@@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
 			break;
 		// The `default` clause is only here as a safeguard; it should never be
 		// reached. Code coverage tools should ignore it.
-		/* istanbul ignore next */
+		/* node:coverage ignore next */
 		default:
 			throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
 	}
@@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
 			case 'characterClassEscape':
 				handlePositive.regSet(data, getCharacterClassEscapeSet(
 					item.value,
-					config.flags.unicode,
+					config.flags.unicode || config.flags.unicodeSets,
 					config.flags.ignoreCase
 				));
 				break;
@@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
 				break;
 			// The `default` clause is only here as a safeguard; it should never be
 			// reached. Code coverage tools should ignore it.
-			/* istanbul ignore next */
+			/* node:coverage ignore next */
 			default:
 				throw new Error(`Unknown term type: ${ item.type }`);
 		}
@@ -488,13 +493,15 @@ const processCharacterClass = (
 	const negative = characterClassItem.negative;
 	const { singleChars, transformed, longStrings } = computed;
 	if (transformed) {
-		const setStr = singleChars.toString(regenerateOptions);
+		// If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
+		const bmpOnly = regenerateContainsAstral(singleChars);
+		const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));
 
 		if (negative) {
 			if (config.useUnicodeFlag) {
 				update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
 			} else {
-				if (config.flags.unicode) {
+				if (config.flags.unicode || config.flags.unicodeSets) {
 					if (config.flags.ignoreCase) {
 						const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
 						// Assumption: singleChars do not contain lone surrogates.
@@ -518,10 +525,9 @@ const processCharacterClass = (
 						);
 					} else {
 						// Generate negative set directly when case folding is not involved.
-						update(
-							characterClassItem,
-							UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
-						);
+						const negativeSet = UNICODE_SET.clone().remove(singleChars);
+						const bmpOnly = regenerateContainsAstral(negativeSet);
+						update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
 					}
 				} else {
 					update(characterClassItem, `(?!${setStr})[\\s\\S]`);
@@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => {
 			break;
 		// The `default` clause is only here as a safeguard; it should never be
 		// reached. Code coverage tools should ignore it.
-		/* istanbul ignore next */
+		/* node:coverage ignore next */
 		default:
 			throw new Error(`Unknown term type: ${ item.type }`);
 	}
@@ -835,7 +841,7 @@ const rewritePattern = (pattern, flags, options) => {
 
 	const regenerateOptions = {
 		'hasUnicodeFlag': config.useUnicodeFlag,
-		'bmpOnly': !config.flags.unicode
+		'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
 	};
 
 	const groups = {

diff --git a/tests/fixtures/character-class.js b/tests/fixtures/character-class.js
@@ -44,25 +44,33 @@ const characterClassFixtures = [
 	{
 		pattern: '[^K]', // LATIN CAPITAL LETTER K
 		flags: 'u',
-		expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
+		matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["K"],
+		expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
 		options: { unicodeFlag: 'transform' }
 	},
 	{
 		pattern: '[^k]', // LATIN SMALL LETTER K
 		flags: 'u',
-		expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
+		matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["k"],
+		expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
 		options: { unicodeFlag: 'transform' }
 	},
 	{
 		pattern: '[^\u212a]', // KELVIN SIGN
 		flags: 'u',
-		expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
+		matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["\u212a"],
+		expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
 		options: { unicodeFlag: 'transform' }
 	},
 	{
 		pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
 		flags: 'u',
-		expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
+		matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["\u{1D50E}"],
+		expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
 		options: { unicodeFlag: 'transform' }
 	},
 	{

diff --git a/tests/fixtures/unicode-set.js b/tests/fixtures/unicode-set.js
@@ -105,15 +105,21 @@ const unicodeSetFixtures = [
 	},
 	{
 		pattern: '[^[a-z][f-h]]',
-		expected: '(?:(?![a-z])[\\s\\S])',
+		matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["a", "z"],
+		expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
 		options: TRANSFORM_U
 	},
 	{
 		pattern: '[[^a-z][f-h]]',
+		matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["a", "z"],
 		expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
 	},
 	{
 		pattern: '[[^a-z][f-h]]',
+		matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
+		nonMatches: ["a", "z"],
 		expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
 		options: TRANSFORM_U
 	},
@@ -336,6 +342,13 @@ const unicodeSetFixtures = [
 	{
 		pattern: '[\\p{ASCII}&&\\p{Control}]',
 		expected: '[\\0-\\x1F\\x7F]',
+	},
+	{
+		pattern: '.',
+		flags: 'sv',
+		matches: ['\n'],
+		options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' },
+		expected: '[\\s\\S]'
 	}
 ];
 

diff --git a/tests/fixtures/unicode.js b/tests/fixtures/unicode.js
@@ -44,7 +44,7 @@ const unicodeFixtures = [
 	{
 		'pattern': '[\\s\\S]',
 		'flags': FLAGS_WITH_UNICODE,
-		'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+		'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 	},
 	{
 		'pattern': '\\d',
@@ -68,8 +68,9 @@ const unicodeFixtures = [
 	},
 	{
 		'pattern': '[\\d\\D]',
+		'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
 		'flags': FLAGS_WITH_UNICODE,
-		'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+		'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 	},
 	{
 		'pattern': '\\w',
@@ -100,8 +101,9 @@ const unicodeFixtures = [
 	},
 	{
 		'pattern': '[\\w\\W]',
+		'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
 		'flags': FLAGS_WITH_UNICODE,
-		'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+		'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 	},
 	{
 		'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
@@ -180,11 +182,14 @@ const unicodeFixtures = [
 	},
 	{
 		'pattern': '[^a]',
+		'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
+		'nonMatches': ['a'],
 		'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
-		'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+		'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 	},
 	{
 		'pattern': '[^a]',
+		'nonMatches': ['a', 'A'],
 		'flags': FLAGS_WITH_UNICODE_WITH_I,
 		'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 	},

diff --git a/tests/tests.js b/tests/tests.js
@@ -17,6 +17,30 @@ const { characterClassFixtures } = require("./fixtures/character-class.js");
 const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
 const { modifiersFixtures } = require("./fixtures/modifiers.js");
 
+/** For node 6 compat */
+assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) });
+assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) });
+
+/**
+ * comput output regex flags from input flags and transform options
+ *
+ * @param {string} inputFlags
+ * @param {*} regexpuOptions
+ */
+function getOutputFlags(inputFlags, options) {
+	let result = inputFlags;
+	if (options.unicodeSetsFlag === "transform") {
+		result = result.replace("v", "u");
+	}
+	if (options.unicodeFlag === "transform") {
+		result = result.replace("u", "");
+	}
+	if (options.dotAllFlag === "transform") {
+		result = result.replace("s", "");
+	}
+	return result;
+}
+
 describe('rewritePattern { unicodeFlag }', () => {
 	const options = {
 		'unicodeFlag': 'transform'
@@ -95,19 +119,19 @@ describe('unicodePropertyEscapes', () => {
 		);
 		assert.equal(
 			rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features),
-			'(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+			'(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
 		);
 		assert.equal(
 			rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features),
-			'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+			'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
 		);
 		assert.equal(
 			rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
 			'(?:_|\\uD811[\\uDC00-\\uDE46])'
 		);
 		assert.equal(
 			rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
-			'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
+			'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
 		);
 		assert.equal(
 			rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features),
@@ -219,10 +243,10 @@ describe('unicodePropertyEscapes', () => {
 			'[\\u{14400}-\\u{14646}]'
 		);
 		assert.equal(
-			rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
+			rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
 				'unicodePropertyEscapes': 'transform',
 			}),
-			'[\\u{14400}-\\u{14646}]'
+			'[\\0-\\u{143FF}\\u{14647}-\\u{10FFFF}]'
 		);
 	});
 	it('should not transpile unicode property when unicodePropertyEscapes is not enabled', () => {
@@ -391,13 +415,50 @@ describe('character classes', () => {
 			if (transpiled != '(?:' + expected + ')') {
 				assert.strictEqual(transpiled, expected);
 			}
+			for (const match of fixture.matches || []) {
+				const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
+				assert.match(match, transpiledRegex);
+			}
+			for (const nonMatch of fixture.nonMatches || []) {
+				const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
+				assert.doesNotMatch(nonMatch, transpiledRegex);
+			}
 		});
 	}
 });
 
 
 
 describe('unicodeSets (v) flag', () => {
+	// Re-use the unicode fixtures but replacing the input pattern's `u` flag with `v` flag
+	for (const fixture of unicodeFixtures) {
+		if (fixture.flags.includes("u")) {
+			for (let flag of fixture.flags) {
+				flag = flag.replace("u", "v");
+				const { pattern, transpiled: expected } = fixture;
+				const inputRE = `/${pattern}/${flag}`;
+				it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => {
+					const options = {
+						unicodeSetsFlag: "transform",
+						unicodeFlag: "transform",
+					};
+					const transpiled = rewritePattern(pattern, flag, options);
+					if (transpiled != "(?:" + expected + ")") {
+						assert.strictEqual(transpiled, expected);
+					}
+					for (const match of fixture.matches || []) {
+						const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
+						assert.match(match, transpiledRegex);
+					}
+					for (const nonMatch of fixture.nonMatches || []) {
+						const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
+						assert.doesNotMatch(nonMatch, transpiledRegex);
+					}
+				});
+			}
+		}
+	}
+
 	if (IS_NODE_6) return;
 
 	for (const fixture of unicodeSetFixtures) {
@@ -421,12 +482,20 @@ describe('unicodeSets (v) flag', () => {
 				}, throws);
 			});
 		} else {
+			const transpiled = rewritePattern(pattern, flags, options);
 			it(`rewrites \`${inputRE}\` correctly ${transformUnicodeFlag ? 'without ' : ''}using the u flag`, () => {
-				const transpiled = rewritePattern(pattern, flags, options);
 				if (transpiled != '(?:' + expected + ')') {
 					assert.strictEqual(transpiled, expected);
 				}
 			});
+			for (const match of fixture.matches || []) {
+				const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
+				assert.match(match, transpiledRegex);
+			}
+			for (const nonMatch of fixture.nonMatches || []) {
+				const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
+				assert.doesNotMatch(nonMatch, transpiledRegex);
+			}
 		}
 	}