Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix v-flag bugs #85

Merged
merged 7 commits into from
Sep 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
30 changes: 18 additions & 12 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ function flatMap(array, callback) {
return result;
}

function regenerateContainsAstral(regenerateData) {
const data = regenerateData.data;
return data.length >= 1 && data[data.length - 1] >= 0x10000;
}

const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;

// Prepare a Regenerate set containing all code points, used for negative
Expand Down Expand Up @@ -330,7 +335,7 @@ const buildHandler = (action) => {
}
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
}
Expand Down Expand Up @@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
}
Expand All @@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
case 'characterClassEscape':
handlePositive.regSet(data, getCharacterClassEscapeSet(
item.value,
config.flags.unicode,
config.flags.unicode || config.flags.unicodeSets,
config.flags.ignoreCase
));
break;
Expand All @@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand All @@ -488,13 +493,15 @@ const processCharacterClass = (
const negative = characterClassItem.negative;
const { singleChars, transformed, longStrings } = computed;
if (transformed) {
const setStr = singleChars.toString(regenerateOptions);
// If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
const bmpOnly = regenerateContainsAstral(singleChars);
const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));

if (negative) {
if (config.useUnicodeFlag) {
update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
} else {
if (config.flags.unicode) {
if (config.flags.unicode || config.flags.unicodeSets) {
if (config.flags.ignoreCase) {
const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
// Assumption: singleChars do not contain lone surrogates.
Expand All @@ -518,10 +525,9 @@ const processCharacterClass = (
);
} else {
// Generate negative set directly when case folding is not involved.
update(
characterClassItem,
UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
);
const negativeSet = UNICODE_SET.clone().remove(singleChars);
const bmpOnly = regenerateContainsAstral(negativeSet);
update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the regenerate set spans from code points before surrogate to astral sets, toString({ bmpOnly: false }) returns much more verbose results while toString({ bmpOnly: false }) is already correct: I think it should be fixed in regenerate later.

const regenerate = require('regenerate');
const set = regenerate().addRange(0xd000, 0x10000);

console.log(set.toString());
// [\uD000-\uD7FF\uE000-\uFFFF]|\uD800\uDC00|[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?:[^\uD800-\uDBFF]|^)[\uDC00-\uDFFF]


console.log(set.toString({ bmpOnly: true }));
// [\uD000-\uFFFF]|\uD800\uDC00

The latter is apparently correct as it matches lone surrogates as well as U+10000. The former seems like [\uD000-\uFFFF]|\uD800\uDC00 is passed to the bmp pass again.

}
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`);
Expand Down Expand Up @@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand Down Expand Up @@ -835,7 +841,7 @@ const rewritePattern = (pattern, flags, options) => {

const regenerateOptions = {
'hasUnicodeFlag': config.useUnicodeFlag,
'bmpOnly': !config.flags.unicode
'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
};

const groups = {
Expand Down
16 changes: 12 additions & 4 deletions tests/fixtures/character-class.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,33 @@ const characterClassFixtures = [
{
pattern: '[^K]', // LATIN CAPITAL LETTER K
flags: 'u',
expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["K"],
expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are now much shorter and easier to reason about. I also added matches tests so that we are confident that transpiled result is correct.

options: { unicodeFlag: 'transform' }
},
{
pattern: '[^k]', // LATIN SMALL LETTER K
flags: 'u',
expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["k"],
expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u212a]', // KELVIN SIGN
flags: 'u',
expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u212a"],
expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
flags: 'u',
expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u{1D50E}"],
expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
Expand Down
15 changes: 14 additions & 1 deletion tests/fixtures/unicode-set.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,21 @@ const unicodeSetFixtures = [
},
{
pattern: '[^[a-z][f-h]]',
expected: '(?:(?![a-z])[\\s\\S])',
matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
Expand Down Expand Up @@ -336,6 +342,13 @@ const unicodeSetFixtures = [
{
pattern: '[\\p{ASCII}&&\\p{Control}]',
expected: '[\\0-\\x1F\\x7F]',
},
{
pattern: '.',
flags: 'sv',
matches: ['\n'],
options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' },
expected: '[\\s\\S]'
}
];

Expand Down
13 changes: 9 additions & 4 deletions tests/fixtures/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ const unicodeFixtures = [
{
'pattern': '[\\s\\S]',
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\d',
Expand All @@ -68,8 +68,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\d\\D]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\w',
Expand Down Expand Up @@ -100,8 +101,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\w\\W]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
Expand Down Expand Up @@ -180,11 +182,14 @@ const unicodeFixtures = [
},
{
'pattern': '[^a]',
'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
'nonMatches': ['a'],
'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[^a]',
'nonMatches': ['a', 'A'],
'flags': FLAGS_WITH_UNICODE_WITH_I,
'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
Expand Down
81 changes: 75 additions & 6 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,30 @@ const { characterClassFixtures } = require("./fixtures/character-class.js");
const { unicodeSetFixtures } = require("./fixtures/unicode-set.js");
const { modifiersFixtures } = require("./fixtures/modifiers.js");

/** For node 6 compat */
assert.match || (assert.match = function match(value, regex) { assert.ok(regex.exec(value) !== null) });
assert.doesNotMatch || (assert.doesNotMatch = function doesNotMatch(value, regex) { assert.ok(regex.exec(value) === null) });

/**
* comput output regex flags from input flags and transform options
*
* @param {string} inputFlags
* @param {*} regexpuOptions
*/
function getOutputFlags(inputFlags, options) {
let result = inputFlags;
if (options.unicodeSetsFlag === "transform") {
result = result.replace("v", "u");
}
if (options.unicodeFlag === "transform") {
result = result.replace("u", "");
}
if (options.dotAllFlag === "transform") {
result = result.replace("s", "");
}
return result;
}

describe('rewritePattern { unicodeFlag }', () => {
const options = {
'unicodeFlag': 'transform'
Expand Down Expand Up @@ -95,19 +119,19 @@ describe('unicodePropertyEscapes', () => {
);
assert.equal(
rewritePattern('[^\\p{ASCII_Hex_Digit}_]', 'u', features),
'(?:[\\0-\\/:-@G-\\^`g-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\/:-@G-\\^`g-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:_|\\uD811[\\uDC00-\\uDE46])'
);
assert.equal(
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}_]', 'u', features),
'(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'(?:[\\0-\\uFFFF]|[\\uD800-\\uD810\\uD812-\\uDBFF][\\uDC00-\\uDFFF]|\\uD811[\\uDE47-\\uDFFF])'
);
assert.equal(
rewritePattern('(?:\\p{ASCII_Hex_Digit})', 'u', features),
Expand Down Expand Up @@ -219,10 +243,10 @@ describe('unicodePropertyEscapes', () => {
'[\\u{14400}-\\u{14646}]'
);
assert.equal(
rewritePattern('[\\p{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
rewritePattern('[\\P{Script_Extensions=Anatolian_Hieroglyphs}]', 'u', {
'unicodePropertyEscapes': 'transform',
}),
'[\\u{14400}-\\u{14646}]'
'[\\0-\\u{143FF}\\u{14647}-\\u{10FFFF}]'
);
});
it('should not transpile unicode property when unicodePropertyEscapes is not enabled', () => {
Expand Down Expand Up @@ -391,13 +415,50 @@ describe('character classes', () => {
if (transpiled != '(?:' + expected + ')') {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
});



describe('unicodeSets (v) flag', () => {
// Re-use the unicode fixtures but replacing the input pattern's `u` flag with `v` flag
for (const fixture of unicodeFixtures) {
if (fixture.flags.includes("u")) {
for (let flag of fixture.flags) {
flag = flag.replace("u", "v");
const { pattern, transpiled: expected } = fixture;
const inputRE = `/${pattern}/${flag}`;
it(`rewrites \`${inputRE}\` correctly without using the u flag`, () => {
const options = {
unicodeSetsFlag: "transform",
unicodeFlag: "transform",
};
const transpiled = rewritePattern(pattern, flag, options);
if (transpiled != "(?:" + expected + ")") {
assert.strictEqual(transpiled, expected);
}
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flag, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
});
}
}
}

if (IS_NODE_6) return;

for (const fixture of unicodeSetFixtures) {
Expand All @@ -421,12 +482,20 @@ describe('unicodeSets (v) flag', () => {
}, throws);
});
} else {
const transpiled = rewritePattern(pattern, flags, options);
it(`rewrites \`${inputRE}\` correctly ${transformUnicodeFlag ? 'without ' : ''}using the u flag`, () => {
const transpiled = rewritePattern(pattern, flags, options);
if (transpiled != '(?:' + expected + ')') {
assert.strictEqual(transpiled, expected);
}
});
for (const match of fixture.matches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.match(match, transpiledRegex);
}
for (const nonMatch of fixture.nonMatches || []) {
const transpiledRegex = new RegExp(`^${transpiled}$`, getOutputFlags(flags, options));
assert.doesNotMatch(nonMatch, transpiledRegex);
}
}
}

Expand Down