Skip to content

Commit

Permalink
Merge branch 'main' into v-flag-unicodePropertyEscapes
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolo-ribaudo committed Sep 23, 2023
2 parents 2217416 + 91ee342 commit 499a608
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 38 deletions.
16 changes: 7 additions & 9 deletions property-escapes.md
Original file line number Diff line number Diff line change
@@ -1,28 +1,26 @@
# Unicode property escapes in _regexpu_

To opt-in to experimental support for [Unicode property escapes](https://github.com/mathiasbynens/es-regexp-unicode-property-escapes), enable [the `unicodePropertyEscape` option](README.md#unicodepropertyescape-default-false).
To enable support for [Unicode property escapes](https://github.com/mathiasbynens/es-regexp-unicode-property-escapes), set [the `unicodePropertyEscapes` option](README.md#stable-regular-expression-features) to `transform`.

```js
rewritePattern('\\p{Script_Extensions=Anatolian_Hieroglyphs}', 'u', {
'unicodePropertyEscape': true
'unicodePropertyEscapes': 'transform'
});
// → '(?:\\uD811[\\uDC00-\\uDE46])'
// → '[\\u{14400}-\\u{14646}]'
```

If you’re targeting ES2015 environments exclusively, consider enabling [the `useUnicodeFlag` option](README.md#useunicodeflag-default-false) for simpler (but not necessarily more compact) output.
If you’re targeting ES5 environments, consider also setting [the `unicodeFlag` option](README.md#stable-regular-expression-features) to `transform`.

```js
rewritePattern('\\p{Script_Extensions=Anatolian_Hieroglyphs}', 'u', {
'unicodePropertyEscape': true,
'useUnicodeFlag': true
'unicodePropertyEscapes': 'transform',
'unicodeFlag': 'transform'
});
// → '[\\u{14400}-\\u{14646}]'
// → '(?:\\uD811[\\uDC00-\\uDE46])'
```

[An online demo is available.](https://mothereff.in/regexpu#input=var+regex+%3D+/%5Cp%7BScript_Extensions%3DGreek%7D/u%3B&unicodePropertyEscape=1)

Note that this feature is non-standard. This implementation may or may not reflect what eventually gets specified.

What follows is an exhaustive overview of the Unicode properties and values that _regexpu_ supports in `\p{…}` and `\P{…}` expressions in regular expressions with the `u` flag.

## Non-binary properties
Expand Down
34 changes: 20 additions & 14 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ function flatMap(array, callback) {
return result;
}

/**
 * Reports whether a Regenerate set includes at least one astral code point,
 * i.e. a code point above U+FFFF.
 *
 * Regenerate keeps its ranges in a flat, sorted `data` array laid out as
 * `[start, endExclusive, start, endExclusive, …]`, so the final entry is one
 * past the largest code point held by the set.
 *
 * @param {{ data: number[] }} regenerateData Regenerate set to inspect.
 * @returns {boolean} `true` when the largest code point is >= 0x10000;
 *   `false` for an empty set or a set confined to the BMP.
 */
function regenerateContainsAstral(regenerateData) {
	const data = regenerateData.data;
	// The last entry is an exclusive upper bound: a set whose maximum is U+FFFF
	// ends in 0x10000 and must not be reported as containing astral symbols.
	return data.length >= 1 && data[data.length - 1] > 0x10000;
}

const SPECIAL_CHARS = /([\\^$.*+?()[\]{}|])/g;

// Prepare a Regenerate set containing all code points, used for negative
Expand Down Expand Up @@ -330,7 +335,7 @@ const buildHandler = (action) => {
}
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown set action: ${ characterClassItem.kind }`);
}
Expand Down Expand Up @@ -414,7 +419,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
}
Expand All @@ -441,7 +446,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
case 'characterClassEscape':
handlePositive.regSet(data, getCharacterClassEscapeSet(
item.value,
config.flags.unicode,
config.flags.unicode || config.flags.unicodeSets,
config.flags.ignoreCase
));
break;
Expand All @@ -465,7 +470,7 @@ const computeCharacterClass = (characterClassItem, regenerateOptions) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand All @@ -488,13 +493,15 @@ const processCharacterClass = (
const negative = characterClassItem.negative;
const { singleChars, transformed, longStrings } = computed;
if (transformed) {
const setStr = singleChars.toString(regenerateOptions);
// If single chars already contains some astral character, regenerate (bmpOnly: true) will create valid regex strings
const bmpOnly = regenerateContainsAstral(singleChars);
const setStr = singleChars.toString(Object.assign({}, regenerateOptions, { bmpOnly: bmpOnly }));

if (negative) {
if (config.useUnicodeFlag) {
update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`)
} else {
if (config.flags.unicode) {
if (config.flags.unicode || config.flags.unicodeSets) {
if (config.flags.ignoreCase) {
const astralCharsSet = singleChars.clone().intersection(ASTRAL_SET);
// Assumption: singleChars do not contain lone surrogates.
Expand All @@ -518,10 +525,9 @@ const processCharacterClass = (
);
} else {
// Generate negative set directly when case folding is not involved.
update(
characterClassItem,
UNICODE_SET.clone().remove(singleChars).toString(regenerateOptions)
);
const negativeSet = UNICODE_SET.clone().remove(singleChars);
const bmpOnly = regenerateContainsAstral(negativeSet);
update(characterClassItem, negativeSet.toString({ bmpOnly: bmpOnly }));
}
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`);
Expand Down Expand Up @@ -731,7 +737,7 @@ const processTerm = (item, regenerateOptions, groups) => {
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
/* node:coverage ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
Expand Down Expand Up @@ -773,13 +779,13 @@ const validateOptions = (options) => {
case 'dotAllFlag':
case 'unicodeFlag':
case 'unicodePropertyEscapes':
case 'unicodeSetsFlag':
case 'namedGroups':
if (value != null && value !== false && value !== 'transform') {
throw new Error(`.${key} must be false (default) or 'transform'.`);
}
break;
case 'modifiers':
case 'unicodeSetsFlag':
if (value != null && value !== false && value !== 'parse' && value !== 'transform') {
throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
}
Expand Down Expand Up @@ -824,18 +830,18 @@ const rewritePattern = (pattern, flags, options) => {
config.modifiersData.m = undefined;

const regjsparserFeatures = {
'unicodeSet': Boolean(options && options.unicodeSetsFlag),
'modifiers': Boolean(options && options.modifiers),

// Enable every stable RegExp feature by default
'unicodePropertyEscape': true,
'unicodeSet': true,
'namedGroups': true,
'lookbehind': true,
};

const regenerateOptions = {
'hasUnicodeFlag': config.useUnicodeFlag,
'bmpOnly': !config.flags.unicode
'bmpOnly': !config.flags.unicode && !config.flags.unicodeSets
};

const groups = {
Expand Down
16 changes: 12 additions & 4 deletions tests/fixtures/character-class.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,25 +44,33 @@ const characterClassFixtures = [
{
pattern: '[^K]', // LATIN CAPITAL LETTER K
flags: 'u',
expected: '(?:[\\0-JL-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["k", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["K"],
expected: '(?:[\\0-JL-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^k]', // LATIN SMALL LETTER K
flags: 'u',
expected: '(?:[\\0-jl-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "\u212a", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["k"],
expected: '(?:[\\0-jl-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u212a]', // KELVIN SIGN
flags: 'u',
expected: '(?:[\\0-\\u2129\\u212B-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u212a"],
expected: '(?:[\\0-\\u2129\\u212B-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
pattern: '[^\u{1D50E}]', // MATHEMATICAL FRAKTUR CAPITAL K
flags: 'u',
expected: '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])',
matches: ["K", "k", "\u{12345}", "\u{1D50F}", "\uDAAA", "\uDDDD"],
nonMatches: ["\u{1D50E}"],
expected: '(?:[\\0-\\uFFFF]|[\\uD800-\\uD834\\uD836-\\uDBFF][\\uDC00-\\uDFFF]|\\uD835[\\uDC00-\\uDD0D\\uDD0F-\\uDFFF])',
options: { unicodeFlag: 'transform' }
},
{
Expand Down
15 changes: 14 additions & 1 deletion tests/fixtures/unicode-set.js
Original file line number Diff line number Diff line change
Expand Up @@ -105,15 +105,21 @@ const unicodeSetFixtures = [
},
{
pattern: '[^[a-z][f-h]]',
expected: '(?:(?![a-z])[\\s\\S])',
matches: ["A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '[\\0-`f-h\\{-\\u{10FFFF}]'
},
{
pattern: '[[^a-z][f-h]]',
matches: ["f", "A", "\u{12345}", "\uDAAA", "\uDDDD"],
nonMatches: ["a", "z"],
expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])',
options: TRANSFORM_U
},
Expand Down Expand Up @@ -353,6 +359,13 @@ const unicodeSetFixtures = [
{
pattern: '^[\\p{Script=Arabic}&&\\p{Number}]$',
expected: '^[\\u0660-\\u0669\\u06F0-\\u06F9\\u{10E60}-\\u{10E7E}]$'
},
{
pattern: '.',
flags: 'sv',
matches: ['\n'],
options: { unicodeSetsFlag: 'transform', dotAllFlag: 'transform' },
expected: '[\\s\\S]'
}
];

Expand Down
13 changes: 9 additions & 4 deletions tests/fixtures/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ const unicodeFixtures = [
{
'pattern': '[\\s\\S]',
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\d',
Expand All @@ -68,8 +68,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\d\\D]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '\\w',
Expand Down Expand Up @@ -100,8 +101,9 @@ const unicodeFixtures = [
},
{
'pattern': '[\\w\\W]',
'matches': ["a", "0", "\u{12345}", "\uDAAA", "\uDDDD"],
'flags': FLAGS_WITH_UNICODE,
'transpiled': '(?:[\\0-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[\\uD834\\uDF06-\\uD834\\uDF08a-z]',
Expand Down Expand Up @@ -180,11 +182,14 @@ const unicodeFixtures = [
},
{
'pattern': '[^a]',
'matches': ['b', 'A', '\u{1D49C}', '\uDAAA', '\uDDDD'],
'nonMatches': ['a'],
'flags': FLAGS_WITH_UNICODE_WITHOUT_I,
'transpiled': '(?:[\\0-`b-\\uD7FF\\uE000-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]|[\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])|(?:[^\\uD800-\\uDBFF]|^)[\\uDC00-\\uDFFF])'
'transpiled': '(?:[\\0-`b-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
{
'pattern': '[^a]',
'nonMatches': ['a', 'A'],
'flags': FLAGS_WITH_UNICODE_WITH_I,
'transpiled': '(?:(?![a\\uD800-\\uDFFF])[\\s\\S]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])'
},
Expand Down
Loading

0 comments on commit 499a608

Please sign in to comment.