Skip to content

Commit

Permalink
✨ allow surrogate pairs in capture group names (fixes #10)
Browse files Browse the repository at this point in the history
  • Loading branch information
mysticatea committed Apr 4, 2020
1 parent b4a2ad2 commit 3ab3e24
Show file tree
Hide file tree
Showing 4 changed files with 523 additions and 24 deletions.
12 changes: 12 additions & 0 deletions src/unicode/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,15 @@ export function digitToInt(code: number): number {
}
return code - DigitZero
}

/**
 * Check whether a given code unit is a lead (high) surrogate.
 * @param code The code unit to check.
 * @returns `true` if `code` lies in the inclusive range 0xD800-0xDBFF.
 */
export function isLeadSurrogate(code: number): boolean {
    const first = 0xd800
    const last = 0xdbff
    if (code < first) {
        return false
    }
    return code <= last
}

/**
 * Check whether a given code unit is a trail (low) surrogate.
 * @param code The code unit to check.
 * @returns `true` if `code` lies in the inclusive range 0xDC00-0xDFFF.
 */
export function isTrailSurrogate(code: number): boolean {
    const first = 0xdc00
    const last = 0xdfff
    if (code < first) {
        return false
    }
    return code <= last
}

/**
 * Combine a lead surrogate and a trail surrogate into a single code point.
 *
 * No range validation is performed here; the arithmetic is applied to
 * whatever numbers are passed in.
 * @param lead The lead surrogate (offset from 0xD800 supplies the upper bits).
 * @param trail The trail surrogate (offset from 0xDC00 supplies the lower bits).
 * @returns The combined value, offset by 0x10000.
 */
export function combineSurrogatePair(lead: number, trail: number): number {
    const upper = (lead - 0xd800) * 0x400
    const lower = trail - 0xdc00
    return upper + lower + 0x10000
}
75 changes: 51 additions & 24 deletions src/validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,6 @@ import {
FullStop,
GreaterThanSign,
HyphenMinus,
isDecimalDigit,
isHexDigit,
isIdContinue,
isIdStart,
isLatinLetter,
isLineTerminator,
isOctalDigit,
isValidLoneUnicodeProperty,
isValidUnicodeProperty,
isValidUnicode,
LatinCapitalLetterB,
LatinCapitalLetterD,
LatinCapitalLetterP,
Expand Down Expand Up @@ -70,6 +60,19 @@ import {
VerticalLine,
ZeroWidthJoiner,
ZeroWidthNonJoiner,
combineSurrogatePair,
isDecimalDigit,
isHexDigit,
isIdContinue,
isIdStart,
isLatinLetter,
isLeadSurrogate,
isLineTerminator,
isOctalDigit,
isTrailSurrogate,
isValidLoneUnicodeProperty,
isValidUnicodeProperty,
isValidUnicode,
} from "./unicode"

function isSyntaxCharacter(cp: number): boolean {
Expand Down Expand Up @@ -1861,18 +1864,31 @@ export class RegExpValidator {
* UnicodeIDStart
* `$`
* `_`
* `\` RegExpUnicodeEscapeSequence[?U]
* `\` RegExpUnicodeEscapeSequence[+U]
* [~U] UnicodeLeadSurrogate UnicodeTrailSurrogate
* ```
* @returns `true` if it ate the next characters successfully.
*/
private eatRegExpIdentifierStart(): boolean {
const start = this.index
const forceUFlag = !this._uFlag && this.ecmaVersion >= 2020
let cp = this.currentCodePoint
this.advance()

if (cp === ReverseSolidus && this.eatRegExpUnicodeEscapeSequence()) {
if (
cp === ReverseSolidus &&
this.eatRegExpUnicodeEscapeSequence(forceUFlag)
) {
cp = this._lastIntValue
} else if (
forceUFlag &&
isLeadSurrogate(cp) &&
isTrailSurrogate(this.currentCodePoint)
) {
cp = combineSurrogatePair(cp, this.currentCodePoint)
this.advance()
}

if (isRegExpIdentifierStart(cp)) {
this._lastIntValue = cp
return true
Expand All @@ -1893,20 +1909,33 @@ export class RegExpValidator {
* UnicodeIDContinue
* `$`
* `_`
* `\` RegExpUnicodeEscapeSequence[?U]
* `\` RegExpUnicodeEscapeSequence[+U]
* [~U] UnicodeLeadSurrogate UnicodeTrailSurrogate
* <ZWNJ>
* <ZWJ>
* ```
* @returns `true` if it ate the next characters successfully.
*/
private eatRegExpIdentifierPart(): boolean {
const start = this.index
const forceUFlag = !this._uFlag && this.ecmaVersion >= 2020
let cp = this.currentCodePoint
this.advance()

if (cp === ReverseSolidus && this.eatRegExpUnicodeEscapeSequence()) {
if (
cp === ReverseSolidus &&
this.eatRegExpUnicodeEscapeSequence(forceUFlag)
) {
cp = this._lastIntValue
} else if (
forceUFlag &&
isLeadSurrogate(cp) &&
isTrailSurrogate(this.currentCodePoint)
) {
cp = combineSurrogatePair(cp, this.currentCodePoint)
this.advance()
}

if (isRegExpIdentifierPart(cp)) {
this._lastIntValue = cp
return true
Expand Down Expand Up @@ -2027,19 +2056,19 @@ export class RegExpValidator {
* ```
* @returns `true` if it ate the next characters successfully.
*/
private eatRegExpUnicodeEscapeSequence(): boolean {
private eatRegExpUnicodeEscapeSequence(forceUFlag = false): boolean {
const start = this.index
const uFlag = forceUFlag || this._uFlag

if (this.eat(LatinSmallLetterU)) {
if (
(this._uFlag && this.eatRegExpUnicodeSurrogatePairEscape()) ||
(uFlag && this.eatRegExpUnicodeSurrogatePairEscape()) ||
this.eatFixedHexDigits(4) ||
(this._uFlag && this.eatRegExpUnicodeCodePointEscape())
(uFlag && this.eatRegExpUnicodeCodePointEscape())
) {
return true
}

if (this.strict || this._uFlag) {
if (this.strict || uFlag) {
this.raise("Invalid unicode escape")
}
this.rewind(start)
Expand All @@ -2062,16 +2091,14 @@ export class RegExpValidator {
if (this.eatFixedHexDigits(4)) {
const lead = this._lastIntValue
if (
lead >= 0xd800 &&
lead <= 0xdbff &&
isLeadSurrogate(lead) &&
this.eat(ReverseSolidus) &&
this.eat(LatinSmallLetterU) &&
this.eatFixedHexDigits(4)
) {
const trail = this._lastIntValue
if (trail >= 0xdc00 && trail <= 0xdfff) {
this._lastIntValue =
(lead - 0xd800) * 0x400 + (trail - 0xdc00) + 0x10000
if (isTrailSurrogate(trail)) {
this._lastIntValue = combineSurrogatePair(lead, trail)
return true
}
}
Expand Down
44 changes: 44 additions & 0 deletions test/fixtures/parser/literal/unicode-group-names-invalid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"options": {
"ecmaVersion": 2020,
"strict": false
},
"patterns": {
"/(?<\\ud83d\\ude80>.)/": {
"error": {
"message": "Invalid regular expression: /(?<\\ud83d\\ude80>.)/: Invalid capture group name",
"index": 4
}
},
"/(?<\\ud83d\\ude80>.)/u": {
"error": {
"message": "Invalid regular expression: /(?<\\ud83d\\ude80>.)/u: Invalid capture group name",
"index": 4
}
},
"/(?<\\u{1f680}>.)/": {
"error": {
"message": "Invalid regular expression: /(?<\\u{1f680}>.)/: Invalid capture group name",
"index": 4
}
},
"/(?<\\u{1f680}>.)/u": {
"error": {
"message": "Invalid regular expression: /(?<\\u{1f680}>.)/u: Invalid capture group name",
"index": 4
}
},
"/(?<🚀>.)/": {
"error": {
"message": "Invalid regular expression: /(?<🚀>.)/: Invalid capture group name",
"index": 4
}
},
"/(?<🚀>.)/u": {
"error": {
"message": "Invalid regular expression: /(?<🚀>.)/u: Invalid capture group name",
"index": 4
}
}
}
}

0 comments on commit 3ab3e24

Please sign in to comment.