-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: transform negative lookaheads #560
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
import regexpAST from 'regexp-tree' | ||
import regexpAST, { NodePath } from 'regexp-tree' | ||
|
||
import { FunctionConfig, HTTPMethod, Path } from './config.js' | ||
import { FeatureFlags } from './feature_flags.js' | ||
|
@@ -116,20 +116,73 @@ const createDeclarationsFromFunctionConfigs = ( | |
return declarations | ||
} | ||
|
||
// Validates and normalizes a pattern so that it's a valid regular expression | ||
// in Go, which is the engine used by our edge nodes. | ||
export const parsePattern = (pattern: string) => { | ||
/** | ||
* Validates and normalizes a pattern so that it's a valid regular expression | ||
* in Go, which is the engine used by our edge nodes. | ||
* | ||
* @param pattern Original regular expression | ||
* @param extractedExclusionPatterns If set, negative lookaheads (which are | ||
* not supported in Go) are transformed into a list of exclusion patterns to | ||
* be stored in this array | ||
* @returns Normalized regular expression | ||
*/ | ||
export const parsePattern = (pattern: string, extractedExclusionPatterns?: string[]) => { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Could we return There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That would be the cleanest way, for sure. But |
||
let enclosedPattern = pattern | ||
if (!pattern.startsWith('^')) enclosedPattern = `^${enclosedPattern}` | ||
if (!pattern.endsWith('$')) enclosedPattern = `${enclosedPattern}$` | ||
|
||
if (!pattern.startsWith('^')) { | ||
enclosedPattern = `^${enclosedPattern}` | ||
} | ||
|
||
if (!pattern.endsWith('$')) { | ||
enclosedPattern = `${enclosedPattern}$` | ||
} | ||
|
||
let lookaheadDepth = 0 | ||
|
||
// Holds the location of every lookahead expression found. | ||
const lookaheads = new Set<string>() | ||
const regexp = new RegExp(enclosedPattern) | ||
const newRegexp = regexpAST.transform(regexp, { | ||
Assertion(path) { | ||
// Lookaheads are not supported. If we find one, throw an error. | ||
if (path.node.kind === 'Lookahead') { | ||
throw new Error('Regular expressions with lookaheads are not supported') | ||
} | ||
Assertion: { | ||
// If we're entering a negative lookahead expression, register its | ||
// location. | ||
pre(path) { | ||
if (path.node.kind !== 'Lookahead') { | ||
return | ||
} | ||
|
||
if (!extractedExclusionPatterns) { | ||
throw new Error('Regular expressions with lookaheads are not supported') | ||
} | ||
|
||
if (!path.node.negative) { | ||
throw new Error('Regular expressions with positive lookaheads are not supported') | ||
} | ||
|
||
lookaheadDepth += 1 | ||
|
||
if (lookaheadDepth > 1) { | ||
throw new Error('Regular expressions with nested lookaheads are not supported') | ||
} | ||
|
||
const lookahead = serializeNodeLocation(path.node) | ||
|
||
if (lookahead) { | ||
lookaheads.add(lookahead) | ||
} | ||
}, | ||
|
||
// If we're leaving a negative lookahead expression, remove it from the | ||
// AST. We'll later replace its functionality with an exclusion pattern. | ||
post(path) { | ||
if (path.node.kind !== 'Lookahead' || !path.node.negative) { | ||
return | ||
} | ||
|
||
lookaheadDepth -= 1 | ||
|
||
path.remove() | ||
}, | ||
}, | ||
|
||
Group(path) { | ||
|
@@ -146,6 +199,79 @@ export const parsePattern = (pattern: string) => { | |
}, | ||
}) | ||
|
||
// The `extractedExclusionPatterns` property works as a shut-off valve: if | ||
// it's not supplied, don't even traverse the AST again to further process | ||
// lookaheads. | ||
if (extractedExclusionPatterns) { | ||
const exclusionPatterns = [...lookaheads].map((lookahead) => getExclusionPatternFromLookahead(regexp, lookahead)) | ||
|
||
extractedExclusionPatterns.push(...exclusionPatterns) | ||
Comment on lines
+202
to
+208
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. have you thought about introducing a logging-only mechanism that can give you data on real-world examples so you can spot check if this code is correct? |
||
} | ||
|
||
// Strip leading and forward slashes. | ||
return newRegexp.toString().slice(1, -1) | ||
} | ||
|
||
/** | ||
* Takes a regular expression and a lookahead inside it and returns a new | ||
* regular expression that acts as an exclusion pattern to replace the | ||
* lookahead. | ||
* | ||
* @param regexp Original regular expression | ||
* @param location Serialized location of the lookahead | ||
* @returns Exclusion pattern regular expression | ||
*/ | ||
const getExclusionPatternFromLookahead = (regexp: RegExp, location: string) => { | ||
const exclusionRegexp = regexpAST.transform(regexp, { | ||
Assertion(path) { | ||
if ( | ||
path.node.kind !== 'Lookahead' || | ||
path.node.assertion === null || | ||
serializeNodeLocation(path.node) !== location | ||
) { | ||
return | ||
} | ||
|
||
// Unwrap the lookahead by replacing it with the expression it holds — | ||
// e.g. `(?!foo)` becomes `foo`. | ||
path.replace(path.node.assertion) | ||
|
||
// Traverse the parents of the lookahead all the way up to the root. When | ||
// we find a disjunction, replace it with the child we travelled from. In | ||
// practice this means getting rid of all the branches that are not the | ||
// lookahead. | ||
// For example, in `(a|b(?!c)|d)` the exclusion patterns cannot contain | ||
// the `a` or `d` branches of the disjunction, otherwise `ab` and `ad` | ||
// would incorrectly be excluded. The exclusion must be `bc` only. | ||
let visitor: NodePath | null = path | ||
|
||
while (visitor !== null) { | ||
const child = visitor | ||
|
||
visitor = visitor.parentPath | ||
|
||
if (visitor?.node.type !== 'Disjunction') { | ||
continue | ||
} | ||
|
||
visitor.replace(child.node) | ||
} | ||
}, | ||
}) | ||
|
||
return exclusionRegexp.toString() | ||
} | ||
|
||
/** | ||
* Creates a string representation of a regexp AST node in the format | ||
* `<start line>,<start column>,<start offset>,<end line>,<end column>,<end offset>` | ||
*/ | ||
const serializeNodeLocation = (node: NodePath['node']) => { | ||
if (!node.loc) { | ||
return '' | ||
} | ||
|
||
const { start, end } = node.loc | ||
|
||
return [start.line, start.column, start.offset, end.line, end.column, end.offset].join(',') | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In practice, this is asserting that if this property (which is set by the feature flag) is not used, we'll keep the existing behaviour of throwing on lookaheads.