Skip to content

Commit

Permalink
Merge pull request apple#445 from hamishknight/trivia-pursuit-5.7
Browse files Browse the repository at this point in the history
  • Loading branch information
hamishknight committed Jun 1, 2022
2 parents 3d44861 + 4328e73 commit 8f0160b
Show file tree
Hide file tree
Showing 7 changed files with 236 additions and 143 deletions.
12 changes: 11 additions & 1 deletion Sources/_RegexParser/Regex/AST/CustomCharClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,16 @@ extension AST {
public var lhs: Atom
public var dashLoc: SourceLocation
public var rhs: Atom
public var trivia: [AST.Trivia]

public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) {
public init(
_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom,
trivia: [AST.Trivia]
) {
self.lhs = lhs
self.dashLoc = dashLoc
self.rhs = rhs
self.trivia = trivia
}
}
public enum SetOp: String, Hashable {
Expand Down Expand Up @@ -95,6 +100,11 @@ extension CustomCC.Member {
return false
}

public var asTrivia: AST.Trivia? {
guard case .trivia(let t) = self else { return nil }
return t
}

public var isSemantic: Bool {
!isTrivia
}
Expand Down
140 changes: 74 additions & 66 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,14 @@ extension Source {
return result
}

/// Perform a lookahead using a temporary source. Within the body of the
/// lookahead, any modifications to the source will not be reflected outside
/// the body.
func lookahead<T>(_ body: (inout Source) throws -> T) rethrows -> T {
var src = self
return try body(&src)
}

/// Attempt to eat the given character, returning its source location if
/// successful, `nil` otherwise.
mutating func tryEatWithLoc(_ c: Character) -> SourceLocation? {
Expand Down Expand Up @@ -413,9 +421,7 @@ extension Source {
) throws -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
var trivia: [AST.Trivia] = []

if let t = try lexNonSemanticWhitespace(context: context) {
trivia.append(t)
}
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }

let amt: Located<Quant.Amount>? = try recordLoc { src in
if src.tryEat("*") { return .zeroOrMore }
Expand All @@ -424,7 +430,7 @@ extension Source {

return try src.tryEating { src in
guard src.tryEat("{"),
let range = try src.lexRange(context: context),
let range = try src.lexRange(context: context, trivia: &trivia),
src.tryEat("}")
else { return nil }
return range.value
Expand All @@ -433,9 +439,7 @@ extension Source {
guard let amt = amt else { return nil }

// PCRE allows non-semantic whitespace here in extended syntax mode.
if let t = try lexNonSemanticWhitespace(context: context) {
trivia.append(t)
}
if let t = lexNonSemanticWhitespace(context: context) { trivia.append(t) }

let kind: Located<Quant.Kind> = recordLoc { src in
if src.tryEat("?") { return .reluctant }
Expand All @@ -452,11 +456,17 @@ extension Source {
/// | ExpRange
/// ExpRange -> '..<' <Int> | '...' <Int>
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
mutating func lexRange(context: ParsingContext) throws -> Located<Quant.Amount>? {
mutating func lexRange(
context: ParsingContext, trivia: inout [AST.Trivia]
) throws -> Located<Quant.Amount>? {
try recordLoc { src in
try src.tryEating { src in
if let t = src.lexWhitespace() { trivia.append(t) }

let lowerOpt = try src.lexNumber()

if let t = src.lexWhitespace() { trivia.append(t) }

// ',' or '...' or '..<' or nothing
// TODO: We ought to try and consume whitespace here and emit a
// diagnostic for the user warning them that it would cause the range to
Expand All @@ -476,11 +486,15 @@ extension Source {
closedRange = nil
}

if let t = src.lexWhitespace() { trivia.append(t) }

let upperOpt = try src.lexNumber()?.map { upper in
// If we have an open range, the upper bound should be adjusted down.
closedRange == true ? upper : upper - 1
}

if let t = src.lexWhitespace() { trivia.append(t) }

switch (lowerOpt, closedRange, upperOpt) {
case let (l?, nil, nil):
return .exactly(l)
Expand Down Expand Up @@ -625,11 +639,11 @@ extension Source {
///
mutating func lexComment(context: ParsingContext) throws -> AST.Trivia? {
let trivia: Located<String>? = try recordLoc { src in
if src.tryEat(sequence: "(?#") {
return try src.expectQuoted(endingWith: ")").value
if !context.isInCustomCharacterClass && src.tryEat(sequence: "(?#") {
return try src.lexUntil(eating: ")").value
}
if context.experimentalComments, src.tryEat(sequence: "/*") {
return try src.expectQuoted(endingWith: "*/").value
return try src.lexUntil(eating: "*/").value
}
if context.endOfLineComments, src.tryEat("#") {
// Try eat until we either exhaust the input, or hit a newline. Note
Expand Down Expand Up @@ -667,7 +681,7 @@ extension Source {
/// Does nothing unless `SyntaxOptions.nonSemanticWhitespace` is set
mutating func lexNonSemanticWhitespace(
context: ParsingContext
) throws -> AST.Trivia? {
) -> AST.Trivia? {
guard context.ignoreWhitespace else { return nil }

// FIXME: PCRE only treats space and tab characters as whitespace when
Expand Down Expand Up @@ -699,7 +713,7 @@ extension Source {
if let comment = try lexComment(context: context) {
return comment
}
if let whitespace = try lexNonSemanticWhitespace(context: context) {
if let whitespace = lexNonSemanticWhitespace(context: context) {
return whitespace
}
return nil
Expand Down Expand Up @@ -1178,8 +1192,7 @@ extension Source {
}
}

mutating func lexCustomCCStart(
) throws -> Located<CustomCC.Start>? {
mutating func lexCustomCCStart() -> Located<CustomCC.Start>? {
recordLoc { src in
// Make sure we don't have a POSIX character property. This may require
// walking to its ending to make sure we have a closing ':]', as otherwise
Expand Down Expand Up @@ -1240,8 +1253,9 @@ extension Source {

private func canLexPOSIXCharacterProperty() -> Bool {
do {
var src = self
return try src.lexPOSIXCharacterProperty() != nil
return try lookahead { src in
try src.lexPOSIXCharacterProperty() != nil
}
} catch {
// We want to tend on the side of lexing a POSIX character property, so
// even if it is invalid in some way (e.g invalid property names), still
Expand Down Expand Up @@ -1394,10 +1408,11 @@ extension Source {

/// Checks whether a numbered reference can be lexed.
private func canLexNumberedReference() -> Bool {
var src = self
_ = src.tryEat(anyOf: "+", "-")
guard let next = src.peek() else { return false }
return RadixKind.decimal.characterFilter(next)
lookahead { src in
_ = src.tryEat(anyOf: "+", "-")
guard let next = src.peek() else { return false }
return RadixKind.decimal.characterFilter(next)
}
}

/// Eat a named reference up to a given closing delimiter.
Expand Down Expand Up @@ -1587,53 +1602,55 @@ extension Source {

/// Whether we can lex a group-like reference after the specifier '(?'.
private func canLexGroupLikeReference() -> Bool {
var src = self
if src.tryEat("P") {
return src.tryEat(anyOf: "=", ">") != nil
}
if src.tryEat(anyOf: "&", "R") != nil {
return true
lookahead { src in
if src.tryEat("P") {
return src.tryEat(anyOf: "=", ">") != nil
}
if src.tryEat(anyOf: "&", "R") != nil {
return true
}
return src.canLexNumberedReference()
}
return src.canLexNumberedReference()
}

private func canLexMatchingOptionsAsAtom(context: ParsingContext) -> Bool {
var src = self

// See if we can lex a matching option sequence that terminates in ')'. Such
// a sequence is an atom. If an error is thrown, there are invalid elements
// of the matching option sequence. In such a case, we can lex as a group
// and diagnose the invalid group kind.
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
return false
lookahead { src in
// See if we can lex a matching option sequence that terminates in ')'.
// Such a sequence is an atom. If an error is thrown, there are invalid
// elements of the matching option sequence. In such a case, we can lex as
// a group and diagnose the invalid group kind.
guard (try? src.lexMatchingOptionSequence(context: context)) != nil else {
return false
}
return src.tryEat(")")
}
return src.tryEat(")")
}

/// Whether a group specifier should be lexed as an atom instead of a group.
private func shouldLexGroupLikeAtom(context: ParsingContext) -> Bool {
var src = self
guard src.tryEat("(") else { return false }
lookahead { src in
guard src.tryEat("(") else { return false }

if src.tryEat("?") {
// The start of a reference '(?P=', '(?R', ...
if src.canLexGroupLikeReference() { return true }
if src.tryEat("?") {
// The start of a reference '(?P=', '(?R', ...
if src.canLexGroupLikeReference() { return true }

// The start of a PCRE callout.
if src.tryEat("C") { return true }
// The start of a PCRE callout.
if src.tryEat("C") { return true }

// The start of an Oniguruma 'of-contents' callout.
if src.tryEat("{") { return true }
// The start of an Oniguruma 'of-contents' callout.
if src.tryEat("{") { return true }

// A matching option atom (?x), (?i), ...
if src.canLexMatchingOptionsAsAtom(context: context) { return true }
// A matching option atom (?x), (?i), ...
if src.canLexMatchingOptionsAsAtom(context: context) { return true }

return false
}
// The start of a backreference directive or Oniguruma named callout.
if src.tryEat("*") { return true }

return false
}
// The start of a backreference directive or Oniguruma named callout.
if src.tryEat("*") { return true }

return false
}

/// Consume an escaped atom, starting from after the backslash
Expand Down Expand Up @@ -2022,20 +2039,11 @@ extension Source {
return AST.Atom(kind.value, kind.location)
}

/// Try to lex the end of a range in a custom character class, which consists
/// of a '-' character followed by an atom.
mutating func lexCustomCharClassRangeEnd(
context: ParsingContext
) throws -> (dashLoc: SourceLocation, AST.Atom)? {
// Make sure we don't have a binary operator e.g '--', and the '-' is not
// ending the custom character class (in which case it is literal).
guard peekCCBinOp() == nil, !starts(with: "-]"),
let dash = tryEatWithLoc("-"),
let end = try lexAtom(context: context)
else {
return nil
}
return (dash, end)
/// Try to lex the range operator '-' for a custom character class.
mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? {
// Eat a '-', making sure we don't have a binary op such as '--'.
guard peekCCBinOp() == nil else { return nil }
return tryEatWithLoc("-")
}

/// Try to consume a newline sequence matching option kind.
Expand Down
Loading

0 comments on commit 8f0160b

Please sign in to comment.