Skip to content

Commit

Permalink
Parse Java character properties
Browse files Browse the repository at this point in the history
These properties (e.g. `javaLowerCase`) correspond to the various
`is`-prefixed accessors on `java.lang.Character` (e.g. `isLowerCase`).
For now, parse them, but mark them as unsupported.
  • Loading branch information
hamishknight committed May 27, 2022
1 parent 0c5d625 commit 571c34c
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 3 deletions.
27 changes: 26 additions & 1 deletion Sources/_RegexParser/Regex/AST/Atom.swift
Original file line number Diff line number Diff line change
Expand Up @@ -450,21 +450,46 @@ extension AST.Atom.CharacterProperty {
/// Some special properties implemented by PCRE and Oniguruma.
case pcreSpecial(PCRESpecialCategory)

/// Some special properties implemented by Java.
case javaSpecial(JavaSpecial)

/// The casing variants of a "map" kind: lowercase, uppercase, or titlecase.
///
/// NOTE(review): presumably selects which case mapping a character property
/// refers to — confirm against the uses of `MapKind`.
public enum MapKind: Hashable {
  case lowercase, uppercase, titlecase
}
}

// TODO: erm, separate out or fold into something? splat it in?
/// Special properties implemented by PCRE and Oniguruma.
///
/// Each case's raw value is the exact property name as written in a pattern,
/// so `PCRESpecialCategory(rawValue:)` can be used to recognize one. Matching
/// is case-sensitive: `"Xan"` is recognized, `"xan"` is not.
///
/// Conforms to `CaseIterable` (like the sibling `JavaSpecial`) so callers and
/// tests can enumerate every supported name via `allCases`.
public enum PCRESpecialCategory: String, Hashable, CaseIterable {
  /// `Xan`
  case alphanumeric     = "Xan"
  /// `Xps`
  case posixSpace       = "Xps"
  /// `Xsp`
  case perlSpace        = "Xsp"
  /// `Xuc`
  case universallyNamed = "Xuc"
  /// `Xwd`
  case perlWord         = "Xwd"
}

/// Java-specific character properties.
///
/// Each raw value is the property name as spelled in a pattern. A name of the
/// form `javaXyz` corresponds to the `java.lang.Character` method obtained by
/// replacing the `java` prefix with `is` (i.e. `isXyz`).
public enum JavaSpecial: String, Hashable, CaseIterable {
  case alphabetic             = "javaAlphabetic"
  case defined                = "javaDefined"
  case digit                  = "javaDigit"
  case identifierIgnorable    = "javaIdentifierIgnorable"
  case ideographic            = "javaIdeographic"
  case isoControl             = "javaISOControl"
  // The doubled "java" in the next two names is intentional, not a typo:
  // that is the actual property name.
  case javaIdentifierPart     = "javaJavaIdentifierPart"
  case javaIdentifierStart    = "javaJavaIdentifierStart"
  case javaLetter             = "javaLetter"
  case javaLetterOrDigit      = "javaLetterOrDigit"
  case lowerCase              = "javaLowerCase"
  case mirrored               = "javaMirrored"
  case spaceChar              = "javaSpaceChar"
  case titleCase              = "javaTitleCase"
  case unicodeIdentifierPart  = "javaUnicodeIdentifierPart"
  case unicodeIdentifierStart = "javaUnicodeIdentifierStart"
  case upperCase              = "javaUpperCase"
  case whitespace             = "javaWhitespace"
}
}

extension AST.Atom {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -764,12 +764,15 @@ extension Source {
return .block(block)
}

// PCRE special properties.
// TODO: Normalize?
// Special properties from other engines.
typealias PCRESpecial = AST.Atom.CharacterProperty.PCRESpecialCategory
if let pcreSpecial = PCRESpecial(rawValue: value) {
return .pcreSpecial(pcreSpecial)
}
typealias JavaSpecial = AST.Atom.CharacterProperty.JavaSpecial
if let javaSpecial = JavaSpecial(rawValue: value) {
return .javaSpecial(javaSpecial)
}

// TODO: This should be versioned, and do we want a more lax behavior for
// the runtime?
Expand Down
2 changes: 2 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Sema.swift
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ extension RegexValidator {
throw error(.unsupported("PCRE property"), at: loc)
case .block:
throw error(.unsupported("Unicode block property"), at: loc)
case .javaSpecial:
throw error(.unsupported("Java property"), at: loc)
}
}

Expand Down
3 changes: 3 additions & 0 deletions Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,9 @@ extension AST.Atom.CharacterProperty {

case .pcreSpecial(let s):
throw Unsupported("TODO: map PCRE special: \(s)")

case .javaSpecial(let s):
throw Unsupported("TODO: map Java special: \(s)")
}
}()

Expand Down
4 changes: 4 additions & 0 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1385,6 +1385,10 @@ extension RegexTests {
parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked)
}

for j in AST.Atom.CharacterProperty.JavaSpecial.allCases {
parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported)
}

// Try prefixing each block property with "in" to make sure we don't stomp
// on any other property shorthands.
for b in Unicode.Block.allCases {
Expand Down

0 comments on commit 571c34c

Please sign in to comment.