From 571c34cbf56799a506e1bcd265c1e3729af59f61 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 26 May 2022 20:46:35 +0100 Subject: [PATCH] Parse Java character properties These correspond to various `is`-prefixed accessors on `java.lang.Character`. For now, parse them, but mark them unsupported. --- Sources/_RegexParser/Regex/AST/Atom.swift | 27 ++++++++++++++++++- .../CharacterPropertyClassification.swift | 7 +++-- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 ++ .../_StringProcessing/ConsumerInterface.swift | 3 +++ Tests/RegexTests/ParseTests.swift | 4 +++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 680a29dee..992604852 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -450,6 +450,9 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) + /// Some special properties implemented by Java. + case javaSpecial(JavaSpecial) + public enum MapKind: Hashable { case lowercase case uppercase @@ -457,7 +460,6 @@ extension AST.Atom.CharacterProperty { } } - // TODO: erm, separate out or fold into something? splat it in? public enum PCRESpecialCategory: String, Hashable { case alphanumeric = "Xan" case posixSpace = "Xps" @@ -465,6 +467,29 @@ extension AST.Atom.CharacterProperty { case universallyNamed = "Xuc" case perlWord = "Xwd" } + + /// Special Java properties that correspond to methods on + /// `java.lang.Character`, with the `java` prefix replaced by `is`. + public enum JavaSpecial: String, Hashable, CaseIterable { + case alphabetic = "javaAlphabetic" + case defined = "javaDefined" + case digit = "javaDigit" + case identifierIgnorable = "javaIdentifierIgnorable" + case ideographic = "javaIdeographic" + case isoControl = "javaISOControl" + case javaIdentifierPart = "javaJavaIdentifierPart" // not a typo, that's actually the name + case javaIdentifierStart = "javaJavaIdentifierStart" // not a typo, that's actually the name + case javaLetter = "javaLetter" + case javaLetterOrDigit = "javaLetterOrDigit" + case lowerCase = "javaLowerCase" + case mirrored = "javaMirrored" + case spaceChar = "javaSpaceChar" + case titleCase = "javaTitleCase" + case unicodeIdentifierPart = "javaUnicodeIdentifierPart" + case unicodeIdentifierStart = "javaUnicodeIdentifierStart" + case upperCase = "javaUpperCase" + case whitespace = "javaWhitespace" + } } extension AST.Atom { diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 2ff162390..fb122e027 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -764,12 +764,15 @@ extension Source { return .block(block) } - // PCRE special properties. - // TODO: Normalize? + // Special properties from other engines. typealias PCRESpecial = AST.Atom.CharacterProperty.PCRESpecialCategory if let pcreSpecial = PCRESpecial(rawValue: value) { return .pcreSpecial(pcreSpecial) } + typealias JavaSpecial = AST.Atom.CharacterProperty.JavaSpecial + if let javaSpecial = JavaSpecial(rawValue: value) { + return .javaSpecial(javaSpecial) + } // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 99654991a..c803087be 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -175,6 +175,8 @@ extension RegexValidator { throw error(.unsupported("PCRE property"), at: loc) case .block: throw error(.unsupported("Unicode block property"), at: loc) + case .javaSpecial: + throw error(.unsupported("Java property"), at: loc) } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 6828245b1..640fe3c93 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -520,6 +520,9 @@ extension AST.Atom.CharacterProperty { case .pcreSpecial(let s): throw Unsupported("TODO: map PCRE special: \(s)") + + case .javaSpecial(let s): + throw Unsupported("TODO: map Java special: \(s)") } }() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 640186a2b..0daa4a457 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1385,6 +1385,10 @@ extension RegexTests { parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) } + for j in AST.Atom.CharacterProperty.JavaSpecial.allCases { + parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported) + } + // Try prefixing each block property with "in" to make sure we don't stomp // on any other property shorthands. for b in Unicode.Block.allCases {