From 0c5d625caac7699c38b74f2e57579051bfb96fc1 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 26 May 2022 20:46:35 +0100 Subject: [PATCH 1/2] Formalize Unicode block properties Previously we only supported a subset of the Oniguruma spellings for these. Introduce them as an actual Unicode property with the key `blk` or `block`. Additionally, allow a non-Unicode shorthand syntax that uses the prefix `in`. This is supported by Oniguruma and Perl (though Perl discourages its usage). We may want to warn/error on it and suggest users switch to the more explicit form. --- Sources/_RegexParser/Regex/AST/Atom.swift | 8 +- .../CharacterPropertyClassification.swift | 358 ++++++- .../Regex/Parse/Diagnostics.swift | 3 + Sources/_RegexParser/Regex/Parse/Sema.swift | 2 +- .../_RegexParser/Utility/MissingUnicode.swift | 981 ++++++++++++------ .../_StringProcessing/ConsumerInterface.swift | 6 +- Tests/RegexTests/ParseTests.swift | 58 +- 7 files changed, 1069 insertions(+), 347 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index eba720f9b..680a29dee 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -441,13 +441,15 @@ extension AST.Atom.CharacterProperty { /// Character age, as per UnicodeScalar.Properties.age. case age(major: Int, minor: Int) - + + /// A block property. + case block(Unicode.Block) + case posix(Unicode.POSIXProperty) /// Some special properties implemented by PCRE and Oniguruma. 
case pcreSpecial(PCRESpecialCategory) - case onigurumaSpecial(OnigurumaSpecialProperty) - + public enum MapKind: Hashable { case lowercase case uppercase diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 21b5ddc68..2ff162390 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -13,13 +13,17 @@ extension Source { typealias PropertyKind = AST.Atom.CharacterProperty.Kind static private func withNormalizedForms( - _ str: String, match: (String) throws -> T? + _ str: String, requireInPrefix: Bool = false, match: (String) throws -> T? ) rethrows -> T? { // This follows the rules provided by UAX44-LM3, including trying to drop an // "is" prefix, which isn't required by UTS#18 RL1.2, but is nice for // consistency with other engines and the Unicode.Scalar.Properties names. let str = str.filter { !$0.isPatternWhitespace && $0 != "_" && $0 != "-" } .lowercased() + if requireInPrefix { + guard str.hasPrefix("in") else { return nil } + return try match(String(str.dropFirst(2))) + } if let m = try match(str) { return m } @@ -364,6 +368,342 @@ extension Source { } } + static private func classifyBlockProperty( + _ value: String, valueOnly: Bool + ) -> Unicode.Block? { + // Require an 'in' prefix for the shorthand variant. This is supported by + // Oniguruma and Perl. + // TODO: Perl discourages the shorthand 'in' prefix, should we diagnose and + // suggest an explicit key/value? 
+ withNormalizedForms(value, requireInPrefix: valueOnly) { str in + switch str { + case "adlam": return .adlam + case "aegeannumbers": return .aegeanNumbers + case "ahom": return .ahom + case "alchemical", "alchemicalsymbols": return .alchemicalSymbols + case "alphabeticpf", "alphabeticpresentationforms": return .alphabeticPresentationForms + case "anatolianhieroglyphs": return .anatolianHieroglyphs + case "ancientgreekmusic", "ancientgreekmusicalnotation": return .ancientGreekMusicalNotation + case "ancientgreeknumbers": return .ancientGreekNumbers + case "ancientsymbols": return .ancientSymbols + case "arabic": return .arabic + case "arabicexta", "arabicextendeda": return .arabicExtendedA + case "arabicextb", "arabicextendedb": return .arabicExtendedB + case "arabicmath", "arabicmathematicalalphabeticsymbols": return .arabicMathematicalAlphabeticSymbols + case "arabicpfa", "arabicpresentationformsa": return .arabicPresentationFormsA + case "arabicpfb", "arabicpresentationformsb": return .arabicPresentationFormsB + case "arabicsup", "arabicsupplement": return .arabicSupplement + case "armenian": return .armenian + case "arrows": return .arrows + case "ascii", "basiclatin": return .basicLatin + case "avestan": return .avestan + case "balinese": return .balinese + case "bamum": return .bamum + case "bamumsup", "bamumsupplement": return .bamumSupplement + case "bassavah": return .bassaVah + case "batak": return .batak + case "bengali": return .bengali + case "bhaiksuki": return .bhaiksuki + case "blockelements": return .blockElements + case "bopomofo": return .bopomofo + case "bopomofoext", "bopomofoextended": return .bopomofoExtended + case "boxdrawing": return .boxDrawing + case "brahmi": return .brahmi + case "braille", "braillepatterns": return .braillePatterns + case "buginese": return .buginese + case "buhid": return .buhid + case "byzantinemusic", "byzantinemusicalsymbols": return .byzantineMusicalSymbols + case "carian": return .carian + case 
"caucasianalbanian": return .caucasianAlbanian + case "chakma": return .chakma + case "cham": return .cham + case "cherokee": return .cherokee + case "cherokeesup", "cherokeesupplement": return .cherokeeSupplement + case "chesssymbols": return .chessSymbols + case "chorasmian": return .chorasmian + case "cjk", "cjkunifiedideographs": return .cjkUnifiedIdeographs + case "cjkcompat", "cjkcompatibility": return .cjkCompatibility + case "cjkcompatforms", "cjkcompatibilityforms": return .cjkcompatibilityForms + case "cjkcompatideographs", "cjkcompatibilityideographs": return .cjkCompatibilityIdeographs + case "cjkcompatideographssup", "cjkcompatibilityideographssupplement": return .cjkCompatibilityIdeographsSupplement + case "cjkexta", "cjkunifiedideographsextensiona": return .cjkUnifiedIdeographsExtensionA + case "cjkextb", "cjkunifiedideographsextensionb": return .cjkUnifiedIdeographsExtensionB + case "cjkextc", "cjkunifiedideographsextensionc": return .cjkUnifiedIdeographsExtensionC + case "cjkextd", "cjkunifiedideographsextensiond": return .cjkUnifiedIdeographsExtensionD + case "cjkexte", "cjkunifiedideographsextensione": return .cjkUnifiedIdeographsExtensionE + case "cjkextf", "cjkunifiedideographsextensionf": return .cjkUnifiedIdeographsExtensionF + case "cjkextg", "cjkunifiedideographsextensiong": return .cjkUnifiedIdeographsExtensionG + case "cjkradicalssup", "cjkradicalssupplement": return .cjkRadicalsSupplement + case "cjkstrokes": return .cjkStrokes + case "cjksymbols", "cjksymbolsandpunctuation": return .cjkSymbolsAndPunctuation + case "compatjamo", "hangulcompatibilityjamo": return .hangulCompatibilityJamo + case "controlpictures": return .controlPictures + case "coptic": return .coptic + case "copticepactnumbers": return .copticEpactNumbers + case "countingrod", "countingrodnumerals": return .countingRodNumerals + case "cuneiform": return .cuneiform + case "cuneiformnumbers", "cuneiformnumbersandpunctuation": return .cuneiformNumbersAndPunctuation + case 
"currencysymbols": return .currencySymbols + case "cypriotsyllabary": return .cypriotSyllabary + case "cyprominoan": return .cyproMinoan + case "cyrillic": return .cyrillic + case "cyrillicexta", "cyrillicextendeda": return .cyrillicExtendedA + case "cyrillicextb", "cyrillicextendedb": return .cyrillicExtendedB + case "cyrillicextc", "cyrillicextendedc": return .cyrillicExtendedC + case "cyrillicsup", "cyrillicsupplement", "cyrillicsupplementary": return .cyrillicSupplement + case "deseret": return .deseret + case "devanagari": return .devanagari + case "devanagariext", "devanagariextended": return .devanagariExtended + case "diacriticals", "combiningdiacriticalmarks": return .combiningDiacriticalMarks + case "diacriticalsext", "combiningdiacriticalmarksextended": return .combiningDiacriticalMarksExtended + case "diacriticalsforsymbols", "combiningdiacriticalmarksforsymbols", + "combiningmarksforsymbols": return .combiningDiacriticalMarksForSymbols + case "diacriticalssup", "combiningdiacriticalmarkssupplement": return .combiningDiacriticalMarksSupplement + case "dingbats": return .dingbats + case "divesakuru": return .divesAkuru + case "dogra": return .dogra + case "domino", "dominotiles": return .dominoTiles + case "duployan": return .duployan + case "earlydynasticcuneiform": return .earlyDynasticCuneiform + case "egyptianhieroglyphformatcontrols": return .egyptianHieroglyphFormatControls + case "egyptianhieroglyphs": return .egyptianHieroglyphs + case "elbasan": return .elbasan + case "elymaic": return .elymaic + case "emoticons": return .emoticons + case "enclosedalphanum", "enclosedalphanumerics": return .enclosedAlphanumerics + case "enclosedalphanumsup", "enclosedalphanumericsupplement": return .enclosedAlphanumericSupplement + case "enclosedcjk", "enclosedcjklettersandmonths": return .enclosedCJKLettersAndMonths + case "enclosedideographicsup", "enclosedideographicsupplement": return .enclosedIdeographicSupplement + case "ethiopic": return .ethiopic + case 
"ethiopicext", "ethiopicextended": return .ethiopicExtended + case "ethiopicexta", "ethiopicextendeda": return .ethiopicExtendedA + case "ethiopicextb", "ethiopicextendedb": return .ethiopicExtendedB + case "ethiopicsup", "ethiopicsupplement": return .ethiopicSupplement + case "geometricshapes": return .geometricShapes + case "geometricshapesext", "geometricshapesextended": return .geometricShapesExtended + case "georgian": return .georgian + case "georgianext", "georgianextended": return .georgianExtended + case "georgiansup", "georgiansupplement": return .georgianSupplement + case "glagolitic": return .glagolitic + case "glagoliticsup", "glagoliticsupplement": return .glagoliticSupplement + case "gothic": return .gothic + case "grantha": return .grantha + case "greek", "greekandcoptic": return .greekAndCoptic + case "greekext", "greekextended": return .greekExtended + case "gujarati": return .gujarati + case "gunjalagondi": return .gunjalaGondi + case "gurmukhi": return .gurmukhi + case "halfandfullforms", "halfwidthandfullwidthforms": return .halfwidthAndFullwidthForms + case "halfmarks", "combininghalfmarks": return .combiningHalfMarks + case "hangul", "hangulsyllables": return .hangulSyllables + case "hanifirohingya": return .hanifiRohingya + case "hanunoo": return .hanunoo + case "hatran": return .hatran + case "hebrew": return .hebrew + case "highpusurrogates", "highprivateusesurrogates": return .highPrivateUseSurrogates + case "highsurrogates": return .highSurrogates + case "hiragana": return .hiragana + case "idc", "ideographicdescriptioncharacters": return .ideographicDescriptionCharacters + case "ideographicsymbols", "ideographicsymbolsandpunctuation": return .ideographicSymbolsAndPunctuation + case "imperialaramaic": return .imperialAramaic + case "indicnumberforms", "commonindicnumberforms": return .commonIndicNumberForms + case "indicsiyaqnumbers": return .indicSiyaqNumbers + case "inscriptionalpahlavi": return .inscriptionalPahlavi + case 
"inscriptionalparthian": return .inscriptionalParthian + case "ipaext", "ipaextensions": return .ipaExtensions + case "jamo", "hanguljamo": return .hangulJamo + case "jamoexta", "hanguljamoextendeda": return .hangulJamoExtendedA + case "jamoextb", "hanguljamoextendedb": return .hangulJamoExtendedB + case "javanese": return .javanese + case "kaithi": return .kaithi + case "kanaexta", "kanaextendeda": return .kanaExtendedA + case "kanaextb", "kanaextendedb": return .kanaExtendedB + case "kanasup", "kanasupplement": return .kanaSupplement + case "kanbun": return .kanbun + case "kangxi", "kangxiradicals": return .kangxiRadicals + case "kannada": return .kannada + case "katakana": return .katakana + case "katakanaext", "katakanaphoneticextensions": return .katakanaPhoneticExtensions + case "kayahli": return .kayahLi + case "kharoshthi": return .kharoshthi + case "khitansmallscript": return .khitanSmallScript + case "khmer": return .khmer + case "khmersymbols": return .khmerSymbols + case "khojki": return .khojki + case "khudawadi": return .khudawadi + case "lao": return .lao + case "latin1sup", "latin1supplement", "latin1": return .latin1Supplement + case "latinexta", "latinextendeda": return .latinExtendedA + case "latinextadditional", "latinextendedadditional": return .latinExtendedAdditional + case "latinextb", "latinextendedb": return .latinExtendedB + case "latinextc", "latinextendedc": return .latinExtendedC + case "latinextd", "latinextendedd": return .latinExtendedD + case "latinexte", "latinextendede": return .latinExtendedE + case "latinextf", "latinextendedf": return .latinExtendedF + case "latinextg", "latinextendedg": return .latinExtendedG + case "lepcha": return .lepcha + case "letterlikesymbols": return .letterLikeSymbols + case "limbu": return .limbu + case "lineara": return .linearA + case "linearbideograms": return .linearBIdeograms + case "linearbsyllabary": return .linearBSyllabary + case "lisu": return .lisu + case "lisusup", "lisusupplement": 
return .lisuSupplement + case "lowsurrogates": return .lowSurrogates + case "lycian": return .lycian + case "lydian": return .lydian + case "mahajani": return .mahajani + case "mahjong", "mahjongtiles": return .mahjongTiles + case "makasar": return .makasar + case "malayalam": return .malayalam + case "mandaic": return .mandaic + case "manichaean": return .manichaean + case "marchen": return .marchen + case "masaramgondi": return .masaramGondi + case "mathalphanum", "mathematicalalphanumericsymbols": return .mathematicalAlphanumericSymbols + case "mathoperators", "mathematicaloperators": return .mathematicalOperators + case "mayannumerals": return .mayanNumerals + case "medefaidrin": return .medefaidrin + case "meeteimayek": return .meeteiMayek + case "meeteimayekext", "meeteimayekextensions": return .meeteiMayekExtensions + case "mendekikakui": return .mendeKikakui + case "meroiticcursive": return .meroiticCursive + case "meroitichieroglyphs": return .meroiticHieroglyphs + case "miao": return .miao + case "miscarrows", "miscellaneoussymbolsandarrows": return .miscellaneousSymbolsAndArrows + case "miscmathsymbolsa", "miscellaneousmathematicalsymbolsa": return .miscellaneousMathematicalSymbolsA + case "miscmathsymbolsb", "miscellaneousmathematicalsymbolsb": return .miscellaneousMathematicalSymbolsB + case "miscpictographs", "miscellaneoussymbolsandpictographs": return .miscellaneousSymbolsandPictographs + case "miscsymbols", "miscellaneoussymbols": return .miscellaneousSymbols + case "misctechnical", "miscellaneoustechnical": return .miscellaneousTechnical + case "modi": return .modi + case "modifierletters", "spacingmodifierletters": return .spacingModifierLetters + case "modifiertoneletters": return .modifierToneLetters + case "mongolian": return .mongolian + case "mongoliansup", "mongoliansupplement": return .mongolianSupplement + case "mro": return .mro + case "multani": return .multani + case "music", "musicalsymbols": return .musicalSymbols + case "myanmar": 
return .myanmar + case "myanmarexta", "myanmarextendeda": return .myanmarExtendedA + case "myanmarextb", "myanmarextendedb": return .myanmarExtendedB + case "nabataean": return .nabataean + case "nandinagari": return .nandinagari + case "nb", "noblock": return .noBlock + case "newtailue": return .newTailue + case "newa": return .newa + case "nko": return .nko + case "numberforms": return .numberForms + case "nushu": return .nushu + case "nyiakengpuachuehmong": return .nyiakengPuachueHmong + case "ocr", "opticalcharacterrecognition": return .opticalCharacterRecognition + case "ogham": return .ogham + case "olchiki": return .olChiki + case "oldhungarian": return .oldHungarian + case "olditalic": return .oldItalic + case "oldnortharabian": return .oldNorthArabian + case "oldpermic": return .oldPermic + case "oldpersian": return .oldPersian + case "oldsogdian": return .oldSogdian + case "oldsoutharabian": return .oldSouthArabian + case "oldturkic": return .oldTurkic + case "olduyghur": return .oldUyghur + case "oriya": return .oriya + case "ornamentaldingbats": return .ornamentalDingbats + case "osage": return .osage + case "osmanya": return .osmanya + case "ottomansiyaqnumbers": return .ottomanSiyaqNumbers + case "pahawhhmong": return .pahawhHmong + case "palmyrene": return .palmyrene + case "paucinhau": return .pauCinHau + case "phagspa": return .phagsPA + case "phaistos", "phaistosdisc": return .phaistosDisc + case "phoenician": return .phoenician + case "phoneticext", "phoneticextensions": return .phoneticExtensions + case "phoneticextsup", "phoneticextensionssupplement": return .phoneticExtensionsSupplement + case "playingcards": return .playingCards + case "psalterpahlavi": return .psalterPahlavi + case "pua", "privateusearea", "privateuse": return .privateUseArea + case "punctuation", "generalpunctuation": return .generalPunctuation + case "rejang": return .rejang + case "rumi", "ruminumeralsymbols": return .rumiNumeralSymbols + case "runic": return .runic + 
case "samaritan": return .samaritan + case "saurashtra": return .saurashtra + case "sharada": return .sharada + case "shavian": return .shavian + case "shorthandformatcontrols": return .shorthandFormatControls + case "siddham": return .siddham + case "sinhala": return .sinhala + case "sinhalaarchaicnumbers": return .sinhalaArchaicNumbers + case "smallforms", "smallformvariants": return .smallFormVariants + case "smallkanaext", "smallkanaextension": return .smallKanaExtension + case "sogdian": return .sogdian + case "sorasompeng": return .soraSompeng + case "soyombo": return .soyombo + case "specials": return .specials + case "sundanese": return .sundanese + case "sundanesesup", "sundanesesupplement": return .sundaneseSupplement + case "suparrowsa", "supplementalarrowsa": return .supplementalArrowsA + case "suparrowsb", "supplementalarrowsb": return .supplementalArrowsB + case "suparrowsc", "supplementalarrowsc": return .supplementalArrowsC + case "supmathoperators", "supplementalmathematicaloperators": return .supplementalMathematicalOperators + case "suppuaa", "supplementaryprivateuseareaa": return .supplementaryPrivateUseAreaA + case "suppuab", "supplementaryprivateuseareab": return .supplementaryPrivateUseAreaB + case "suppunctuation", "supplementalpunctuation": return .supplementalPunctuation + case "supsymbolsandpictographs", "supplementalsymbolsandpictographs": return .supplementalSymbolsAndPictographs + case "superandsub", "superscriptsandsubscripts": return .superscriptsAndSubscripts + case "suttonsignwriting": return .suttonSignwriting + case "sylotinagri": return .sylotiNagri + case "symbolsandpictographsexta", "symbolsandpictographsextendeda": return .symbolsAndPictographsExtendedA + case "symbolsforlegacycomputing": return .symbolsForLegacyComputing + case "syriac": return .syriac + case "syriacsup", "syriacsupplement": return .syriacSupplement + case "tagalog": return .tagalog + case "tagbanwa": return .tagbanwa + case "tags": return .tags + case 
"taile": return .taiLe + case "taitham": return .taiTham + case "taiviet": return .taiViet + case "taixuanjing", "taixuanjingsymbols": return .taiXuanJingSymbols + case "takri": return .takri + case "tamil": return .tamil + case "tamilsup", "tamilsupplement": return .tamilSupplement + case "tangsa": return .tangsa + case "tangut": return .tangut + case "tangutcomponents": return .tangutComponents + case "tangutsup", "tangutsupplement": return .tangutSupplement + case "telugu": return .telugu + case "thaana": return .thaana + case "thai": return .thai + case "tibetan": return .tibetan + case "tifinagh": return .tifinagh + case "tirhuta": return .tirhuta + case "toto": return .toto + case "transportandmap", "transportandmapsymbols": return .transportAndMapSymbols + case "ucas", "unifiedcanadianaboriginalsyllabics", "canadiansyllabics": return .unifiedCanadianAboriginalSyllabics + case "ucasext", "unifiedcanadianaboriginalsyllabicsextended": return .unifiedCanadianAboriginalSyllabicsExtended + case "ucasexta", "unifiedcanadianaboriginalsyllabicsextendeda": return .unifiedCanadianAboriginalSyllabicsExtendedA + case "ugaritic": return .ugaritic + case "vai": return .vai + case "vedicext", "vedicextensions": return .vedicExtensions + case "verticalforms": return .verticalForms + case "vithkuqi": return .vithkuqi + case "vs", "variationselectors": return .variationSelectors + case "vssup", "variationselectorssupplement": return .variationSelectorsSupplement + case "wancho": return .wancho + case "warangciti": return .warangCiti + case "yezidi": return .yezidi + case "yiradicals": return .yiRadicals + case "yisyllables": return .yiSyllables + case "yijing", "yijinghexagramsymbols": return .yijingHexagramSymbols + case "zanabazarsquare": return .zanabazarSquare + case "znamennymusic", "znamennymusicalnotation": return .znamennyMusicalNotation + default: return nil + } + } + } + static func classifySpecialPropValue(_ value: String) -> PropertyKind? 
{ withNormalizedForms(value) { str in switch str { @@ -420,12 +760,12 @@ extension Source { if let posix = classifyPOSIX(value) { return .posix(posix) } - - // Some additional special cases we recognise. - // TODO: Normalize these? - if let oniguruma = OnigurumaSpecialProperty(rawValue: value) { - return .onigurumaSpecial(oniguruma) + if let block = classifyBlockProperty(value, valueOnly: true) { + return .block(block) } + + // PCRE special properties. + // TODO: Normalize? typealias PCRESpecial = AST.Atom.CharacterProperty.PCRESpecialCategory if let pcreSpecial = PCRESpecial(rawValue: value) { return .pcreSpecial(pcreSpecial) @@ -493,6 +833,12 @@ extension Source { throw ParseError.invalidCCC(value) } return .ccc(.init(rawValue: cccValue)) + + case "blk", "block": + guard let block = classifyBlockProperty(value, valueOnly: false) else { + throw ParseError.unrecognizedBlock(value) + } + return .block(block) default: break } diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift index 382f78787..e8de5c827 100644 --- a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift +++ b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift @@ -62,6 +62,7 @@ enum ParseError: Error, Hashable { case unknownProperty(key: String?, value: String) case unrecognizedScript(String) case unrecognizedCategory(String) + case unrecognizedBlock(String) case invalidAge(String) case invalidNumericValue(String) case unrecognizedNumericType(String) @@ -194,6 +195,8 @@ extension ParseError: CustomStringConvertible { return "unrecognized script '\(value)'" case .unrecognizedCategory(let value): return "unrecognized category '\(value)'" + case .unrecognizedBlock(let value): + return "unrecognized block '\(value)'" case .unrecognizedNumericType(let value): return "unrecognized numeric type '\(value)'" case .invalidAge(let value): diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 
395dae23b..99654991a 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -173,7 +173,7 @@ extension RegexValidator { break case .pcreSpecial: throw error(.unsupported("PCRE property"), at: loc) - case .onigurumaSpecial: + case .block: throw error(.unsupported("Unicode block property"), at: loc) } } diff --git a/Sources/_RegexParser/Utility/MissingUnicode.swift b/Sources/_RegexParser/Utility/MissingUnicode.swift index 6cd243f43..c61c78c46 100644 --- a/Sources/_RegexParser/Utility/MissingUnicode.swift +++ b/Sources/_RegexParser/Utility/MissingUnicode.swift @@ -19,7 +19,7 @@ extension Unicode { // other script types. /// Character script types. - public enum Script: String, Hashable { + public enum Script: String, Hashable, CaseIterable { case adlam = "Adlam" case ahom = "Ahom" case anatolianHieroglyphs = "Anatolian_Hieroglyphs" @@ -187,7 +187,7 @@ extension Unicode { /// POSIX character properties not already covered by general categories or /// binary properties. - public enum POSIXProperty: String, Hashable { + public enum POSIXProperty: String, Hashable, CaseIterable { case alnum = "alnum" case blank = "blank" case graph = "graph" @@ -204,7 +204,7 @@ extension Unicode { /// Unicode.GeneralCategory + cases for "meta categories" such as "L", which /// encompasses Lu | Ll | Lt | Lm | Lo. - public enum ExtendedGeneralCategory: String, Hashable { + public enum ExtendedGeneralCategory: String, Hashable, CaseIterable { case other = "C" case control = "Cc" case format = "Cf" @@ -254,7 +254,7 @@ extension Unicode { /// A list of Unicode properties that can either be true or false. 
/// /// https://www.unicode.org/Public/UCD/latest/ucd/PropertyAliases.txt - public enum BinaryProperty: String, Hashable { + public enum BinaryProperty: String, Hashable, CaseIterable { case asciiHexDigit = "ASCII_Hex_Digit" case alphabetic = "Alphabetic" case bidiControl = "Bidi_Control" @@ -323,334 +323,653 @@ extension Unicode { case expandsOnNFKC = "Expands_On_NFKC" case expandsOnNFKD = "Expands_On_NFKD" } -} - -// TODO: These should become aliases for the Block (blk) Unicode character -// property. -/// Oniguruma properties that are not covered by Unicode spellings. -public enum OnigurumaSpecialProperty: String, Hashable { - case inBasicLatin = "In_Basic_Latin" - case inLatin1Supplement = "In_Latin_1_Supplement" - case inLatinExtendedA = "In_Latin_Extended_A" - case inLatinExtendedB = "In_Latin_Extended_B" - case inIPAExtensions = "In_IPA_Extensions" - case inSpacingModifierLetters = "In_Spacing_Modifier_Letters" - case inCombiningDiacriticalMarks = "In_Combining_Diacritical_Marks" - case inGreekAndCoptic = "In_Greek_and_Coptic" - case inCyrillic = "In_Cyrillic" - case inCyrillicSupplement = "In_Cyrillic_Supplement" - case inArmenian = "In_Armenian" - case inHebrew = "In_Hebrew" - case inArabic = "In_Arabic" - case inSyriac = "In_Syriac" - case inArabicSupplement = "In_Arabic_Supplement" - case inThaana = "In_Thaana" - case inNKo = "In_NKo" - case inSamaritan = "In_Samaritan" - case inMandaic = "In_Mandaic" - case inSyriacSupplement = "In_Syriac_Supplement" - case inArabicExtendedB = "In_Arabic_Extended_B" - case inArabicExtendedA = "In_Arabic_Extended_A" - case inDevanagari = "In_Devanagari" - case inBengali = "In_Bengali" - case inGurmukhi = "In_Gurmukhi" - case inGujarati = "In_Gujarati" - case inOriya = "In_Oriya" - case inTamil = "In_Tamil" - case inTelugu = "In_Telugu" - case inKannada = "In_Kannada" - case inMalayalam = "In_Malayalam" - case inSinhala = "In_Sinhala" - case inThai = "In_Thai" - case inLao = "In_Lao" - case inTibetan = "In_Tibetan" - case 
inMyanmar = "In_Myanmar" - case inGeorgian = "In_Georgian" - case inHangulJamo = "In_Hangul_Jamo" - case inEthiopic = "In_Ethiopic" - case inEthiopicSupplement = "In_Ethiopic_Supplement" - case inCherokee = "In_Cherokee" - case inUnifiedCanadianAboriginalSyllabics = "In_Unified_Canadian_Aboriginal_Syllabics" - case inOgham = "In_Ogham" - case inRunic = "In_Runic" - case inTagalog = "In_Tagalog" - case inHanunoo = "In_Hanunoo" - case inBuhid = "In_Buhid" - case inTagbanwa = "In_Tagbanwa" - case inKhmer = "In_Khmer" - case inMongolian = "In_Mongolian" - case inUnifiedCanadianAboriginalSyllabicsExtended = "In_Unified_Canadian_Aboriginal_Syllabics_Extended" - case inLimbu = "In_Limbu" - case inTaiLe = "In_Tai_Le" - case inNewTaiLue = "In_New_Tai_Lue" - case inKhmerSymbols = "In_Khmer_Symbols" - case inBuginese = "In_Buginese" - case inTaiTham = "In_Tai_Tham" - case inCombiningDiacriticalMarksExtended = "In_Combining_Diacritical_Marks_Extended" - case inBalinese = "In_Balinese" - case inSundanese = "In_Sundanese" - case inBatak = "In_Batak" - case inLepcha = "In_Lepcha" - case inOlChiki = "In_Ol_Chiki" - case inCyrillicExtendedC = "In_Cyrillic_Extended_C" - case inGeorgianExtended = "In_Georgian_Extended" - case inSundaneseSupplement = "In_Sundanese_Supplement" - case inVedicExtensions = "In_Vedic_Extensions" - case inPhoneticExtensions = "In_Phonetic_Extensions" - case inPhoneticExtensions_Supplement = "In_Phonetic_Extensions_Supplement" - case inCombiningDiacriticalMarksSupplement = "In_Combining_Diacritical_Marks_Supplement" - case inLatinExtendedAdditional = "In_Latin_Extended_Additional" - case inGreekExtended = "In_Greek_Extended" - case inGeneralPunctuation = "In_General_Punctuation" - case inSuperscriptsandSubscripts = "In_Superscripts_and_Subscripts" - case inCurrencySymbols = "In_Currency_Symbols" - case inCombiningDiacriticalMarksforSymbols = "In_Combining_Diacritical_Marks_for_Symbols" - case inLetterlikeSymbols = "In_Letterlike_Symbols" - case inNumberForms 
= "In_Number_Forms" - case inArrows = "In_Arrows" - case inMathematicalOperators = "In_Mathematical_Operators" - case inMiscellaneousTechnical = "In_Miscellaneous_Technical" - case inControlPictures = "In_Control_Pictures" - case inOpticalCharacterRecognition = "In_Optical_Character_Recognition" - case inEnclosedAlphanumerics = "In_Enclosed_Alphanumerics" - case inBoxDrawing = "In_Box_Drawing" - case inBlockElements = "In_Block_Elements" - case inGeometricShapes = "In_Geometric_Shapes" - case inMiscellaneousSymbols = "In_Miscellaneous_Symbols" - case inDingbats = "In_Dingbats" - case inMiscellaneousMathematicalSymbolsA = "In_Miscellaneous_Mathematical_Symbols_A" - case inSupplementalArrowsA = "In_Supplemental_Arrows_A" - case inBraillePatterns = "In_Braille_Patterns" - case inSupplementalArrowsB = "In_Supplemental_Arrows_B" - case inMiscellaneousMathematicalSymbolsB = "In_Miscellaneous_Mathematical_Symbols_B" - case inSupplementalMathematicalOperators = "In_Supplemental_Mathematical_Operators" - case inMiscellaneousSymbolsAndArrows = "In_Miscellaneous_Symbols_and_Arrows" - case inGlagolitic = "In_Glagolitic" - case inLatinExtendedC = "In_Latin_Extended_C" - case inCoptic = "In_Coptic" - case inGeorgianSupplement = "In_Georgian_Supplement" - case inTifinagh = "In_Tifinagh" - case inEthiopicExtended = "In_Ethiopic_Extended" - case inCyrillicExtendedA = "In_Cyrillic_Extended_A" - case inSupplementalPunctuation = "In_Supplemental_Punctuation" - case inCJKRadicalsSupplement = "In_CJK_Radicals_Supplement" - case inKangxiRadicals = "In_Kangxi_Radicals" - case inIdeographicDescriptionCharacters = "In_Ideographic_Description_Characters" - case inCJKSymbolsAndPunctuation = "In_CJK_Symbols_and_Punctuation" - case inHiragana = "In_Hiragana" - case inKatakana = "In_Katakana" - case inBopomofo = "In_Bopomofo" - case inHangulCompatibilityJamo = "In_Hangul_Compatibility_Jamo" - case inKanbun = "In_Kanbun" - case inBopomofoExtended = "In_Bopomofo_Extended" - case inCJKStrokes = 
"In_CJK_Strokes" - case inKatakanaPhoneticExtensions = "In_Katakana_Phonetic_Extensions" - case inEnclosedCJKLettersAndMonths = "In_Enclosed_CJK_Letters_and_Months" - case inCJKCompatibility = "In_CJK_Compatibility" - case inCJKUnifiedIdeographsExtensionA = "In_CJK_Unified_Ideographs_Extension_A" - case inYijingHexagramSymbols = "In_Yijing_Hexagram_Symbols" - case inCJKUnifiedIdeographs = "In_CJK_Unified_Ideographs" - case inYiSyllables = "In_Yi_Syllables" - case inYiRadicals = "In_Yi_Radicals" - case inLisu = "In_Lisu" - case inVai = "In_Vai" - case inCyrillicExtendedB = "In_Cyrillic_Extended_B" - case inBamum = "In_Bamum" - case inModifierToneLetters = "In_Modifier_Tone_Letters" - case inLatinExtendedD = "In_Latin_Extended_D" - case inSylotiNagri = "In_Syloti_Nagri" - case inCommonIndicNumberForms = "In_Common_Indic_Number_Forms" - case inPhagsPA = "In_Phags_pa" - case inSaurashtra = "In_Saurashtra" - case inDevanagariExtended = "In_Devanagari_Extended" - case inKayahLi = "In_Kayah_Li" - case inRejang = "In_Rejang" - case inHangulJamoExtendedA = "In_Hangul_Jamo_Extended_A" - case inJavanese = "In_Javanese" - case inMyanmarExtendedB = "In_Myanmar_Extended_B" - case inCham = "In_Cham" - case inMyanmarExtendedA = "In_Myanmar_Extended_A" - case inTaiViet = "In_Tai_Viet" - case inMeeteiMayekExtensions = "In_Meetei_Mayek_Extensions" - case inEthiopicExtendedA = "In_Ethiopic_Extended_A" - case inLatinExtendedE = "In_Latin_Extended_E" - case inCherokeeSupplement = "In_Cherokee_Supplement" - case inMeeteiMayek = "In_Meetei_Mayek" - case inHangulSyllables = "In_Hangul_Syllables" - case inHangulJamoExtendedB = "In_Hangul_Jamo_Extended_B" - case inHighSurrogates = "In_High_Surrogates" - case inHighPrivateUseSurrogates = "In_High_Private_Use_Surrogates" - case inLowSurrogates = "In_Low_Surrogates" - case inPrivateUseArea = "In_Private_Use_Area" - case inCJKCompatibilityIdeographs = "In_CJK_Compatibility_Ideographs" - case inAlphabeticPresentationForms = 
"In_Alphabetic_Presentation_Forms" - case inArabicPresentationFormsA = "In_Arabic_Presentation_Forms_A" - case inVariationSelectors = "In_Variation_Selectors" - case inVerticalForms = "In_Vertical_Forms" - case inCombiningHalfMarks = "In_Combining_Half_Marks" - case inCJKCompatibilityForms = "In_CJK_Compatibility_Forms" - case inSmallFormVariants = "In_Small_Form_Variants" - case inArabicPresentationFormsB = "In_Arabic_Presentation_Forms_B" - case inHalfwidthAndFullwidthForms = "In_Halfwidth_and_Fullwidth_Forms" - case inSpecials = "In_Specials" - case inLinearBSyllabary = "In_Linear_B_Syllabary" - case inLinearBIdeograms = "In_Linear_B_Ideograms" - case inAegeanNumbers = "In_Aegean_Numbers" - case inAncientGreekNumbers = "In_Ancient_Greek_Numbers" - case inAncientSymbols = "In_Ancient_Symbols" - case inPhaistosDisc = "In_Phaistos_Disc" - case inLycian = "In_Lycian" - case inCarian = "In_Carian" - case inCopticEpactNumbers = "In_Coptic_Epact_Numbers" - case inOldItalic = "In_Old_Italic" - case inGothic = "In_Gothic" - case inOldPermic = "In_Old_Permic" - case inUgaritic = "In_Ugaritic" - case inOldPersian = "In_Old_Persian" - case inDeseret = "In_Deseret" - case inShavian = "In_Shavian" - case inOsmanya = "In_Osmanya" - case inOsage = "In_Osage" - case inElbasan = "In_Elbasan" - case inCaucasianAlbanian = "In_Caucasian_Albanian" - case inVithkuqi = "In_Vithkuqi" - case inLinearA = "In_Linear_A" - case inLatinExtendedF = "In_Latin_Extended_F" - case inCypriotSyllabary = "In_Cypriot_Syllabary" - case inImperialAramaic = "In_Imperial_Aramaic" - case inPalmyrene = "In_Palmyrene" - case inNabataean = "In_Nabataean" - case inHatran = "In_Hatran" - case inPhoenician = "In_Phoenician" - case inLydian = "In_Lydian" - case inMeroiticHieroglyphs = "In_Meroitic_Hieroglyphs" - case inMeroiticCursive = "In_Meroitic_Cursive" - case inKharoshthi = "In_Kharoshthi" - case inOldSouthArabian = "In_Old_South_Arabian" - case inOldNorthArabian = "In_Old_North_Arabian" - case inManichaean 
= "In_Manichaean" - case inAvestan = "In_Avestan" - case inInscriptionalParthian = "In_Inscriptional_Parthian" - case inInscriptionalPahlavi = "In_Inscriptional_Pahlavi" - case inPsalterPahlavi = "In_Psalter_Pahlavi" - case inOldTurkic = "In_Old_Turkic" - case inOldHungarian = "In_Old_Hungarian" - case inHanifiRohingya = "In_Hanifi_Rohingya" - case inRumiNumeralSymbols = "In_Rumi_Numeral_Symbols" - case inYezidi = "In_Yezidi" - case inOldSogdian = "In_Old_Sogdian" - case inSogdian = "In_Sogdian" - case inOldUyghur = "In_Old_Uyghur" - case inChorasmian = "In_Chorasmian" - case inElymaic = "In_Elymaic" - case inBrahmi = "In_Brahmi" - case inKaithi = "In_Kaithi" - case inSoraSompeng = "In_Sora_Sompeng" - case inChakma = "In_Chakma" - case inMahajani = "In_Mahajani" - case inSharada = "In_Sharada" - case inSinhalaArchaicNumbers = "In_Sinhala_Archaic_Numbers" - case inKhojki = "In_Khojki" - case inMultani = "In_Multani" - case inKhudawadi = "In_Khudawadi" - case inGrantha = "In_Grantha" - case inNewa = "In_Newa" - case inTirhuta = "In_Tirhuta" - case inSiddham = "In_Siddham" - case inModi = "In_Modi" - case inMongolianSupplement = "In_Mongolian_Supplement" - case inTakri = "In_Takri" - case inAhom = "In_Ahom" - case inDogra = "In_Dogra" - case inWarangCiti = "In_Warang_Citi" - case inDivesAkuru = "In_Dives_Akuru" - case inNandinagari = "In_Nandinagari" - case inZanabazarSquare = "In_Zanabazar_Square" - case inSoyombo = "In_Soyombo" - case inUnifiedCanadianAboriginalSyllabicsExtendedA = "In_Unified_Canadian_Aboriginal_Syllabics_Extended_A" - case inPauCinHau = "In_Pau_Cin_Hau" - case inBhaiksuki = "In_Bhaiksuki" - case inMarchen = "In_Marchen" - case inMasaramGondi = "In_Masaram_Gondi" - case inGunjalaGondi = "In_Gunjala_Gondi" - case inMakasar = "In_Makasar" - case inLisuSupplement = "In_Lisu_Supplement" - case inTamilSupplement = "In_Tamil_Supplement" - case inCuneiform = "In_Cuneiform" - case inCuneiformNumbersandPunctuation = "In_Cuneiform_Numbers_and_Punctuation" - 
case inEarlyDynasticCuneiform = "In_Early_Dynastic_Cuneiform" - case inCyproMinoan = "In_Cypro_Minoan" - case inEgyptianHieroglyphs = "In_Egyptian_Hieroglyphs" - case inEgyptianHieroglyphFormatControls = "In_Egyptian_Hieroglyph_Format_Controls" - case inAnatolianHieroglyphs = "In_Anatolian_Hieroglyphs" - case inBamumSupplement = "In_Bamum_Supplement" - case inMro = "In_Mro" - case inTangsa = "In_Tangsa" - case inBassaVah = "In_Bassa_Vah" - case inPahawhHmong = "In_Pahawh_Hmong" - case inMedefaidrin = "In_Medefaidrin" - case inMiao = "In_Miao" - case inIdeographicSymbolsAndPunctuation = "In_Ideographic_Symbols_and_Punctuation" - case inTangut = "In_Tangut" - case inTangutComponents = "In_Tangut_Components" - case inKhitanSmallScript = "In_Khitan_Small_Script" - case inTangutSupplement = "In_Tangut_Supplement" - case inKanaExtendedB = "In_Kana_Extended_B" - case inKanaSupplement = "In_Kana_Supplement" - case inKanaExtendedA = "In_Kana_Extended_A" - case inSmallKanaExtension = "In_Small_Kana_Extension" - case inNushu = "In_Nushu" - case inDuployan = "In_Duployan" - case inShorthandFormatControls = "In_Shorthand_Format_Controls" - case inZnamennyMusicalNotation = "In_Znamenny_Musical_Notation" - case inByzantineMusicalSymbols = "In_Byzantine_Musical_Symbols" - case inMusicalSymbols = "In_Musical_Symbols" - case inAncientGreekMusicalNotation = "In_Ancient_Greek_Musical_Notation" - case inMayanNumerals = "In_Mayan_Numerals" - case inTaiXuanJingSymbols = "In_Tai_Xuan_Jing_Symbols" - case inCountingRodNumerals = "In_Counting_Rod_Numerals" - case inMathematicalAlphanumericSymbols = "In_Mathematical_Alphanumeric_Symbols" - case inSuttonSignWriting = "In_Sutton_SignWriting" - case inLatinExtendedG = "In_Latin_Extended_G" - case inGlagoliticSupplement = "In_Glagolitic_Supplement" - case inNyiakengPuachueHmong = "In_Nyiakeng_Puachue_Hmong" - case inToto = "In_Toto" - case inWancho = "In_Wancho" - case inEthiopicExtendedB = "In_Ethiopic_Extended_B" - case inMendeKikakui = 
"In_Mende_Kikakui" - case inAdlam = "In_Adlam" - case inIndicSiyaqNumbers = "In_Indic_Siyaq_Numbers" - case inOttomanSiyaqNumbers = "In_Ottoman_Siyaq_Numbers" - case inArabicMathematicalAlphabeticSymbols = "In_Arabic_Mathematical_Alphabetic_Symbols" - case inMahjongTiles = "In_Mahjong_Tiles" - case inDominoTiles = "In_Domino_Tiles" - case inPlayingCards = "In_Playing_Cards" - case inEnclosedAlphanumericSupplement = "In_Enclosed_Alphanumeric_Supplement" - case inEnclosedIdeographicSupplement = "In_Enclosed_Ideographic_Supplement" - case inMiscellaneousSymbolsandPictographs = "In_Miscellaneous_Symbols_and_Pictographs" - case inEmoticons = "In_Emoticons" - case inOrnamentalDingbats = "In_Ornamental_Dingbats" - case inTransportandMapSymbols = "In_Transport_and_Map_Symbols" - case inAlchemicalSymbols = "In_Alchemical_Symbols" - case inGeometricShapesExtended = "In_Geometric_Shapes_Extended" - case inSupplementalArrowsC = "In_Supplemental_Arrows_C" - case inSupplementalSymbolsAndPictographs = "In_Supplemental_Symbols_and_Pictographs" - case inChessSymbols = "In_Chess_Symbols" - case inSymbolsAndPictographsExtendedA = "In_Symbols_and_Pictographs_Extended_A" - case inSymbolsForLegacyComputing = "In_Symbols_for_Legacy_Computing" - case inCJKUnifiedIdeographsExtensionB = "In_CJK_Unified_Ideographs_Extension_B" - case inCJKUnifiedIdeographsExtensionC = "In_CJK_Unified_Ideographs_Extension_C" - case inCJKUnifiedIdeographsExtensionD = "In_CJK_Unified_Ideographs_Extension_D" - case inCJKUnifiedIdeographsExtensionE = "In_CJK_Unified_Ideographs_Extension_E" - case inCJKUnifiedIdeographsExtensionF = "In_CJK_Unified_Ideographs_Extension_F" - case inCJKCompatibilityIdeographsSupplement = "In_CJK_Compatibility_Ideographs_Supplement" - case inCJKUnifiedIdeographsExtensionG = "In_CJK_Unified_Ideographs_Extension_G" - case inTags = "In_Tags" - case inVariationSelectorsSupplement = "In_Variation_Selectors_Supplement" - case inSupplementaryPrivateUseAreaA = 
"In_Supplementary_Private_Use_Area_A" - case inSupplementaryPrivateUseAreaB = "In_Supplementary_Private_Use_Area_B" - case inNoBlock = "In_No_Block" + /// A list of unicode character blocks, including `No_Block`. + /// https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt + public enum Block: String, Hashable, CaseIterable { + /// 0000..007F; Basic Latin + case basicLatin = "Basic_Latin" + /// 0080..00FF; Latin-1 Supplement + case latin1Supplement = "Latin_1_Supplement" + /// 0100..017F; Latin Extended-A + case latinExtendedA = "Latin_Extended_A" + /// 0180..024F; Latin Extended-B + case latinExtendedB = "Latin_Extended_B" + /// 0250..02AF; IPA Extensions + case ipaExtensions = "IPA_Extensions" + /// 02B0..02FF; Spacing Modifier Letters + case spacingModifierLetters = "Spacing_Modifier_Letters" + /// 0300..036F; Combining Diacritical Marks + case combiningDiacriticalMarks = "Combining_Diacritical_Marks" + /// 0370..03FF; Greek and Coptic + case greekAndCoptic = "Greek_and_Coptic" + /// 0400..04FF; Cyrillic + case cyrillic = "Cyrillic" + /// 0500..052F; Cyrillic Supplement + case cyrillicSupplement = "Cyrillic_Supplement" + /// 0530..058F; Armenian + case armenian = "Armenian" + /// 0590..05FF; Hebrew + case hebrew = "Hebrew" + /// 0600..06FF; Arabic + case arabic = "Arabic" + /// 0700..074F; Syriac + case syriac = "Syriac" + /// 0750..077F; Arabic Supplement + case arabicSupplement = "Arabic_Supplement" + /// 0780..07BF; Thaana + case thaana = "Thaana" + /// 07C0..07FF; NKo + case nko = "NKo" + /// 0800..083F; Samaritan + case samaritan = "Samaritan" + /// 0840..085F; Mandaic + case mandaic = "Mandaic" + /// 0860..086F; Syriac Supplement + case syriacSupplement = "Syriac_Supplement" + /// 0870..089F; Arabic Extended-B + case arabicExtendedB = "Arabic_Extended_B" + /// 08A0..08FF; Arabic Extended-A + case arabicExtendedA = "Arabic_Extended_A" + /// 0900..097F; Devanagari + case devanagari = "Devanagari" + /// 0980..09FF; Bengali + case bengali = "Bengali" + /// 
0A00..0A7F; Gurmukhi + case gurmukhi = "Gurmukhi" + /// 0A80..0AFF; Gujarati + case gujarati = "Gujarati" + /// 0B00..0B7F; Oriya + case oriya = "Oriya" + /// 0B80..0BFF; Tamil + case tamil = "Tamil" + /// 0C00..0C7F; Telugu + case telugu = "Telugu" + /// 0C80..0CFF; Kannada + case kannada = "Kannada" + /// 0D00..0D7F; Malayalam + case malayalam = "Malayalam" + /// 0D80..0DFF; Sinhala + case sinhala = "Sinhala" + /// 0E00..0E7F; Thai + case thai = "Thai" + /// 0E80..0EFF; Lao + case lao = "Lao" + /// 0F00..0FFF; Tibetan + case tibetan = "Tibetan" + /// 1000..109F; Myanmar + case myanmar = "Myanmar" + /// 10A0..10FF; Georgian + case georgian = "Georgian" + /// 1100..11FF; Hangul Jamo + case hangulJamo = "Hangul_Jamo" + /// 1200..137F; Ethiopic + case ethiopic = "Ethiopic" + /// 1380..139F; Ethiopic Supplement + case ethiopicSupplement = "Ethiopic_Supplement" + /// 13A0..13FF; Cherokee + case cherokee = "Cherokee" + /// 1400..167F; Unified Canadian Aboriginal Syllabics + case unifiedCanadianAboriginalSyllabics = "Unified_Canadian_Aboriginal_Syllabics" + /// 1680..169F; Ogham + case ogham = "Ogham" + /// 16A0..16FF; Runic + case runic = "Runic" + /// 1700..171F; Tagalog + case tagalog = "Tagalog" + /// 1720..173F; Hanunoo + case hanunoo = "Hanunoo" + /// 1740..175F; Buhid + case buhid = "Buhid" + /// 1760..177F; Tagbanwa + case tagbanwa = "Tagbanwa" + /// 1780..17FF; Khmer + case khmer = "Khmer" + /// 1800..18AF; Mongolian + case mongolian = "Mongolian" + /// 18B0..18FF; Unified Canadian Aboriginal Syllabics Extended + case unifiedCanadianAboriginalSyllabicsExtended = "Unified_Canadian_Aboriginal_Syllabics_Extended" + /// 1900..194F; Limbu + case limbu = "Limbu" + /// 1950..197F; Tai Le + case taiLe = "Tai_Le" + /// 1980..19DF; New Tai Lue + case newTailue = "New_Tai_Lue" + /// 19E0..19FF; Khmer Symbols + case khmerSymbols = "Khmer_Symbols" + /// 1A00..1A1F; Buginese + case buginese = "Buginese" + /// 1A20..1AAF; Tai Tham + case taiTham = "Tai_Tham" + /// 1AB0..1AFF; 
Combining Diacritical Marks Extended + case combiningDiacriticalMarksExtended = "Combining_Diacritical_Marks_Extended" + /// 1B00..1B7F; Balinese + case balinese = "Balinese" + /// 1B80..1BBF; Sundanese + case sundanese = "Sundanese" + /// 1BC0..1BFF; Batak + case batak = "Batak" + /// 1C00..1C4F; Lepcha + case lepcha = "Lepcha" + /// 1C50..1C7F; Ol Chiki + case olChiki = "Ol_Chiki" + /// 1C80..1C8F; Cyrillic Extended-C + case cyrillicExtendedC = "Cyrillic_Extended_C" + /// 1C90..1CBF; Georgian Extended + case georgianExtended = "Georgian_Extended" + /// 1CC0..1CCF; Sundanese Supplement + case sundaneseSupplement = "Sundanese_Supplement" + /// 1CD0..1CFF; Vedic Extensions + case vedicExtensions = "Vedic_Extensions" + /// 1D00..1D7F; Phonetic Extensions + case phoneticExtensions = "Phonetic_Extensions" + /// 1D80..1DBF; Phonetic Extensions Supplement + case phoneticExtensionsSupplement = "Phonetic_Extensions_Supplement" + /// 1DC0..1DFF; Combining Diacritical Marks Supplement + case combiningDiacriticalMarksSupplement = "Combining_Diacritical_Marks_Supplement" + /// 1E00..1EFF; Latin Extended Additional + case latinExtendedAdditional = "Latin_Extended_Additional" + /// 1F00..1FFF; Greek Extended + case greekExtended = "Greek_Extended" + /// 2000..206F; General Punctuation + case generalPunctuation = "General_Punctuation" + /// 2070..209F; Superscripts and Subscripts + case superscriptsAndSubscripts = "Superscripts_and_Subscripts" + /// 20A0..20CF; Currency Symbols + case currencySymbols = "Currency_Symbols" + /// 20D0..20FF; Combining Diacritical Marks for Symbols + case combiningDiacriticalMarksForSymbols = "Combining_Diacritical_Marks_for_Symbols" + /// 2100..214F; Letterlike Symbols + case letterLikeSymbols = "Letterlike_Symbols" + /// 2150..218F; Number Forms + case numberForms = "Number_Forms" + /// 2190..21FF; Arrows + case arrows = "Arrows" + /// 2200..22FF; Mathematical Operators + case mathematicalOperators = "Mathematical_Operators" + /// 2300..23FF; 
Miscellaneous Technical + case miscellaneousTechnical = "Miscellaneous_Technical" + /// 2400..243F; Control Pictures + case controlPictures = "Control_Pictures" + /// 2440..245F; Optical Character Recognition + case opticalCharacterRecognition = "Optical_Character_Recognition" + /// 2460..24FF; Enclosed Alphanumerics + case enclosedAlphanumerics = "Enclosed_Alphanumerics" + /// 2500..257F; Box Drawing + case boxDrawing = "Box_Drawing" + /// 2580..259F; Block Elements + case blockElements = "Block_Elements" + /// 25A0..25FF; Geometric Shapes + case geometricShapes = "Geometric_Shapes" + /// 2600..26FF; Miscellaneous Symbols + case miscellaneousSymbols = "Miscellaneous_Symbols" + /// 2700..27BF; Dingbats + case dingbats = "Dingbats" + /// 27C0..27EF; Miscellaneous Mathematical Symbols-A + case miscellaneousMathematicalSymbolsA = "Miscellaneous_Mathematical_Symbols_A" + /// 27F0..27FF; Supplemental Arrows-A + case supplementalArrowsA = "Supplemental_Arrows_A" + /// 2800..28FF; Braille Patterns + case braillePatterns = "Braille_Patterns" + /// 2900..297F; Supplemental Arrows-B + case supplementalArrowsB = "Supplemental_Arrows_B" + /// 2980..29FF; Miscellaneous Mathematical Symbols-B + case miscellaneousMathematicalSymbolsB = "Miscellaneous_Mathematical_Symbols_B" + /// 2A00..2AFF; Supplemental Mathematical Operators + case supplementalMathematicalOperators = "Supplemental_Mathematical_Operators" + /// 2B00..2BFF; Miscellaneous Symbols and Arrows + case miscellaneousSymbolsAndArrows = "Miscellaneous_Symbols_and_Arrows" + /// 2C00..2C5F; Glagolitic + case glagolitic = "Glagolitic" + /// 2C60..2C7F; Latin Extended-C + case latinExtendedC = "Latin_Extended_C" + /// 2C80..2CFF; Coptic + case coptic = "Coptic" + /// 2D00..2D2F; Georgian Supplement + case georgianSupplement = "Georgian_Supplement" + /// 2D30..2D7F; Tifinagh + case tifinagh = "Tifinagh" + /// 2D80..2DDF; Ethiopic Extended + case ethiopicExtended = "Ethiopic_Extended" + /// 2DE0..2DFF; Cyrillic Extended-A + 
case cyrillicExtendedA = "Cyrillic_Extended_A" + /// 2E00..2E7F; Supplemental Punctuation + case supplementalPunctuation = "Supplemental_Punctuation" + /// 2E80..2EFF; CJK Radicals Supplement + case cjkRadicalsSupplement = "CJK_Radicals_Supplement" + /// 2F00..2FDF; Kangxi Radicals + case kangxiRadicals = "Kangxi_Radicals" + /// 2FF0..2FFF; Ideographic Description Characters + case ideographicDescriptionCharacters = "Ideographic_Description_Characters" + /// 3000..303F; CJK Symbols and Punctuation + case cjkSymbolsAndPunctuation = "CJK_Symbols_and_Punctuation" + /// 3040..309F; Hiragana + case hiragana = "Hiragana" + /// 30A0..30FF; Katakana + case katakana = "Katakana" + /// 3100..312F; Bopomofo + case bopomofo = "Bopomofo" + /// 3130..318F; Hangul Compatibility Jamo + case hangulCompatibilityJamo = "Hangul_Compatibility_Jamo" + /// 3190..319F; Kanbun + case kanbun = "Kanbun" + /// 31A0..31BF; Bopomofo Extended + case bopomofoExtended = "Bopomofo_Extended" + /// 31C0..31EF; CJK Strokes + case cjkStrokes = "CJK_Strokes" + /// 31F0..31FF; Katakana Phonetic Extensions + case katakanaPhoneticExtensions = "Katakana_Phonetic_Extensions" + /// 3200..32FF; Enclosed CJK Letters and Months + case enclosedCJKLettersAndMonths = "Enclosed_CJK_Letters_and_Months" + /// 3300..33FF; CJK Compatibility + case cjkCompatibility = "CJK_Compatibility" + /// 3400..4DBF; CJK Unified Ideographs Extension A + case cjkUnifiedIdeographsExtensionA = "CJK_Unified_Ideographs_Extension_A" + /// 4DC0..4DFF; Yijing Hexagram Symbols + case yijingHexagramSymbols = "Yijing_Hexagram_Symbols" + /// 4E00..9FFF; CJK Unified Ideographs + case cjkUnifiedIdeographs = "CJK_Unified_Ideographs" + /// A000..A48F; Yi Syllables + case yiSyllables = "Yi_Syllables" + /// A490..A4CF; Yi Radicals + case yiRadicals = "Yi_Radicals" + /// A4D0..A4FF; Lisu + case lisu = "Lisu" + /// A500..A63F; Vai + case vai = "Vai" + /// A640..A69F; Cyrillic Extended-B + case cyrillicExtendedB = "Cyrillic_Extended_B" + /// A6A0..A6FF; 
Bamum + case bamum = "Bamum" + /// A700..A71F; Modifier Tone Letters + case modifierToneLetters = "Modifier_Tone_Letters" + /// A720..A7FF; Latin Extended-D + case latinExtendedD = "Latin_Extended_D" + /// A800..A82F; Syloti Nagri + case sylotiNagri = "Syloti_Nagri" + /// A830..A83F; Common Indic Number Forms + case commonIndicNumberForms = "Common_Indic_Number_Forms" + /// A840..A87F; Phags-pa + case phagsPA = "Phags_pa" + /// A880..A8DF; Saurashtra + case saurashtra = "Saurashtra" + /// A8E0..A8FF; Devanagari Extended + case devanagariExtended = "Devanagari_Extended" + /// A900..A92F; Kayah Li + case kayahLi = "Kayah_Li" + /// A930..A95F; Rejang + case rejang = "Rejang" + /// A960..A97F; Hangul Jamo Extended-A + case hangulJamoExtendedA = "Hangul_Jamo_Extended_A" + /// A980..A9DF; Javanese + case javanese = "Javanese" + /// A9E0..A9FF; Myanmar Extended-B + case myanmarExtendedB = "Myanmar_Extended_B" + /// AA00..AA5F; Cham + case cham = "Cham" + /// AA60..AA7F; Myanmar Extended-A + case myanmarExtendedA = "Myanmar_Extended_A" + /// AA80..AADF; Tai Viet + case taiViet = "Tai_Viet" + /// AAE0..AAFF; Meetei Mayek Extensions + case meeteiMayekExtensions = "Meetei_Mayek_Extensions" + /// AB00..AB2F; Ethiopic Extended-A + case ethiopicExtendedA = "Ethiopic_Extended_A" + /// AB30..AB6F; Latin Extended-E + case latinExtendedE = "Latin_Extended_E" + /// AB70..ABBF; Cherokee Supplement + case cherokeeSupplement = "Cherokee_Supplement" + /// ABC0..ABFF; Meetei Mayek + case meeteiMayek = "Meetei_Mayek" + /// AC00..D7AF; Hangul Syllables + case hangulSyllables = "Hangul_Syllables" + /// D7B0..D7FF; Hangul Jamo Extended-B + case hangulJamoExtendedB = "Hangul_Jamo_Extended_B" + /// D800..DB7F; High Surrogates + case highSurrogates = "High_Surrogates" + /// DB80..DBFF; High Private Use Surrogates + case highPrivateUseSurrogates = "High_Private_Use_Surrogates" + /// DC00..DFFF; Low Surrogates + case lowSurrogates = "Low_Surrogates" + /// E000..F8FF; Private Use Area + case 
privateUseArea = "Private_Use_Area" + /// F900..FAFF; CJK Compatibility Ideographs + case cjkCompatibilityIdeographs = "CJK_Compatibility_Ideographs" + /// FB00..FB4F; Alphabetic Presentation Forms + case alphabeticPresentationForms = "Alphabetic_Presentation_Forms" + /// FB50..FDFF; Arabic Presentation Forms-A + case arabicPresentationFormsA = "Arabic_Presentation_Forms_A" + /// FE00..FE0F; Variation Selectors + case variationSelectors = "Variation_Selectors" + /// FE10..FE1F; Vertical Forms + case verticalForms = "Vertical_Forms" + /// FE20..FE2F; Combining Half Marks + case combiningHalfMarks = "Combining_Half_Marks" + /// FE30..FE4F; CJK Compatibility Forms + case cjkcompatibilityForms = "CJK_Compatibility_Forms" + /// FE50..FE6F; Small Form Variants + case smallFormVariants = "Small_Form_Variants" + /// FE70..FEFF; Arabic Presentation Forms-B + case arabicPresentationFormsB = "Arabic_Presentation_Forms_B" + /// FF00..FFEF; Halfwidth and Fullwidth Forms + case halfwidthAndFullwidthForms = "Halfwidth_and_Fullwidth_Forms" + /// FFF0..FFFF; Specials + case specials = "Specials" + /// 10000..1007F; Linear B Syllabary + case linearBSyllabary = "Linear_B_Syllabary" + /// 10080..100FF; Linear B Ideograms + case linearBIdeograms = "Linear_B_Ideograms" + /// 10100..1013F; Aegean Numbers + case aegeanNumbers = "Aegean_Numbers" + /// 10140..1018F; Ancient Greek Numbers + case ancientGreekNumbers = "Ancient_Greek_Numbers" + /// 10190..101CF; Ancient Symbols + case ancientSymbols = "Ancient_Symbols" + /// 101D0..101FF; Phaistos Disc + case phaistosDisc = "Phaistos_Disc" + /// 10280..1029F; Lycian + case lycian = "Lycian" + /// 102A0..102DF; Carian + case carian = "Carian" + /// 102E0..102FF; Coptic Epact Numbers + case copticEpactNumbers = "Coptic_Epact_Numbers" + /// 10300..1032F; Old Italic + case oldItalic = "Old_Italic" + /// 10330..1034F; Gothic + case gothic = "Gothic" + /// 10350..1037F; Old Permic + case oldPermic = "Old_Permic" + /// 10380..1039F; Ugaritic + case 
ugaritic = "Ugaritic" + /// 103A0..103DF; Old Persian + case oldPersian = "Old_Persian" + /// 10400..1044F; Deseret + case deseret = "Deseret" + /// 10450..1047F; Shavian + case shavian = "Shavian" + /// 10480..104AF; Osmanya + case osmanya = "Osmanya" + /// 104B0..104FF; Osage + case osage = "Osage" + /// 10500..1052F; Elbasan + case elbasan = "Elbasan" + /// 10530..1056F; Caucasian Albanian + case caucasianAlbanian = "Caucasian_Albanian" + /// 10570..105BF; Vithkuqi + case vithkuqi = "Vithkuqi" + /// 10600..1077F; Linear A + case linearA = "Linear_A" + /// 10780..107BF; Latin Extended-F + case latinExtendedF = "Latin_Extended_F" + /// 10800..1083F; Cypriot Syllabary + case cypriotSyllabary = "Cypriot_Syllabary" + /// 10840..1085F; Imperial Aramaic + case imperialAramaic = "Imperial_Aramaic" + /// 10860..1087F; Palmyrene + case palmyrene = "Palmyrene" + /// 10880..108AF; Nabataean + case nabataean = "Nabataean" + /// 108E0..108FF; Hatran + case hatran = "Hatran" + /// 10900..1091F; Phoenician + case phoenician = "Phoenician" + /// 10920..1093F; Lydian + case lydian = "Lydian" + /// 10980..1099F; Meroitic Hieroglyphs + case meroiticHieroglyphs = "Meroitic_Hieroglyphs" + /// 109A0..109FF; Meroitic Cursive + case meroiticCursive = "Meroitic_Cursive" + /// 10A00..10A5F; Kharoshthi + case kharoshthi = "Kharoshthi" + /// 10A60..10A7F; Old South Arabian + case oldSouthArabian = "Old_South_Arabian" + /// 10A80..10A9F; Old North Arabian + case oldNorthArabian = "Old_North_Arabian" + /// 10AC0..10AFF; Manichaean + case manichaean = "Manichaean" + /// 10B00..10B3F; Avestan + case avestan = "Avestan" + /// 10B40..10B5F; Inscriptional Parthian + case inscriptionalParthian = "Inscriptional_Parthian" + /// 10B60..10B7F; Inscriptional Pahlavi + case inscriptionalPahlavi = "Inscriptional_Pahlavi" + /// 10B80..10BAF; Psalter Pahlavi + case psalterPahlavi = "Psalter_Pahlavi" + /// 10C00..10C4F; Old Turkic + case oldTurkic = "Old_Turkic" + /// 10C80..10CFF; Old Hungarian + case 
oldHungarian = "Old_Hungarian" + /// 10D00..10D3F; Hanifi Rohingya + case hanifiRohingya = "Hanifi_Rohingya" + /// 10E60..10E7F; Rumi Numeral Symbols + case rumiNumeralSymbols = "Rumi_Numeral_Symbols" + /// 10E80..10EBF; Yezidi + case yezidi = "Yezidi" + /// 10F00..10F2F; Old Sogdian + case oldSogdian = "Old_Sogdian" + /// 10F30..10F6F; Sogdian + case sogdian = "Sogdian" + /// 10F70..10FAF; Old Uyghur + case oldUyghur = "Old_Uyghur" + /// 10FB0..10FDF; Chorasmian + case chorasmian = "Chorasmian" + /// 10FE0..10FFF; Elymaic + case elymaic = "Elymaic" + /// 11000..1107F; Brahmi + case brahmi = "Brahmi" + /// 11080..110CF; Kaithi + case kaithi = "Kaithi" + /// 110D0..110FF; Sora Sompeng + case soraSompeng = "Sora_Sompeng" + /// 11100..1114F; Chakma + case chakma = "Chakma" + /// 11150..1117F; Mahajani + case mahajani = "Mahajani" + /// 11180..111DF; Sharada + case sharada = "Sharada" + /// 111E0..111FF; Sinhala Archaic Numbers + case sinhalaArchaicNumbers = "Sinhala_Archaic_Numbers" + /// 11200..1124F; Khojki + case khojki = "Khojki" + /// 11280..112AF; Multani + case multani = "Multani" + /// 112B0..112FF; Khudawadi + case khudawadi = "Khudawadi" + /// 11300..1137F; Grantha + case grantha = "Grantha" + /// 11400..1147F; Newa + case newa = "Newa" + /// 11480..114DF; Tirhuta + case tirhuta = "Tirhuta" + /// 11580..115FF; Siddham + case siddham = "Siddham" + /// 11600..1165F; Modi + case modi = "Modi" + /// 11660..1167F; Mongolian Supplement + case mongolianSupplement = "Mongolian_Supplement" + /// 11680..116CF; Takri + case takri = "Takri" + /// 11700..1174F; Ahom + case ahom = "Ahom" + /// 11800..1184F; Dogra + case dogra = "Dogra" + /// 118A0..118FF; Warang Citi + case warangCiti = "Warang_Citi" + /// 11900..1195F; Dives Akuru + case divesAkuru = "Dives_Akuru" + /// 119A0..119FF; Nandinagari + case nandinagari = "Nandinagari" + /// 11A00..11A4F; Zanabazar Square + case zanabazarSquare = "Zanabazar_Square" + /// 11A50..11AAF; Soyombo + case soyombo = "Soyombo" + /// 
11AB0..11ABF; Unified Canadian Aboriginal Syllabics Extended-A + case unifiedCanadianAboriginalSyllabicsExtendedA = "Unified_Canadian_Aboriginal_Syllabics_Extended_A" + /// 11AC0..11AFF; Pau Cin Hau + case pauCinHau = "Pau_Cin_Hau" + /// 11C00..11C6F; Bhaiksuki + case bhaiksuki = "Bhaiksuki" + /// 11C70..11CBF; Marchen + case marchen = "Marchen" + /// 11D00..11D5F; Masaram Gondi + case masaramGondi = "Masaram_Gondi" + /// 11D60..11DAF; Gunjala Gondi + case gunjalaGondi = "Gunjala_Gondi" + /// 11EE0..11EFF; Makasar + case makasar = "Makasar" + /// 11FB0..11FBF; Lisu Supplement + case lisuSupplement = "Lisu_Supplement" + /// 11FC0..11FFF; Tamil Supplement + case tamilSupplement = "Tamil_Supplement" + /// 12000..123FF; Cuneiform + case cuneiform = "Cuneiform" + /// 12400..1247F; Cuneiform Numbers and Punctuation + case cuneiformNumbersAndPunctuation = "Cuneiform_Numbers_and_Punctuation" + /// 12480..1254F; Early Dynastic Cuneiform + case earlyDynasticCuneiform = "Early_Dynastic_Cuneiform" + /// 12F90..12FFF; Cypro-Minoan + case cyproMinoan = "Cypro_Minoan" + /// 13000..1342F; Egyptian Hieroglyphs + case egyptianHieroglyphs = "Egyptian_Hieroglyphs" + /// 13430..1343F; Egyptian Hieroglyph Format Controls + case egyptianHieroglyphFormatControls = "Egyptian_Hieroglyph_Format_Controls" + /// 14400..1467F; Anatolian Hieroglyphs + case anatolianHieroglyphs = "Anatolian_Hieroglyphs" + /// 16800..16A3F; Bamum Supplement + case bamumSupplement = "Bamum_Supplement" + /// 16A40..16A6F; Mro + case mro = "Mro" + /// 16A70..16ACF; Tangsa + case tangsa = "Tangsa" + /// 16AD0..16AFF; Bassa Vah + case bassaVah = "Bassa_Vah" + /// 16B00..16B8F; Pahawh Hmong + case pahawhHmong = "Pahawh_Hmong" + /// 16E40..16E9F; Medefaidrin + case medefaidrin = "Medefaidrin" + /// 16F00..16F9F; Miao + case miao = "Miao" + /// 16FE0..16FFF; Ideographic Symbols and Punctuation + case ideographicSymbolsAndPunctuation = "Ideographic_Symbols_and_Punctuation" + /// 17000..187FF; Tangut + case tangut = 
"Tangut" + /// 18800..18AFF; Tangut Components + case tangutComponents = "Tangut_Components" + /// 18B00..18CFF; Khitan Small Script + case khitanSmallScript = "Khitan_Small_Script" + /// 18D00..18D7F; Tangut Supplement + case tangutSupplement = "Tangut_Supplement" + /// 1AFF0..1AFFF; Kana Extended-B + case kanaExtendedB = "Kana_Extended_B" + /// 1B000..1B0FF; Kana Supplement + case kanaSupplement = "Kana_Supplement" + /// 1B100..1B12F; Kana Extended-A + case kanaExtendedA = "Kana_Extended_A" + /// 1B130..1B16F; Small Kana Extension + case smallKanaExtension = "Small_Kana_Extension" + /// 1B170..1B2FF; Nushu + case nushu = "Nushu" + /// 1BC00..1BC9F; Duployan + case duployan = "Duployan" + /// 1BCA0..1BCAF; Shorthand Format Controls + case shorthandFormatControls = "Shorthand_Format_Controls" + /// 1CF00..1CFCF; Znamenny Musical Notation + case znamennyMusicalNotation = "Znamenny_Musical_Notation" + /// 1D000..1D0FF; Byzantine Musical Symbols + case byzantineMusicalSymbols = "Byzantine_Musical_Symbols" + /// 1D100..1D1FF; Musical Symbols + case musicalSymbols = "Musical_Symbols" + /// 1D200..1D24F; Ancient Greek Musical Notation + case ancientGreekMusicalNotation = "Ancient_Greek_Musical_Notation" + /// 1D2E0..1D2FF; Mayan Numerals + case mayanNumerals = "Mayan_Numerals" + /// 1D300..1D35F; Tai Xuan Jing Symbols + case taiXuanJingSymbols = "Tai_Xuan_Jing_Symbols" + /// 1D360..1D37F; Counting Rod Numerals + case countingRodNumerals = "Counting_Rod_Numerals" + /// 1D400..1D7FF; Mathematical Alphanumeric Symbols + case mathematicalAlphanumericSymbols = "Mathematical_Alphanumeric_Symbols" + /// 1D800..1DAAF; Sutton SignWriting + case suttonSignwriting = "Sutton_SignWriting" + /// 1DF00..1DFFF; Latin Extended-G + case latinExtendedG = "Latin_Extended_G" + /// 1E000..1E02F; Glagolitic Supplement + case glagoliticSupplement = "Glagolitic_Supplement" + /// 1E100..1E14F; Nyiakeng Puachue Hmong + case nyiakengPuachueHmong = "Nyiakeng_Puachue_Hmong" + /// 1E290..1E2BF; Toto + 
case toto = "Toto" + /// 1E2C0..1E2FF; Wancho + case wancho = "Wancho" + /// 1E7E0..1E7FF; Ethiopic Extended-B + case ethiopicExtendedB = "Ethiopic_Extended_B" + /// 1E800..1E8DF; Mende Kikakui + case mendeKikakui = "Mende_Kikakui" + /// 1E900..1E95F; Adlam + case adlam = "Adlam" + /// 1EC70..1ECBF; Indic Siyaq Numbers + case indicSiyaqNumbers = "Indic_Siyaq_Numbers" + /// 1ED00..1ED4F; Ottoman Siyaq Numbers + case ottomanSiyaqNumbers = "Ottoman_Siyaq_Numbers" + /// 1EE00..1EEFF; Arabic Mathematical Alphabetic Symbols + case arabicMathematicalAlphabeticSymbols = "Arabic_Mathematical_Alphabetic_Symbols" + /// 1F000..1F02F; Mahjong Tiles + case mahjongTiles = "Mahjong_Tiles" + /// 1F030..1F09F; Domino Tiles + case dominoTiles = "Domino_Tiles" + /// 1F0A0..1F0FF; Playing Cards + case playingCards = "Playing_Cards" + /// 1F100..1F1FF; Enclosed Alphanumeric Supplement + case enclosedAlphanumericSupplement = "Enclosed_Alphanumeric_Supplement" + /// 1F200..1F2FF; Enclosed Ideographic Supplement + case enclosedIdeographicSupplement = "Enclosed_Ideographic_Supplement" + /// 1F300..1F5FF; Miscellaneous Symbols and Pictographs + case miscellaneousSymbolsandPictographs = "Miscellaneous_Symbols_and_Pictographs" + /// 1F600..1F64F; Emoticons + case emoticons = "Emoticons" + /// 1F650..1F67F; Ornamental Dingbats + case ornamentalDingbats = "Ornamental_Dingbats" + /// 1F680..1F6FF; Transport and Map Symbols + case transportAndMapSymbols = "Transport_and_Map_Symbols" + /// 1F700..1F77F; Alchemical Symbols + case alchemicalSymbols = "Alchemical_Symbols" + /// 1F780..1F7FF; Geometric Shapes Extended + case geometricShapesExtended = "Geometric_Shapes_Extended" + /// 1F800..1F8FF; Supplemental Arrows-C + case supplementalArrowsC = "Supplemental_Arrows_C" + /// 1F900..1F9FF; Supplemental Symbols and Pictographs + case supplementalSymbolsAndPictographs = "Supplemental_Symbols_and_Pictographs" + /// 1FA00..1FA6F; Chess Symbols + case chessSymbols = "Chess_Symbols" + /// 1FA70..1FAFF; 
Symbols and Pictographs Extended-A + case symbolsAndPictographsExtendedA = "Symbols_and_Pictographs_Extended_A" + /// 1FB00..1FBFF; Symbols for Legacy Computing + case symbolsForLegacyComputing = "Symbols_for_Legacy_Computing" + /// 20000..2A6DF; CJK Unified Ideographs Extension B + case cjkUnifiedIdeographsExtensionB = "CJK_Unified_Ideographs_Extension_B" + /// 2A700..2B73F; CJK Unified Ideographs Extension C + case cjkUnifiedIdeographsExtensionC = "CJK_Unified_Ideographs_Extension_C" + /// 2B740..2B81F; CJK Unified Ideographs Extension D + case cjkUnifiedIdeographsExtensionD = "CJK_Unified_Ideographs_Extension_D" + /// 2B820..2CEAF; CJK Unified Ideographs Extension E + case cjkUnifiedIdeographsExtensionE = "CJK_Unified_Ideographs_Extension_E" + /// 2CEB0..2EBEF; CJK Unified Ideographs Extension F + case cjkUnifiedIdeographsExtensionF = "CJK_Unified_Ideographs_Extension_F" + /// 2F800..2FA1F; CJK Compatibility Ideographs Supplement + case cjkCompatibilityIdeographsSupplement = "CJK_Compatibility_Ideographs_Supplement" + /// 30000..3134F; CJK Unified Ideographs Extension G + case cjkUnifiedIdeographsExtensionG = "CJK_Unified_Ideographs_Extension_G" + /// E0000..E007F; Tags + case tags = "Tags" + /// E0100..E01EF; Variation Selectors Supplement + case variationSelectorsSupplement = "Variation_Selectors_Supplement" + /// F0000..FFFFF; Supplementary Private Use Area-A + case supplementaryPrivateUseAreaA = "Supplementary_Private_Use_Area_A" + /// 100000..10FFFF; Supplementary Private Use Area-B + case supplementaryPrivateUseAreaB = "Supplementary_Private_Use_Area_B" + /// @missing: 0000..10FFFF; No_Block + case noBlock = "No_Block" + } } extension Character { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index b6bbfd83e..6828245b1 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -512,14 +512,14 @@ extension 
AST.Atom.CharacterProperty { case .mapping(.titlecase, let value): return consume { $0.properties.titlecaseMapping == value } + case .block(let b): + throw Unsupported("TODO: map block: \(b)") + case .posix(let p): return p.generateConsumer(opts) case .pcreSpecial(let s): throw Unsupported("TODO: map PCRE special: \(s)") - - case .onigurumaSpecial(let s): - throw Unsupported("TODO: map Oniguruma special: \(s)") } }() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 975d86b75..640186a2b 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -34,7 +34,7 @@ extension AST.CustomCharacterClass.Member: ExpressibleByExtendedGraphemeClusterL } enum SemanticErrorKind { - case unsupported, invalid + case unsupported, invalid, unchecked } class RegexTests: XCTestCase {} @@ -68,7 +68,7 @@ func parseTest( XCTFail("unexpected error: \(error)", file: file, line: line) return } - if let errorKind = errorKind { + if let errorKind = errorKind, errorKind != .unchecked { do { _ = try parse(input, .semantic, syntax) XCTFail("expected semantically invalid AST", file: file, line: line) @@ -1352,7 +1352,56 @@ extension RegexTests { parseTest(#"\p{isAlphabetic}"#, prop(.binary(.alphabetic))) parseTest(#"\p{isAlpha=isFalse}"#, prop(.binary(.alphabetic, value: false))) - parseTest(#"\p{In_Runic}"#, prop(.onigurumaSpecial(.inRunic)), throwsError: .unsupported) + parseTest(#"\p{In_Runic}"#, prop(.block(.runic)), throwsError: .unsupported) + + parseTest(#"\p{Hebrew}"#, prop(.scriptExtension(.hebrew))) + parseTest(#"\p{Is_Hebrew}"#, prop(.scriptExtension(.hebrew))) + parseTest(#"\p{In_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + parseTest(#"\p{Blk=Is_Hebrew}"#, prop(.block(.hebrew)), throwsError: .unsupported) + + // These are the shorthand properties with an "in" prefix we currently + // recognize. Make sure they don't clash with block properties. 
+ parseTest(#"\p{initialpunctuation}"#, prop(.generalCategory(.initialPunctuation))) + parseTest(#"\p{inscriptionalpahlavi}"#, prop(.scriptExtension(.inscriptionalPahlavi))) + parseTest(#"\p{inscriptionalparthian}"#, prop(.scriptExtension(.inscriptionalParthian))) + parseTest(#"\p{inherited}"#, prop(.scriptExtension(.inherited))) + + // Make sure these are round-trippable. + for s in Unicode.Script.allCases { + parseTest(#"\p{\#(s.rawValue)}"#, prop(.scriptExtension(s))) + parseTest(#"\p{is\#(s.rawValue)}"#, prop(.scriptExtension(s))) + } + for g in Unicode.ExtendedGeneralCategory.allCases { + parseTest(#"\p{\#(g.rawValue)}"#, prop(.generalCategory(g))) + parseTest(#"\p{is\#(g.rawValue)}"#, prop(.generalCategory(g))) + } + for p in Unicode.POSIXProperty.allCases { + parseTest(#"\p{\#(p.rawValue)}"#, prop(.posix(p))) + parseTest(#"\p{is\#(p.rawValue)}"#, prop(.posix(p))) + } + for b in Unicode.BinaryProperty.allCases { + // Some of these are unsupported, so don't check for semantic errors. + parseTest(#"\p{\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) + } + + // Try prefixing each block property with "in" to make sure we don't stomp + // on any other property shorthands. 
+ for b in Unicode.Block.allCases { + parseTest(#"\p{in\#(b.rawValue)}"#, prop(.block(b)), throwsError: .unsupported) + } + + parseTest(#"\p{ASCII}"#, prop(.ascii)) + parseTest(#"\p{isASCII}"#, prop(.ascii)) + parseTest(#"\p{inASCII}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + + parseTest(#"\p{inBasicLatin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{In_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{Blk=Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + parseTest(#"\p{Blk=Is_Basic_Latin}"#, prop(.block(.basicLatin)), throwsError: .unsupported) + + parseTest(#"\p{isAny}"#, prop(.any)) + parseTest(#"\p{isAssigned}"#, prop(.assigned)) parseTest(#"\p{Xan}"#, prop(.pcreSpecial(.alphanumeric)), throwsError: .unsupported) parseTest(#"\p{Xps}"#, prop(.pcreSpecial(.posixSpace)), throwsError: .unsupported) @@ -2610,6 +2659,9 @@ extension RegexTests { diagnosticTest(#"\p{aaa\p{b}}"#, .unknownProperty(key: nil, value: "aaa")) diagnosticTest(#"[[:{:]]"#, .unknownProperty(key: nil, value: "{")) + diagnosticTest(#"\p{Basic_Latin}"#, .unknownProperty(key: nil, value: "Basic_Latin")) + diagnosticTest(#"\p{Blk=In_Basic_Latin}"#, .unrecognizedBlock("In_Basic_Latin")) + // We only filter pattern whitespace, which doesn't include things like // non-breaking spaces. diagnosticTest(#"\p{L\#u{A0}l}"#, .unknownProperty(key: nil, value: "L\u{A0}l")) From 571c34cbf56799a506e1bcd265c1e3729af59f61 Mon Sep 17 00:00:00 2001 From: Hamish Knight Date: Thu, 26 May 2022 20:46:35 +0100 Subject: [PATCH 2/2] Parse Java character properties These correspond to various `is`-prefixed accessors on `java.lang.Character`. For now, parse them, but mark them unsupported. 
--- Sources/_RegexParser/Regex/AST/Atom.swift | 27 ++++++++++++++++++- .../CharacterPropertyClassification.swift | 7 +++-- Sources/_RegexParser/Regex/Parse/Sema.swift | 2 ++ .../_StringProcessing/ConsumerInterface.swift | 3 +++ Tests/RegexTests/ParseTests.swift | 4 +++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift index 680a29dee..992604852 100644 --- a/Sources/_RegexParser/Regex/AST/Atom.swift +++ b/Sources/_RegexParser/Regex/AST/Atom.swift @@ -450,6 +450,9 @@ extension AST.Atom.CharacterProperty { /// Some special properties implemented by PCRE and Oniguruma. case pcreSpecial(PCRESpecialCategory) + /// Some special properties implemented by Java. + case javaSpecial(JavaSpecial) + public enum MapKind: Hashable { case lowercase case uppercase @@ -457,7 +460,6 @@ extension AST.Atom.CharacterProperty { } } - // TODO: erm, separate out or fold into something? splat it in? public enum PCRESpecialCategory: String, Hashable { case alphanumeric = "Xan" case posixSpace = "Xps" @@ -465,6 +467,29 @@ extension AST.Atom.CharacterProperty { case universallyNamed = "Xuc" case perlWord = "Xwd" } + + /// Special Java properties that correspond to the `is`-prefixed methods on + /// `java.lang.Character`, with the `is` prefix replaced by `java`.
+ public enum JavaSpecial: String, Hashable, CaseIterable { + case alphabetic = "javaAlphabetic" + case defined = "javaDefined" + case digit = "javaDigit" + case identifierIgnorable = "javaIdentifierIgnorable" + case ideographic = "javaIdeographic" + case isoControl = "javaISOControl" + case javaIdentifierPart = "javaJavaIdentifierPart" // not a typo, that's actually the name + case javaIdentifierStart = "javaJavaIdentifierStart" // not a typo, that's actually the name + case javaLetter = "javaLetter" + case javaLetterOrDigit = "javaLetterOrDigit" + case lowerCase = "javaLowerCase" + case mirrored = "javaMirrored" + case spaceChar = "javaSpaceChar" + case titleCase = "javaTitleCase" + case unicodeIdentifierPart = "javaUnicodeIdentifierPart" + case unicodeIdentifierStart = "javaUnicodeIdentifierStart" + case upperCase = "javaUpperCase" + case whitespace = "javaWhitespace" + } } extension AST.Atom { diff --git a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift index 2ff162390..fb122e027 100644 --- a/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift +++ b/Sources/_RegexParser/Regex/Parse/CharacterPropertyClassification.swift @@ -764,12 +764,15 @@ extension Source { return .block(block) } - // PCRE special properties. - // TODO: Normalize? + // Special properties from other engines. typealias PCRESpecial = AST.Atom.CharacterProperty.PCRESpecialCategory if let pcreSpecial = PCRESpecial(rawValue: value) { return .pcreSpecial(pcreSpecial) } + typealias JavaSpecial = AST.Atom.CharacterProperty.JavaSpecial + if let javaSpecial = JavaSpecial(rawValue: value) { + return .javaSpecial(javaSpecial) + } // TODO: This should be versioned, and do we want a more lax behavior for // the runtime? 
diff --git a/Sources/_RegexParser/Regex/Parse/Sema.swift b/Sources/_RegexParser/Regex/Parse/Sema.swift index 99654991a..c803087be 100644 --- a/Sources/_RegexParser/Regex/Parse/Sema.swift +++ b/Sources/_RegexParser/Regex/Parse/Sema.swift @@ -175,6 +175,8 @@ extension RegexValidator { throw error(.unsupported("PCRE property"), at: loc) case .block: throw error(.unsupported("Unicode block property"), at: loc) + case .javaSpecial: + throw error(.unsupported("Java property"), at: loc) } } diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index 6828245b1..640fe3c93 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -520,6 +520,9 @@ extension AST.Atom.CharacterProperty { case .pcreSpecial(let s): throw Unsupported("TODO: map PCRE special: \(s)") + + case .javaSpecial(let s): + throw Unsupported("TODO: map Java special: \(s)") } }() diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift index 640186a2b..0daa4a457 100644 --- a/Tests/RegexTests/ParseTests.swift +++ b/Tests/RegexTests/ParseTests.swift @@ -1385,6 +1385,10 @@ extension RegexTests { parseTest(#"\p{is\#(b.rawValue)}"#, prop(.binary(b, value: true)), throwsError: .unchecked) } + for j in AST.Atom.CharacterProperty.JavaSpecial.allCases { + parseTest(#"\p{\#(j.rawValue)}"#, prop(.javaSpecial(j)), throwsError: .unsupported) + } + // Try prefixing each block property with "in" to make sure we don't stomp // on any other property shorthands. for b in Unicode.Block.allCases {