Skip to content

Commit

Permalink
Add Unicode Script into built-in rules. (#751)
Browse files Browse the repository at this point in the history
* Add CJK unicode into built-in rules.

Make this change to add `CJK`, `HAN`, `HANGUL`, `KATAKANA`, `HIRAGANA` to built-in rules.

https://unicode.org/faq/han_cjk.html

- Chinese - `HAN`
- Japanese - `KATAKANA`, `HIRAGANA`
- Korean - `HANGUL`

So that we can easily match CJK characters.

* Rewrite pest_meta::UNICODE_PROPERTY_NAMES to pest::unicode::unicode_property_names.

- `pest::unicode::unicode_property_names` will generate the property names by using a macro.
- `pest_meta::UNICODE_PROPERTY_NAMES` has been removed.

* Improve the unicode `char_property_functions` macro to support the original `BY_NAME` values generated by `ucd-generate`.

And export all property names from Unicode (Script).

* Add test for Unicode (Script) built-in rules.

* Restore the `pest_meta::UNICODE_PROPERTY_NAMES` const and mark it as deprecated.

* Update pest/src/unicode/mod.rs
  • Loading branch information
huacnlee committed Dec 23, 2022
1 parent 2c47201 commit 25ba0a2
Show file tree
Hide file tree
Showing 10 changed files with 7,339 additions and 115 deletions.
5 changes: 5 additions & 0 deletions derive/tests/grammar.pest
Expand Up @@ -63,6 +63,11 @@ newline = { NEWLINE+ }
unicode = { XID_START ~ XID_CONTINUE* }
SYMBOL = { "shadows builtin" }

han = { HAN+ }
hangul = { HANGUL+ }
hiragana = { HIRAGANA+ }
arabic = { ARABIC+ }

WHITESPACE = _{ " " }
COMMENT = _{ "$"+ }

Expand Down
4 changes: 2 additions & 2 deletions generator/src/generator.rs
Expand Up @@ -13,9 +13,9 @@ use proc_macro2::TokenStream;
use quote::{ToTokens, TokenStreamExt};
use syn::{self, Generics, Ident};

use pest::unicode::unicode_property_names;
use pest_meta::ast::*;
use pest_meta::optimizer::*;
use pest_meta::UNICODE_PROPERTY_NAMES;

pub fn generate(
name: Ident,
Expand Down Expand Up @@ -153,7 +153,7 @@ fn generate_builtin_rules() -> Vec<(&'static str, TokenStream)> {

let box_ty = box_type();

for property in UNICODE_PROPERTY_NAMES {
for property in unicode_property_names() {
let property_ident: Ident = syn::parse_str(property).unwrap();
// insert manually for #property substitution
builtins.push((property, quote! {
Expand Down
99 changes: 5 additions & 94 deletions meta/src/lib.rs
Expand Up @@ -20,9 +20,11 @@
#[macro_use]
extern crate pest;

use once_cell::sync::Lazy;
use std::fmt::Display;

use pest::error::Error;
use pest::unicode::unicode_property_names;

pub mod ast;
pub mod optimizer;
Expand Down Expand Up @@ -69,97 +71,6 @@ pub fn parse_and_optimize(
}

#[doc(hidden)]
pub static UNICODE_PROPERTY_NAMES: &[&str] = &[
/* BINARY */
"ALPHABETIC",
"BIDI_CONTROL",
"CASE_IGNORABLE",
"CASED",
"CHANGES_WHEN_CASEFOLDED",
"CHANGES_WHEN_CASEMAPPED",
"CHANGES_WHEN_LOWERCASED",
"CHANGES_WHEN_TITLECASED",
"CHANGES_WHEN_UPPERCASED",
"DASH",
"DEFAULT_IGNORABLE_CODE_POINT",
"DEPRECATED",
"DIACRITIC",
"EXTENDER",
"GRAPHEME_BASE",
"GRAPHEME_EXTEND",
"GRAPHEME_LINK",
"HEX_DIGIT",
"HYPHEN",
"IDS_BINARY_OPERATOR",
"IDS_TRINARY_OPERATOR",
"ID_CONTINUE",
"ID_START",
"IDEOGRAPHIC",
"JOIN_CONTROL",
"LOGICAL_ORDER_EXCEPTION",
"LOWERCASE",
"MATH",
"NONCHARACTER_CODE_POINT",
"OTHER_ALPHABETIC",
"OTHER_DEFAULT_IGNORABLE_CODE_POINT",
"OTHER_GRAPHEME_EXTEND",
"OTHER_ID_CONTINUE",
"OTHER_ID_START",
"OTHER_LOWERCASE",
"OTHER_MATH",
"OTHER_UPPERCASE",
"PATTERN_SYNTAX",
"PATTERN_WHITE_SPACE",
"PREPENDED_CONCATENATION_MARK",
"QUOTATION_MARK",
"RADICAL",
"REGIONAL_INDICATOR",
"SENTENCE_TERMINAL",
"SOFT_DOTTED",
"TERMINAL_PUNCTUATION",
"UNIFIED_IDEOGRAPH",
"UPPERCASE",
"VARIATION_SELECTOR",
"WHITE_SPACE",
"XID_CONTINUE",
"XID_START",
/* CATEGORY */
"CASED_LETTER",
"CLOSE_PUNCTUATION",
"CONNECTOR_PUNCTUATION",
"CONTROL",
"CURRENCY_SYMBOL",
"DASH_PUNCTUATION",
"DECIMAL_NUMBER",
"ENCLOSING_MARK",
"FINAL_PUNCTUATION",
"FORMAT",
"INITIAL_PUNCTUATION",
"LETTER",
"LETTER_NUMBER",
"LINE_SEPARATOR",
"LOWERCASE_LETTER",
"MARK",
"MATH_SYMBOL",
"MODIFIER_LETTER",
"MODIFIER_SYMBOL",
"NONSPACING_MARK",
"NUMBER",
"OPEN_PUNCTUATION",
"OTHER",
"OTHER_LETTER",
"OTHER_NUMBER",
"OTHER_PUNCTUATION",
"OTHER_SYMBOL",
"PARAGRAPH_SEPARATOR",
"PRIVATE_USE",
"PUNCTUATION",
"SEPARATOR",
"SPACE_SEPARATOR",
"SPACING_MARK",
"SURROGATE",
"SYMBOL",
"TITLECASE_LETTER",
"UNASSIGNED",
"UPPERCASE_LETTER",
];
#[deprecated(note = "use `pest::unicode::unicode_property_names` instead")]
pub static UNICODE_PROPERTY_NAMES: Lazy<Vec<&str>> =
Lazy::new(|| unicode_property_names().collect::<Vec<_>>());
4 changes: 2 additions & 2 deletions meta/src/validator.rs
Expand Up @@ -15,10 +15,10 @@ use std::collections::{HashMap, HashSet};

use pest::error::{Error, ErrorVariant, InputLocation};
use pest::iterators::Pairs;
use pest::unicode::unicode_property_names;
use pest::Span;

use crate::parser::{ParserExpr, ParserNode, ParserRule, Rule};
use crate::UNICODE_PROPERTY_NAMES;

static RUST_KEYWORDS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
[
Expand Down Expand Up @@ -66,7 +66,7 @@ static BUILTINS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
]
.iter()
.cloned()
.chain(UNICODE_PROPERTY_NAMES.iter().cloned())
.chain(unicode_property_names())
.collect::<HashSet<&str>>()
});

Expand Down
2 changes: 1 addition & 1 deletion pest/src/unicode/binary.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC),
Expand Down
2 changes: 1 addition & 1 deletion pest/src/unicode/category.rs
Expand Up @@ -4,7 +4,7 @@
//
// Unicode version: 15.0.0.
//
// ucd-generate 0.2.13 is available on crates.io.
// ucd-generate 0.2.15 is available on crates.io.

pub const BY_NAME: &'static [(&'static str, &'static ::ucd_trie::TrieSet)] = &[
("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION),
Expand Down

0 comments on commit 25ba0a2

Please sign in to comment.