Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
172: Add new unic-ucd-common component r=CAD97 a=behnam There are some core properties that are commonly used in Unicode algorithms, as well as in applications directly, and are not specific to any single area. Examples of these properties are `Alphabetic` and `White_Space`. Also, there are some *resolved* properties that are used commonly, like *Numeric* and *Alphanumeric*, which are commonly defined as based on *General_Category* and *Alphabetic* properties. Since they are common in applications, it makes sense to provide optimized implementations. This new component, `unic-ucd-common`, hosts these properties.
- Loading branch information
Showing
18 changed files
with
2,377 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
|
||
use std::path::Path; | ||
use std::collections::BTreeSet; | ||
|
||
use source::ucd::derived_core_properties::DERIVED_CORE_PROPERTIES; | ||
use source::ucd::prop_list::PROP_LIST; | ||
use source::ucd::readme::UNICODE_VERSION; | ||
use source::ucd::unicode_data::UNICODE_DATA; | ||
|
||
use writer::utils::tables::ToRangeCharSet; | ||
use writer::common::emit_unicode_version; | ||
use writer::utils::write; | ||
|
||
|
||
pub fn generate(dir: &Path) { | ||
emit_unicode_version(dir, &UNICODE_VERSION); | ||
emit_alphabetic(dir); | ||
emit_white_space(dir); | ||
emit_alphanumeric(dir); | ||
emit_control(dir); | ||
emit_numeric(dir); | ||
} | ||
|
||
|
||
fn emit_alphabetic(dir: &Path) { | ||
write( | ||
dir, | ||
"alphabetic.rsv", | ||
&DERIVED_CORE_PROPERTIES.alphabetic.to_range_char_set(), | ||
); | ||
} | ||
|
||
fn emit_white_space(dir: &Path) { | ||
write( | ||
dir, | ||
"white_space.rsv", | ||
&PROP_LIST.white_space.to_range_char_set(), | ||
); | ||
} | ||
|
||
fn emit_alphanumeric(dir: &Path) { | ||
write( | ||
dir, | ||
"alphanumeric.rsv", | ||
&get_alphanumeric().to_range_char_set(), | ||
); | ||
} | ||
|
||
fn emit_control(dir: &Path) { | ||
let set: BTreeSet<char> = UNICODE_DATA | ||
.entries | ||
.iter() | ||
.filter(|x| x.general_category == "Cc") | ||
.map(|x| x.character) | ||
.collect(); | ||
write(dir, "control.rsv", &set.to_range_char_set()); | ||
} | ||
|
||
fn emit_numeric(dir: &Path) { | ||
write(dir, "numeric.rsv", &get_numeric().to_range_char_set()); | ||
} | ||
|
||
|
||
fn get_numeric() -> BTreeSet<char> { | ||
UNICODE_DATA | ||
.entries | ||
.iter() | ||
.filter(|x| { | ||
["Nd", "Nl", "No"].contains(&x.general_category.as_str()) | ||
}) | ||
.map(|x| x.character) | ||
.collect() | ||
} | ||
|
||
fn get_alphanumeric() -> BTreeSet<char> { | ||
get_numeric() | ||
.union(&DERIVED_CORE_PROPERTIES.alphabetic) | ||
.map(|ch| ch.clone()) | ||
.collect() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[package] | ||
name = "unic-ucd-common" | ||
version = "0.6.0" | ||
authors = ["The UNIC Project Developers"] | ||
repository = "https://github.com/behnam/rust-unic/" | ||
license = "MIT/Apache-2.0" | ||
description = "UNIC - Unicode Character Database - Common Properties" | ||
keywords = ["text", "unicode", "character-property", "alphabetic", "numeric"] | ||
categories = ["internationalization", "text-processing", "parsing", "rendering"] | ||
|
||
# No tests/benches that depend on /data/ | ||
exclude = [] | ||
|
||
[badges] | ||
travis-ci = { repository = "behnam/rust-unic", branch = "master" } | ||
|
||
[dependencies] | ||
unic-char-property = { path = "../../char/property/", version = "0.6.0" } | ||
unic-char-range = { path = "../../char/range", version = "0.6.0" } | ||
unic-ucd-core = { path = "../core/", version = "0.6.0" } | ||
|
||
[dev-dependencies] | ||
unic-ucd-category = { path = "../category/", version = "0.6.0" } |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
|
||
//! Unicode *Alphabetic* Character Property. | ||
|
||
|
||
char_property! { | ||
/// Represents values of the Unicode character property | ||
/// [*Alphabetic*](http://www.unicode.org/reports/tr44/#Alphabetic). | ||
/// | ||
/// The value is `true` for characters that have the *Alphabetic* property, `false` otherwise. | ||
pub struct Alphabetic(bool) { | ||
abbr => "Alpha"; | ||
long => "Alphabetic"; | ||
human => "Alphabetic"; | ||
|
||
data_table_path => "../tables/alphabetic.rsv"; | ||
} | ||
|
||
/// Return `true` for Alphabetic characters, `false` otherwise. | ||
pub fn is_alphabetic(char) -> bool; | ||
} | ||
|
||
|
||
#[cfg(test)] | ||
mod tests { | ||
#[test] | ||
fn test_values() { | ||
use super::is_alphabetic; | ||
|
||
// ASCII | ||
assert_eq!(is_alphabetic('\u{0020}'), false); | ||
assert_eq!(is_alphabetic('\u{0021}'), false); | ||
assert_eq!(is_alphabetic('\u{0022}'), false); | ||
|
||
assert_eq!(is_alphabetic('\u{0030}'), false); | ||
assert_eq!(is_alphabetic('\u{0031}'), false); | ||
assert_eq!(is_alphabetic('\u{0032}'), false); | ||
|
||
assert_eq!(is_alphabetic('\u{0040}'), false); | ||
assert_eq!(is_alphabetic('\u{0041}'), true); | ||
assert_eq!(is_alphabetic('\u{0042}'), true); | ||
|
||
assert_eq!(is_alphabetic('\u{0060}'), false); | ||
assert_eq!(is_alphabetic('\u{0061}'), true); | ||
assert_eq!(is_alphabetic('\u{0062}'), true); | ||
|
||
assert_eq!(is_alphabetic('\u{007e}'), false); | ||
assert_eq!(is_alphabetic('\u{007f}'), false); | ||
|
||
// Other BMP | ||
assert_eq!(is_alphabetic('\u{061b}'), false); | ||
assert_eq!(is_alphabetic('\u{061c}'), false); | ||
assert_eq!(is_alphabetic('\u{061d}'), false); | ||
|
||
assert_eq!(is_alphabetic('\u{200d}'), false); | ||
assert_eq!(is_alphabetic('\u{200e}'), false); | ||
assert_eq!(is_alphabetic('\u{200f}'), false); | ||
assert_eq!(is_alphabetic('\u{2010}'), false); | ||
|
||
assert_eq!(is_alphabetic('\u{2029}'), false); | ||
assert_eq!(is_alphabetic('\u{202a}'), false); | ||
assert_eq!(is_alphabetic('\u{202e}'), false); | ||
assert_eq!(is_alphabetic('\u{202f}'), false); | ||
|
||
// Other Planes | ||
assert_eq!(is_alphabetic('\u{10000}'), true); | ||
assert_eq!(is_alphabetic('\u{10001}'), true); | ||
|
||
assert_eq!(is_alphabetic('\u{20000}'), true); | ||
assert_eq!(is_alphabetic('\u{30000}'), false); | ||
assert_eq!(is_alphabetic('\u{40000}'), false); | ||
assert_eq!(is_alphabetic('\u{50000}'), false); | ||
assert_eq!(is_alphabetic('\u{60000}'), false); | ||
assert_eq!(is_alphabetic('\u{70000}'), false); | ||
assert_eq!(is_alphabetic('\u{80000}'), false); | ||
assert_eq!(is_alphabetic('\u{90000}'), false); | ||
assert_eq!(is_alphabetic('\u{a0000}'), false); | ||
assert_eq!(is_alphabetic('\u{b0000}'), false); | ||
assert_eq!(is_alphabetic('\u{c0000}'), false); | ||
assert_eq!(is_alphabetic('\u{d0000}'), false); | ||
assert_eq!(is_alphabetic('\u{e0000}'), false); | ||
|
||
assert_eq!(is_alphabetic('\u{efffe}'), false); | ||
assert_eq!(is_alphabetic('\u{effff}'), false); | ||
|
||
// Private-Use Area | ||
assert_eq!(is_alphabetic('\u{f0000}'), false); | ||
assert_eq!(is_alphabetic('\u{f0001}'), false); | ||
assert_eq!(is_alphabetic('\u{ffffe}'), false); | ||
assert_eq!(is_alphabetic('\u{fffff}'), false); | ||
assert_eq!(is_alphabetic('\u{100000}'), false); | ||
assert_eq!(is_alphabetic('\u{100001}'), false); | ||
assert_eq!(is_alphabetic('\u{10fffe}'), false); | ||
assert_eq!(is_alphabetic('\u{10ffff}'), false); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
// Copyright 2017 The UNIC Project Developers. | ||
// | ||
// See the COPYRIGHT file at the top-level directory of this distribution. | ||
// | ||
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | ||
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | ||
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | ||
// option. This file may not be copied, modified, or distributed | ||
// except according to those terms. | ||
|
||
|
||
//! *Alphanumeric* Character Property, equal to `General_Category = Nd | Nl | No or Alphabetic = | ||
//! Yes`. | ||
//! | ||
//! This is equivalent to `Alphabetic = true or Numeric = true`. | ||
//! | ||
//! NOTE: This property is not defined by UCD, but is used commonly enough in Unicode algorithms and | ||
//! applications to provide an optimized implementation. | ||
|
||
|
||
char_property! { | ||
/// Represents Unicode characters with `General_Category = Nd | Nl | No` or `Alphabetic = Yes`. | ||
/// | ||
/// This is equivalent to `Alphabetic = true or Numeric = true`. | ||
/// | ||
/// The value is `true` for characters that are alphabetic or have a numeric *General_Category*, | ||
/// `false` otherwise. | ||
pub struct Alphanumeric(bool) { | ||
abbr => "Alphanumeric"; | ||
long => "Alphanumeric"; | ||
human => "Alphanumeric"; | ||
|
||
data_table_path => "../tables/alphanumeric.rsv"; | ||
} | ||
|
||
/// Return `true` for alphanumeric characters, `false` otherwise. | ||
/// | ||
/// This is equivalent to `is_alphabetic(char) || is_numeric(char)`. | ||
pub fn is_alphanumeric(char) -> bool; | ||
} | ||
|
||
|
||
#[cfg(test)] | ||
mod tests { | ||
#[test] | ||
fn test_values() { | ||
use super::is_alphanumeric; | ||
|
||
// ASCII | ||
assert_eq!(is_alphanumeric('\u{0020}'), false); | ||
assert_eq!(is_alphanumeric('\u{0021}'), false); | ||
assert_eq!(is_alphanumeric('\u{0022}'), false); | ||
|
||
assert_eq!(is_alphanumeric('\u{0030}'), true); | ||
assert_eq!(is_alphanumeric('\u{0031}'), true); | ||
assert_eq!(is_alphanumeric('\u{0032}'), true); | ||
|
||
assert_eq!(is_alphanumeric('\u{0040}'), false); | ||
assert_eq!(is_alphanumeric('\u{0041}'), true); | ||
assert_eq!(is_alphanumeric('\u{0042}'), true); | ||
|
||
assert_eq!(is_alphanumeric('\u{0060}'), false); | ||
assert_eq!(is_alphanumeric('\u{0061}'), true); | ||
assert_eq!(is_alphanumeric('\u{0062}'), true); | ||
|
||
assert_eq!(is_alphanumeric('\u{007e}'), false); | ||
assert_eq!(is_alphanumeric('\u{007f}'), false); | ||
|
||
// Other BMP | ||
assert_eq!(is_alphanumeric('\u{061b}'), false); | ||
assert_eq!(is_alphanumeric('\u{061c}'), false); | ||
assert_eq!(is_alphanumeric('\u{061d}'), false); | ||
|
||
assert_eq!(is_alphanumeric('\u{200d}'), false); | ||
assert_eq!(is_alphanumeric('\u{200e}'), false); | ||
assert_eq!(is_alphanumeric('\u{200f}'), false); | ||
assert_eq!(is_alphanumeric('\u{2010}'), false); | ||
|
||
assert_eq!(is_alphanumeric('\u{2029}'), false); | ||
assert_eq!(is_alphanumeric('\u{202a}'), false); | ||
assert_eq!(is_alphanumeric('\u{202e}'), false); | ||
assert_eq!(is_alphanumeric('\u{202f}'), false); | ||
|
||
// Other Planes | ||
assert_eq!(is_alphanumeric('\u{10000}'), true); | ||
assert_eq!(is_alphanumeric('\u{10001}'), true); | ||
|
||
assert_eq!(is_alphanumeric('\u{20000}'), true); | ||
assert_eq!(is_alphanumeric('\u{30000}'), false); | ||
assert_eq!(is_alphanumeric('\u{40000}'), false); | ||
assert_eq!(is_alphanumeric('\u{50000}'), false); | ||
assert_eq!(is_alphanumeric('\u{60000}'), false); | ||
assert_eq!(is_alphanumeric('\u{70000}'), false); | ||
assert_eq!(is_alphanumeric('\u{80000}'), false); | ||
assert_eq!(is_alphanumeric('\u{90000}'), false); | ||
assert_eq!(is_alphanumeric('\u{a0000}'), false); | ||
assert_eq!(is_alphanumeric('\u{b0000}'), false); | ||
assert_eq!(is_alphanumeric('\u{c0000}'), false); | ||
assert_eq!(is_alphanumeric('\u{d0000}'), false); | ||
assert_eq!(is_alphanumeric('\u{e0000}'), false); | ||
|
||
assert_eq!(is_alphanumeric('\u{efffe}'), false); | ||
assert_eq!(is_alphanumeric('\u{effff}'), false); | ||
|
||
// Private-Use Area | ||
assert_eq!(is_alphanumeric('\u{f0000}'), false); | ||
assert_eq!(is_alphanumeric('\u{f0001}'), false); | ||
assert_eq!(is_alphanumeric('\u{ffffe}'), false); | ||
assert_eq!(is_alphanumeric('\u{fffff}'), false); | ||
assert_eq!(is_alphanumeric('\u{100000}'), false); | ||
assert_eq!(is_alphanumeric('\u{100001}'), false); | ||
assert_eq!(is_alphanumeric('\u{10fffe}'), false); | ||
assert_eq!(is_alphanumeric('\u{10ffff}'), false); | ||
} | ||
} |
Oops, something went wrong.