Skip to content

Commit

Permalink
Merge #172
Browse files Browse the repository at this point in the history
172: Add new unic-ucd-common component r=CAD97 a=behnam

There are some core properties that are commonly used in Unicode
algorithms, as well as in applications directly, and are not specific to
any single area. Examples of these properties are `Alphabetic` and
`White_Space`.

Also, there are some *resolved* properties that are used commonly, like
*Numeric* and *Alphanumeric*, which are commonly defined as based on
*General_Category* and *Alphabetic* properties. Since they are common in
applications, it makes sense to provide optimized implementations.

This new component, `unic-ucd-common`, hosts these properties.
  • Loading branch information
bors[bot] committed Oct 5, 2017
2 parents 9108df9 + 652f4c0 commit 35169e0
Show file tree
Hide file tree
Showing 18 changed files with 2,377 additions and 1 deletion.
90 changes: 90 additions & 0 deletions gen/src/writer/ucd/common.rs
@@ -0,0 +1,90 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.


use std::path::Path;
use std::collections::BTreeSet;

use source::ucd::derived_core_properties::DERIVED_CORE_PROPERTIES;
use source::ucd::prop_list::PROP_LIST;
use source::ucd::readme::UNICODE_VERSION;
use source::ucd::unicode_data::UNICODE_DATA;

use writer::utils::tables::ToRangeCharSet;
use writer::common::emit_unicode_version;
use writer::utils::write;


pub fn generate(dir: &Path) {
emit_unicode_version(dir, &UNICODE_VERSION);
emit_alphabetic(dir);
emit_white_space(dir);
emit_alphanumeric(dir);
emit_control(dir);
emit_numeric(dir);
}


/// Writes `alphabetic.rsv` into `dir`, built from `DERIVED_CORE_PROPERTIES.alphabetic`.
fn emit_alphabetic(dir: &Path) {
    let table = DERIVED_CORE_PROPERTIES.alphabetic.to_range_char_set();
    write(dir, "alphabetic.rsv", &table);
}

/// Writes `white_space.rsv` into `dir`, built from `PROP_LIST.white_space`.
fn emit_white_space(dir: &Path) {
    let table = PROP_LIST.white_space.to_range_char_set();
    write(dir, "white_space.rsv", &table);
}

/// Writes `alphanumeric.rsv` into `dir`, built from the set returned by `get_alphanumeric()`.
fn emit_alphanumeric(dir: &Path) {
    let alphanumeric_chars = get_alphanumeric();
    let table = alphanumeric_chars.to_range_char_set();
    write(dir, "alphanumeric.rsv", &table);
}

/// Writes `control.rsv` into `dir`, covering every `UNICODE_DATA` entry whose general category
/// is `Cc`.
fn emit_control(dir: &Path) {
    let control_chars: BTreeSet<char> = UNICODE_DATA
        .entries
        .iter()
        .filter_map(|entry| {
            if entry.general_category == "Cc" {
                Some(entry.character)
            } else {
                None
            }
        })
        .collect();
    let table = control_chars.to_range_char_set();
    write(dir, "control.rsv", &table);
}

/// Writes `numeric.rsv` into `dir`, built from the set returned by `get_numeric()`.
fn emit_numeric(dir: &Path) {
    let numeric_chars = get_numeric();
    let table = numeric_chars.to_range_char_set();
    write(dir, "numeric.rsv", &table);
}


/// Collects the characters of every `UNICODE_DATA` entry whose general category is `Nd`, `Nl`
/// or `No`.
fn get_numeric() -> BTreeSet<char> {
    let mut numeric_chars = BTreeSet::new();
    for entry in UNICODE_DATA.entries.iter() {
        match entry.general_category.as_str() {
            "Nd" | "Nl" | "No" => {
                numeric_chars.insert(entry.character);
            }
            _ => {}
        }
    }
    numeric_chars
}

/// Returns the union of the numeric set (see `get_numeric()`) and
/// `DERIVED_CORE_PROPERTIES.alphabetic`.
///
/// `BTreeSet::union` yields references, so the items are copied out with `cloned()` before
/// being collected into a new owned set.
fn get_alphanumeric() -> BTreeSet<char> {
    get_numeric()
        .union(&DERIVED_CORE_PROPERTIES.alphabetic)
        .cloned()
        .collect()
}
4 changes: 3 additions & 1 deletion gen/src/writer/ucd/mod.rs
Expand Up @@ -13,6 +13,7 @@ mod age;
mod bidi;
mod case;
mod category;
mod common;
mod core;
mod ident;
mod name;
Expand All @@ -28,9 +29,10 @@ pub fn generate() {
bidi::generate(&clean_dir("unic/ucd/bidi/tables"));
case::generate(&clean_dir("unic/ucd/case/tables"));
category::generate(&clean_dir("unic/ucd/category/tables"));
common::generate(&clean_dir("unic/ucd/common/tables"));
core::generate(&clean_dir("unic/ucd/core/tables"));
ident::generate(&clean_dir("unic/ucd/ident/tables"));
name::generate(&clean_dir("unic/ucd/name/tables"));
normal::generate(&clean_dir("unic/ucd/normal/tables"));
ident::generate(&clean_dir("unic/ucd/ident/tables"));
segment::generate(&clean_dir("unic/ucd/segment/tables"));
}
1 change: 1 addition & 0 deletions unic/ucd/Cargo.toml
Expand Up @@ -20,6 +20,7 @@ unic-ucd-age = { path = "age/", version = "0.6.0" }
unic-ucd-bidi = { path = "bidi/", version = "0.6.0" }
unic-ucd-case = { path = "case/", version = "0.6.0" }
unic-ucd-category = { path = "category/", version = "0.6.0" }
unic-ucd-common = { path = "common/", version = "0.6.0" }
unic-ucd-core = { path = "core/", version = "0.6.0" }
unic-ucd-ident = { path = "ident/", version = "0.6.0" }
unic-ucd-name = { path = "name/", version = "0.6.0" }
Expand Down
23 changes: 23 additions & 0 deletions unic/ucd/common/Cargo.toml
@@ -0,0 +1,23 @@
[package]
name = "unic-ucd-common"
version = "0.6.0"
authors = ["The UNIC Project Developers"]
repository = "https://github.com/behnam/rust-unic/"
license = "MIT/Apache-2.0"
description = "UNIC - Unicode Character Database - Common Properties"
keywords = ["text", "unicode", "character-property", "alphabetic", "numeric"]
categories = ["internationalization", "text-processing", "parsing", "rendering"]

# No tests/benches that depend on /data/
exclude = []

[badges]
travis-ci = { repository = "behnam/rust-unic", branch = "master" }

[dependencies]
unic-char-property = { path = "../../char/property/", version = "0.6.0" }
unic-char-range = { path = "../../char/range", version = "0.6.0" }
unic-ucd-core = { path = "../core/", version = "0.6.0" }

[dev-dependencies]
unic-ucd-category = { path = "../category/", version = "0.6.0" }
105 changes: 105 additions & 0 deletions unic/ucd/common/src/alphabetic.rs
@@ -0,0 +1,105 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.


//! Unicode *Alphabetic* Character Property.


char_property! {
/// Represents values of the Unicode character property
/// [*Alphabetic*](http://www.unicode.org/reports/tr44/#Alphabetic).
///
/// The value is `true` for characters that have the *Alphabetic* property, `false` otherwise.
pub struct Alphabetic(bool) {
abbr => "Alpha";
long => "Alphabetic";
human => "Alphabetic";

data_table_path => "../tables/alphabetic.rsv";
}

/// Return `true` for Alphabetic characters, `false` otherwise.
pub fn is_alphabetic(char) -> bool;
}


#[cfg(test)]
mod tests {
    use super::is_alphabetic;

    /// Checks `is_alphabetic` against a table of sample code points and their expected values.
    #[test]
    fn test_values() {
        let cases: &[(char, bool)] = &[
            // ASCII
            ('\u{0020}', false),
            ('\u{0021}', false),
            ('\u{0022}', false),
            ('\u{0030}', false),
            ('\u{0031}', false),
            ('\u{0032}', false),
            ('\u{0040}', false),
            ('\u{0041}', true),
            ('\u{0042}', true),
            ('\u{0060}', false),
            ('\u{0061}', true),
            ('\u{0062}', true),
            ('\u{007e}', false),
            ('\u{007f}', false),
            // Other BMP
            ('\u{061b}', false),
            ('\u{061c}', false),
            ('\u{061d}', false),
            ('\u{200d}', false),
            ('\u{200e}', false),
            ('\u{200f}', false),
            ('\u{2010}', false),
            ('\u{2029}', false),
            ('\u{202a}', false),
            ('\u{202e}', false),
            ('\u{202f}', false),
            // Other Planes
            ('\u{10000}', true),
            ('\u{10001}', true),
            ('\u{20000}', true),
            ('\u{30000}', false),
            ('\u{40000}', false),
            ('\u{50000}', false),
            ('\u{60000}', false),
            ('\u{70000}', false),
            ('\u{80000}', false),
            ('\u{90000}', false),
            ('\u{a0000}', false),
            ('\u{b0000}', false),
            ('\u{c0000}', false),
            ('\u{d0000}', false),
            ('\u{e0000}', false),
            ('\u{efffe}', false),
            ('\u{effff}', false),
            // Private-Use Area
            ('\u{f0000}', false),
            ('\u{f0001}', false),
            ('\u{ffffe}', false),
            ('\u{fffff}', false),
            ('\u{100000}', false),
            ('\u{100001}', false),
            ('\u{10fffe}', false),
            ('\u{10ffff}', false),
        ];

        for &(ch, expected) in cases {
            assert_eq!(
                is_alphabetic(ch),
                expected,
                "is_alphabetic(U+{:04X})",
                ch as u32
            );
        }
    }
}
115 changes: 115 additions & 0 deletions unic/ucd/common/src/alphanumeric.rs
@@ -0,0 +1,115 @@
// Copyright 2017 The UNIC Project Developers.
//
// See the COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.


//! *Alphanumeric* Character Property, equal to `General_Category = Nd | Nl | No or Alphabetic =
//! Yes`.
//!
//! This is equivalent to `Alphabetic = true or Numeric = true`.
//!
//! NOTE: This property is not defined by UCD, but is used commonly enough in Unicode algorithms and
//! applications to provide an optimized implementation.


char_property! {
/// Represents Unicode characters with `General_Category = Nd | Nl | No` or
/// `Alphabetic = Yes`.
///
/// This is equivalent to `Alphabetic = true or Numeric = true`.
///
/// The value is `true` for characters that are alphabetic or have a numeric *General_Category*,
/// `false` otherwise.
pub struct Alphanumeric(bool) {
abbr => "Alphanumeric";
long => "Alphanumeric";
human => "Alphanumeric";

data_table_path => "../tables/alphanumeric.rsv";
}

/// Return `true` for alphanumeric characters, `false` otherwise.
///
/// This is equivalent to `is_alphabetic(char) || is_numeric(char)`.
pub fn is_alphanumeric(char) -> bool;
}


#[cfg(test)]
mod tests {
    use super::is_alphanumeric;

    /// Checks `is_alphanumeric` against a table of sample code points and their expected values.
    #[test]
    fn test_values() {
        let cases: &[(char, bool)] = &[
            // ASCII
            ('\u{0020}', false),
            ('\u{0021}', false),
            ('\u{0022}', false),
            ('\u{0030}', true),
            ('\u{0031}', true),
            ('\u{0032}', true),
            ('\u{0040}', false),
            ('\u{0041}', true),
            ('\u{0042}', true),
            ('\u{0060}', false),
            ('\u{0061}', true),
            ('\u{0062}', true),
            ('\u{007e}', false),
            ('\u{007f}', false),
            // Other BMP
            ('\u{061b}', false),
            ('\u{061c}', false),
            ('\u{061d}', false),
            ('\u{200d}', false),
            ('\u{200e}', false),
            ('\u{200f}', false),
            ('\u{2010}', false),
            ('\u{2029}', false),
            ('\u{202a}', false),
            ('\u{202e}', false),
            ('\u{202f}', false),
            // Other Planes
            ('\u{10000}', true),
            ('\u{10001}', true),
            ('\u{20000}', true),
            ('\u{30000}', false),
            ('\u{40000}', false),
            ('\u{50000}', false),
            ('\u{60000}', false),
            ('\u{70000}', false),
            ('\u{80000}', false),
            ('\u{90000}', false),
            ('\u{a0000}', false),
            ('\u{b0000}', false),
            ('\u{c0000}', false),
            ('\u{d0000}', false),
            ('\u{e0000}', false),
            ('\u{efffe}', false),
            ('\u{effff}', false),
            // Private-Use Area
            ('\u{f0000}', false),
            ('\u{f0001}', false),
            ('\u{ffffe}', false),
            ('\u{fffff}', false),
            ('\u{100000}', false),
            ('\u{100001}', false),
            ('\u{10fffe}', false),
            ('\u{10ffff}', false),
        ];

        for &(ch, expected) in cases {
            assert_eq!(
                is_alphanumeric(ch),
                expected,
                "is_alphanumeric(U+{:04X})",
                ch as u32
            );
        }
    }
}

0 comments on commit 35169e0

Please sign in to comment.