/
normalizer.cr
158 lines (139 loc) · 4.27 KB
/
normalizer.cr
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# __Normalization__
#
# Unicode normalization (NFC, NFD, NFKC, NFKD and NFKC_Casefold) backed by ICU's `unorm2` API.
#
# __Usage__
# ```
# str = "À"
# str.bytes # => [65, 204, 128]
# norm = ICU::NFCNormalizer.new
# norm.normalized?(str) # => false
# norm.normalized_quick?(str) # => Maybe
# norm.normalize(str).bytes # => [195, 128]
# ```
#
# __See also__
# - [reference implementation](http://icu-project.org/apiref/icu4c/unorm2_8h.html)
# - [user guide](http://userguide.icu-project.org/transforms/normalization)
# - [unit tests](https://github.com/olbat/icu.cr/blob/master/spec/normalizer_spec.cr)
class ICU::Normalizer
  alias Mode = LibICU::UNormalization2Mode
  alias CheckResult = LibICU::UNormalizationCheckResult
  alias Type = NamedTuple(name: String, mode: Mode)

  # Maps each supported normalization form to the data name and mode handed to
  # `unorm2_get_instance`.
  #
  # The decomposing forms intentionally reuse the data name of their composing
  # counterpart: ICU exposes NFD as ("nfc", Decompose) and NFKD as
  # ("nfkc", Decompose). These entries are not typos.
  TYPES = {
    NFC:    {name: "nfc", mode: Mode::Compose},
    NFD:    {name: "nfc", mode: Mode::Decompose},
    NFKC:   {name: "nfkc", mode: Mode::Compose},
    NFKD:   {name: "nfkc", mode: Mode::Decompose},
    NFKCCF: {name: "nfkc_cf", mode: Mode::Compose},
  }

  @unorm : LibICU::UNormalizer2
  @type : Type

  # Creates a new normalizer for the given
  # [normalization form](http://www.unicode.org/unicode/reports/tr15/).
  #
  # *type* must be one of the keys of `TYPES` (`:NFC`, `:NFD`, `:NFKC`,
  # `:NFKD`, `:NFKCCF`); any other symbol raises `ICU::Error`.
  def initialize(type : Symbol)
    @type = TYPES[type]? || raise ICU::Error.new("unknown type #{type}")

    ustatus = LibICU::UErrorCode::UZeroError
    @unorm = LibICU.unorm2_get_instance(nil, @type[:name], @type[:mode], pointerof(ustatus))
    ICU.check_error!(ustatus)
  end

  def finalize
    # Nothing to release: the instances returned by unorm2_getInstance are
    # singletons owned by ICU and must not be freed by the caller.
  end

  # Normalizes some text
  #
  # ```
  # str = "A\u0300"
  # str.bytes                                   # => [65, 204, 128]
  # ICU::NFCNormalizer.new.normalize(str).bytes # => [195, 128]
  # ```
  def normalize(text : String) : String
    src = text.to_uchars
    # Lengths handed to ICU are counted in UTF-16 code units, so use the size of
    # the converted buffer rather than `text.size` (a codepoint count): the two
    # differ as soon as the text contains characters outside the BMP.
    #
    # Twice the source length is only a first guess for the destination.
    dest = ICU::UChars.new(src.size * 2)
    ustatus = LibICU::UErrorCode::UZeroError
    size = LibICU.unorm2_normalize(@unorm, src, src.size, dest, dest.size, pointerof(ustatus))

    if size > dest.size
      # The guess was too small (compatibility decompositions can expand far
      # more than 2x). ICU reports the length it needs, so retry once with a
      # buffer of exactly that size and a fresh status.
      dest = ICU::UChars.new(size)
      ustatus = LibICU::UErrorCode::UZeroError
      size = LibICU.unorm2_normalize(@unorm, src, src.size, dest, dest.size, pointerof(ustatus))
    end

    ICU.check_error!(ustatus)
    dest.to_s(size)
  end

  # Tests if the string is normalized
  #
  # ```
  # ICU::NFCNormalizer.new.normalized?("A\u0300") # => false
  # ```
  #
  # (see also: `#normalized_quick?`)
  def normalized?(text : String) : Bool
    src = text.to_uchars
    ustatus = LibICU::UErrorCode::UZeroError
    # Length is in UTF-16 code units, hence `src.size` and not `text.size`.
    ret = LibICU.unorm2_is_normalized(@unorm, src, src.size, pointerof(ustatus))
    ICU.check_error!(ustatus)
    ret != 0
  end

  # Tests if the string is normalized (faster but less accurate than `#normalized?`)
  #
  # ```
  # ICU::NFCNormalizer.new.normalized_quick?("A\u0300") # => Maybe
  # ```
  def normalized_quick?(text : String) : CheckResult
    src = text.to_uchars
    ustatus = LibICU::UErrorCode::UZeroError
    # Length is in UTF-16 code units, hence `src.size` and not `text.size`.
    ret = LibICU.unorm2_quick_check(@unorm, src, src.size, pointerof(ustatus))
    ICU.check_error!(ustatus)
    ret.as(CheckResult)
  end

  # Tests if the character is normalization-inert
  #
  # ```
  # norm = ICU::NFCNormalizer.new
  # norm.inert?('\u00C0') # => false
  # norm.inert?('A')      # => true
  # ```
  def inert?(chr : Char) : Bool
    LibICU.unorm2_is_inert(@unorm, chr.to_uchar) != 0
  end

  # Gets the decomposition mapping of a character
  #
  # ```
  # ICU::NFCNormalizer.new.decomposition('\u00C0').bytes # => [65, 204, 128]
  # ```
  def decomposition(chr : Char) : String
    # Eight code units cover common mappings; longer ones are handled below.
    dec = ICU::UChars.new(8)
    ustatus = LibICU::UErrorCode::UZeroError
    size = LibICU.unorm2_get_decomposition(@unorm, chr.to_uchar, dec, dec.size, pointerof(ustatus))

    if size > dec.size
      # Mapping longer than the initial buffer: retry with the length ICU
      # reported and a fresh status.
      dec = ICU::UChars.new(size)
      ustatus = LibICU::UErrorCode::UZeroError
      size = LibICU.unorm2_get_decomposition(@unorm, chr.to_uchar, dec, dec.size, pointerof(ustatus))
    end

    ICU.check_error!(ustatus)
    # NOTE(review): ICU documents a negative return value when the character has
    # no decomposition mapping; that value is still passed through to `to_s`
    # unchanged - confirm how `ICU::UChars#to_s` treats a negative size.
    dec.to_s(size)
  end
end
# NFC Normalizer (see `ICU::Normalizer`)
class ICU::NFCNormalizer < ICU::Normalizer
  # Builds a normalizer using the `:NFC` entry of `ICU::Normalizer::TYPES`.
  def initialize
    super :NFC
  end
end
# NFD Normalizer (see `ICU::Normalizer`)
class ICU::NFDNormalizer < ICU::Normalizer
  # Builds a normalizer using the `:NFD` entry of `ICU::Normalizer::TYPES`.
  def initialize
    super :NFD
  end
end
# NFKC Normalizer (see `ICU::Normalizer`)
class ICU::NFKCNormalizer < ICU::Normalizer
  # Builds a normalizer using the `:NFKC` entry of `ICU::Normalizer::TYPES`.
  def initialize
    super :NFKC
  end
end
# NFKD Normalizer (see `ICU::Normalizer`)
class ICU::NFKDNormalizer < ICU::Normalizer
  # Builds a normalizer using the `:NFKD` entry of `ICU::Normalizer::TYPES`.
  def initialize
    super :NFKD
  end
end
# NFKC_Casefold (NFKCCF) Normalizer (see `ICU::Normalizer`)
class ICU::NFKCCFNormalizer < ICU::Normalizer
  # Builds a normalizer using the `:NFKCCF` entry of `ICU::Normalizer::TYPES`.
  def initialize
    super :NFKCCF
  end
end