/
batUChar.ml
228 lines (192 loc) · 5.95 KB
/
batUChar.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
(* $Id: uChar.ml,v 1.4 2004/09/04 16:07:38 yori Exp $ *)
(* Copyright 2002, 2003 Yamagata Yoriyuki. distributed with LGPL *)
(* TODO: Check -- this is actually part of a package distributed with LGPL + linking exception *)
open CamomileLibrary
open UChar
(* Character-property lookups for this module, obtained by applying
   [UCharInfo.Make] to [CamomileDefaultConfig].  The rest of the file
   uses [Info.general_category] and [Info.script]. *)
module Info = UCharInfo.Make(CamomileDefaultConfig)
(* Integer code of the backslash character, as computed by [uint_code].
   [print] compares against it to decide when to emit an escaped
   backslash. *)
let backslash =
  let bs = of_char '\\' in
  uint_code bs
(* The ten decimal digit characters, converted once when the module is
   initialised.  [of_digit] returns these values directly. *)
let zero  = of_char '0'
let one   = of_char '1'
let two   = of_char '2'
let three = of_char '3'
let four  = of_char '4'
let five  = of_char '5'
let six   = of_char '6'
let seven = of_char '7'
let eight = of_char '8'
let nine  = of_char '9'
(* [to_char] is a plain alias for [char_of], a name brought into scope by
   the opens at the top of the file (presumably Camomile's [UChar.char_of]
   -- see its documentation for behaviour on characters that do not fit
   in a [char]). *)
let to_char = char_of
(* [to_int] is a plain alias for [int_of], likewise taken from the opened
   modules (presumably Camomile's [UChar.int_of]). *)
let to_int = int_of
(* [is_lowercase c] is [true] exactly when [Info.general_category c] is
   [`Ll] (Letter, Lowercase), and [false] for every other category. *)
let is_lowercase c =
  Info.general_category c = `Ll
(* [is_uppercase c] is [true] exactly when [Info.general_category c] is
   [`Lu] (Letter, Uppercase), and [false] for every other category. *)
let is_uppercase c =
  Info.general_category c = `Lu
(* [is_whitespace c] holds when the code of [c] is one of the decimal
   values 9 (TAB), 10 (LF), 12 (FF), 13 (CR), 26 (SUB) or 32 (SPACE), or
   when the general category of [c] is a separator: [`Zs], [`Zl] or [`Zp].
   The general category is only consulted when the code is not in that
   list. *)
let is_whitespace c =
  match uint_code c with
  | 9 | 10 | 12 | 13 | 26 | 32 -> true
  | _ ->
    (match Info.general_category c with
     | `Zs | `Zl | `Zp -> true
     | _ -> false)
(* [is_newline c] holds when the code of [c] is 10 (LF) or 13 (CR), or
   when the general category of [c] is [`Zl] (Separator, Line).  The
   general category is only consulted for other codes. *)
let is_newline c =
  match uint_code c with
  | 10 | 13 -> true
  | _ ->
    (match Info.general_category c with
     | `Zl -> true
     | _ -> false)
(* A degenerate Unicode text holding exactly one character: the text is
   the character itself, its length is always 1, and 0 is the only
   position that can be read.  It is used below as the text argument of
   [CaseMap.Make]. *)
module Text : UnicodeString.Type with type t = t =
struct
  type t = uchar

  (* Reading is only possible at position 0. *)
  let get x pos =
    if pos = 0 then x else invalid_arg "UChar.Text.get"

  (* Only a text of length 1 can be built; [f] is asked for position 0. *)
  let init n f =
    if n = 1 then f 0 else invalid_arg "UChar.Text.init"

  let length _ = 1

  (* Indices are plain integers. *)
  type index = int

  let look x i =
    if i = 0 then x else invalid_arg "UChar.Text.look"

  (* Index arithmetic is ordinary integer arithmetic; range checking is
     left to [out_of_range], [get] and [look]. *)
  let nth _ pos = pos
  let next _ i = i + 1
  let prev _ i = i - 1
  let out_of_range _ i = i <> 0

  (* Iterating visits the single character once. *)
  let iter f x = f x

  let compare : t -> t -> int = compare

  let first _ = 0
  let last _ = 0
  let move _ i delta = i + delta
  let compare_index _ i j = i - j

  (* A buffer that can hold at most one character. *)
  module Buf =
  struct
    type buf = uchar option ref

    (* The size hint is ignored. *)
    let create _ = ref None

    (* An empty buffer has no contents to return. *)
    let contents b =
      match !b with
      | Some x -> x
      | None -> invalid_arg "UChar.Text.Buf.contents"

    let clear b = b := None
    let reset = clear

    (* Adding to a buffer that is already full is an error. *)
    let add_char b x =
      match !b with
      | None -> b := Some x
      | Some _ -> invalid_arg "UChar.Text.Buf.add_char"

    (* A text is a single character, so adding a text is adding a char. *)
    let add_string = add_char

    (* Appending an empty buffer is a no-op; otherwise this behaves like
       [add_char] with the character held by the source buffer. *)
    let add_buffer dst src =
      match !src with
      | Some x -> add_char dst x
      | None -> ()
  end
end
(* Case-mapping operations, obtained by applying [CaseMap.Make] to
   [CamomileDefaultConfig] and to the single-character [Text] module. *)
module Case = CaseMap.Make(CamomileDefaultConfig)(Text)
(* [lowercase x] is [Case.lowercase x].
   NOTE(review): written with an explicit argument rather than as an
   alias -- presumably so that any optional parameter of
   [Case.lowercase] is defaulted here; confirm against the CaseMap
   signature before making this point-free. *)
let lowercase x = Case.lowercase x
(* [uppercase x] is [Case.uppercase x]; same remark as for [lowercase]. *)
let uppercase x = Case.uppercase x
(* [icompare] is a direct alias for [Case.compare_caseless].
   NOTE(review): as a bare alias it keeps exactly the parameters of
   [Case.compare_caseless], optional ones included if there are any. *)
let icompare = Case.compare_caseless
(* Bundles the character type with [icompare] as its [compare].
   NOTE(review): the shape (a type [t] plus [compare]) suggests it is
   meant as an argument for ordered-type functors -- confirm with the
   interface file and callers. *)
module IUChar = struct
type t = uchar
let compare = icompare
end
(* [print out c] writes a printable form of [c] to [out]:
   - the backslash character is written as two backslashes;
   - any other character whose code is below 0x80 is written as the
     one-character UTF-8 string built from [c];
   - otherwise the code is written as an escape: backslash, lowercase u
     and four uppercase hex digits when the bits above the low 16 are
     all zero, else backslash, uppercase U and eight uppercase hex digits
     (high 16-bit half first). *)
let print out c =
  let cp = uint_code c in
  if cp = backslash then
    BatInnerIO.nwrite out "\\\\"
  else if cp < 0x80 then
    (* Note: [UTF8.t] is [string] here, not [ExtUTF8.t]. *)
    BatInnerIO.nwrite out (UTF8.init 1 (fun _ -> c))
  else
    match cp lsr 16, cp land 0xffff with
    | 0, low ->
      BatInnerIO.Printf.fprintf out "\\u%04X" low
    | high, low ->
      BatInnerIO.Printf.fprintf out "\\U%04X%04X" high low
(* [t_printer paren out c] writes to [out] an expression-like rendering
   of [c]: the text UChar.of_char followed by a character literal when
   the code of [c] lies in 0..255, and the text UChar.of_int followed by
   the code in lowercase hex (at least four digits) otherwise.  When
   [paren] is true the output is wrapped in parentheses.  The opening
   parenthesis is written before the code of [c] is computed. *)
let t_printer paren out c =
  (* Writes a bracket character only when parentheses were requested. *)
  let bracket ch = if paren then BatInnerIO.write out ch in
  bracket '(';
  let cp = code c in
  if cp < 0 || cp > 255 then
    BatInnerIO.Printf.fprintf out "UChar.of_int 0x%04x" cp
  else
    BatInnerIO.Printf.fprintf out "UChar.of_char %C" (Char.chr cp);
  bracket ')'
(* [of_digit n] returns the precomputed digit character for [n] when
   [n] is in 0..9 ([zero] for 0 up to [nine] for 9).
   Raises [Invalid_argument] with the message UChar.of_digit for any
   other integer. *)
let of_digit =
  (* Lookup table indexed by the digit value; private to this closure. *)
  let digits =
    [| zero; one; two; three; four; five; six; seven; eight; nine |]
  in
  fun n ->
    if 0 <= n && n <= 9 then digits.(n)
    else invalid_arg "UChar.of_digit"
(* Script tags for characters, as a closed polymorphic variant.
   NOTE(review): presumably this mirrors the result type of
   [Info.script], which is re-exported as [script] at the end of this
   file -- confirm against Camomile's UCharInfo signature. *)
type script =
[ `Arabic
| `Armenian
| `Bengali
| `Bopomofo
| `Buhid
| `Canadian_Aboriginal
| `Cherokee
| `Common
| `Cyrillic
| `Deseret
| `Devanagari
| `Ethiopic
| `Georgian
| `Gothic
| `Greek
| `Gujarati
| `Gurmukhi
| `Han
| `Hangul
| `Hanunoo
| `Hebrew
| `Hiragana
| `Inherited
| `Kannada
| `Katakana
| `Khmer
| `Lao
| `Latin
| `Malayalam
| `Mongolian
| `Myanmar
| `Ogham
| `Old_Italic
| `Oriya
| `Runic
| `Sinhala
| `Syriac
| `Tagalog
| `Tagbanwa
| `Tamil
| `Telugu
| `Thaana
| `Thai
| `Tibetan
| `Yi ]
(* General-category tags for characters, as a closed polymorphic
   variant; each tag is the usual two-letter abbreviation, spelled out
   in the comment next to it.
   NOTE(review): presumably this mirrors the result type of
   [Info.general_category], which is re-exported as [category] at the
   end of this file -- confirm against Camomile's UCharInfo signature. *)
type category =
[ `Lu (** Letter, Uppercase *)
| `Ll (** Letter, Lowercase *)
| `Lt (** Letter, Titlecase *)
| `Mn (** Mark, Non-Spacing *)
| `Mc (** Mark, Spacing Combining *)
| `Me (** Mark, Enclosing *)
| `Nd (** Number, Decimal Digit *)
| `Nl (** Number, Letter *)
| `No (** Number, Other *)
| `Zs (** Separator, Space *)
| `Zl (** Separator, Line *)
| `Zp (** Separator, Paragraph *)
| `Cc (** Other, Control *)
| `Cf (** Other, Format *)
| `Cs (** Other, Surrogate *)
| `Co (** Other, Private Use *)
| `Cn (** Other, Not Assigned *)
| `Lm (** Letter, Modifier *)
| `Lo (** Letter, Other *)
| `Pc (** Punctuation, Connector *)
| `Pd (** Punctuation, Dash *)
| `Ps (** Punctuation, Open *)
| `Pe (** Punctuation, Close *)
| `Pi (** Punctuation, Initial quote *)
| `Pf (** Punctuation, Final quote *)
| `Po (** Punctuation, Other *)
| `Sm (** Symbol, Math *)
| `Sc (** Symbol, Currency *)
| `Sk (** Symbol, Modifier *)
| `So (** Symbol, Other *) ]
(* [script] is a direct alias for [Info.script]. *)
let script = Info.script
(* [category] is a direct alias for [Info.general_category], the same
   lookup used by [is_lowercase], [is_uppercase], [is_whitespace] and
   [is_newline] above. *)
let category = Info.general_category