Permalink
Browse files

deps: ICU 60 bump

- Update to released ICU 60.1, including:
  - CLDR 32 (many new languages and data improvements)
  - Unicode 10 (8,518 new characters, including four new scripts,
    7,494 new Han characters, and 56 new emoji characters)
  - UTF-8 malformed bytes now handled according to W3C/WHATWG spec

Fixes: #15540
PR-URL: #16876
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Michael Dawson <michael_dawson@ca.ibm.com>
  • Loading branch information...
srl295 committed Sep 21, 2017
1 parent 3b3ceaf commit 44d3e17985befbd45457d5ad7f0a0387849e1b2f
Showing with 23,876 additions and 10,365 deletions.
  1. +1 −1 LICENSE
  2. +2 −2 configure
  3. +1 −1 deps/icu-small/LICENSE
  4. +2 −2 deps/icu-small/README-SMALL-ICU.txt
  5. +61 −47 deps/icu-small/source/common/bmpset.cpp
  6. +8 −7 deps/icu-small/source/common/bmpset.h
  7. +11 −18 deps/icu-small/source/common/brkeng.cpp
  8. +4 −9 deps/icu-small/source/common/brkeng.h
  9. +18 −5 deps/icu-small/source/common/brkiter.cpp
  10. +123 −0 deps/icu-small/source/common/bytesinkutil.cpp
  11. +53 −0 deps/icu-small/source/common/bytesinkutil.h
  12. +6 −0 deps/icu-small/source/common/bytestream.cpp
  13. +1 −1 deps/icu-small/source/common/caniter.cpp
  14. +8 −4 deps/icu-small/source/common/cmemory.h
  15. +17 −38 deps/icu-small/source/common/dictbe.cpp
  16. +10 −12 deps/icu-small/source/common/dictbe.h
  17. +515 −85 deps/icu-small/source/common/edits.cpp
  18. +1 −1 deps/icu-small/source/common/filteredbrk.cpp
  19. +73 −0 deps/icu-small/source/common/filterednormalizer2.cpp
  20. +34 −0 deps/icu-small/source/common/hash.h
  21. +1 −1 deps/icu-small/source/common/listformatter.cpp
  22. +2 −2 deps/icu-small/source/common/loadednormalizer2impl.cpp
  23. +3 −3 deps/icu-small/source/common/locavailable.cpp
  24. +5 −3 deps/icu-small/source/common/locdispnames.cpp
  25. +3 −2 deps/icu-small/source/common/locdspnm.cpp
  26. +2 −3 deps/icu-small/source/common/loclikely.cpp
  27. +51 −19 deps/icu-small/source/common/locmap.cpp
  28. +937 −929 deps/icu-small/source/common/norm2_nfc_data.h
  29. +40 −14 deps/icu-small/source/common/norm2allmodes.h
  30. +54 −12 deps/icu-small/source/common/normalizer2.cpp
  31. +1,129 −547 deps/icu-small/source/common/normalizer2impl.cpp
  32. +233 −71 deps/icu-small/source/common/normalizer2impl.h
  33. +884 −840 deps/icu-small/source/common/propname_data.h
  34. +56 −81 deps/icu-small/source/common/putil.cpp
  35. +12 −21 deps/icu-small/source/common/putilimp.h
  36. +240 −736 deps/icu-small/source/common/rbbi.cpp
  37. +630 −0 deps/icu-small/source/common/rbbi_cache.cpp
  38. +203 −0 deps/icu-small/source/common/rbbi_cache.h
  39. 0 deps/icu-small/source/common/rbbicst.pl
  40. +33 −43 deps/icu-small/source/common/rbbidata.cpp
  41. +10 −7 deps/icu-small/source/common/rbbidata.h
  42. +49 −17 deps/icu-small/source/common/rbbirb.cpp
  43. +6 −0 deps/icu-small/source/common/rbbirb.h
  44. +5 −0 deps/icu-small/source/common/rbbiscan.cpp
  45. +29 −63 deps/icu-small/source/common/rbbisetb.cpp
  46. +9 −4 deps/icu-small/source/common/rbbisetb.h
  47. +356 −358 deps/icu-small/source/common/ubidi_props_data.h
  48. +2 −0 deps/icu-small/source/common/ucase.cpp
  49. +9 −3 deps/icu-small/source/common/ucase.h
  50. +305 −292 deps/icu-small/source/common/ucase_props_data.h
  51. +197 −330 deps/icu-small/source/common/ucasemap.cpp
  52. +69 −17 deps/icu-small/source/common/ucasemap_imp.h
  53. +38 −11 deps/icu-small/source/common/ucasemap_titlecase_brkiter.cpp
  54. +1 −4 deps/icu-small/source/common/uchar.cpp
  55. +3,325 −3,289 deps/icu-small/source/common/uchar_props_data.h
  56. +1 −1 deps/icu-small/source/common/ucharstriebuilder.cpp
  57. +1 −1 deps/icu-small/source/common/ucln_cmn.h
  58. +2 −1 deps/icu-small/source/common/ucnv_ct.cpp
  59. +4 −4 deps/icu-small/source/common/ucnv_lmb.cpp
  60. +11 −3 deps/icu-small/source/common/ucnv_u16.cpp
  61. +88 −236 deps/icu-small/source/common/ucnv_u8.cpp
  62. +3 −2 deps/icu-small/source/common/ucnvlat1.cpp
  63. +78 −93 deps/icu-small/source/common/ucnvmbcs.cpp
  64. +94 −79 deps/icu-small/source/common/ucurr.cpp
  65. +6 −2 deps/icu-small/source/common/udata.cpp
  66. +19 −3 deps/icu-small/source/common/uhash.cpp
  67. +19 −0 deps/icu-small/source/common/uhash.h
  68. +2 −2 deps/icu-small/source/common/uinvchar.cpp
  69. +1 −1 deps/icu-small/source/common/ulist.cpp
  70. +10 −6 deps/icu-small/source/common/uloc.cpp
  71. +3 −3 deps/icu-small/source/common/uloc_tag.cpp
  72. +5 −3 deps/icu-small/source/common/umapfile.cpp
  73. +1 −1 deps/icu-small/source/common/umutex.cpp
  74. +10 −14 deps/icu-small/source/common/unicode/brkiter.h
  75. +25 −8 deps/icu-small/source/common/unicode/bytestream.h
  76. +157 −20 deps/icu-small/source/common/unicode/casemap.h
  77. +26 −26 deps/icu-small/source/common/unicode/char16ptr.h
  78. +1 −1 deps/icu-small/source/common/unicode/docmain.h
  79. +186 −9 deps/icu-small/source/common/unicode/edits.h
  80. +38 −4 deps/icu-small/source/common/unicode/filteredbrk.h
  81. +0 −37 deps/icu-small/source/common/unicode/localpointer.h
  82. +1 −1 deps/icu-small/source/common/unicode/locid.h
  83. +127 −13 deps/icu-small/source/common/unicode/normalizer2.h
  84. +28 −41 deps/icu-small/source/common/unicode/platform.h
  85. +59 −76 deps/icu-small/source/common/unicode/rbbi.h
  86. +10 −0 deps/icu-small/source/common/unicode/simpleformatter.h
  87. +198 −0 deps/icu-small/source/common/unicode/stringoptions.h
  88. +9 −9 deps/icu-small/source/common/unicode/stringtriebuilder.h
  89. +14 −17 deps/icu-small/source/common/unicode/ubiditransform.h
  90. +2 −1 deps/icu-small/source/common/unicode/ubrk.h
  91. +2 −51 deps/icu-small/source/common/unicode/ucasemap.h
  92. +54 −26 deps/icu-small/source/common/unicode/uchar.h
  93. +1 −1 deps/icu-small/source/common/unicode/uclean.h
  94. +6 −3 deps/icu-small/source/common/unicode/uconfig.h
  95. +3 −7 deps/icu-small/source/common/unicode/udisplaycontext.h
  96. +5 −18 deps/icu-small/source/common/unicode/unistr.h
  97. +8 −8 deps/icu-small/source/common/unicode/unorm.h
  98. +1 −24 deps/icu-small/source/common/unicode/unorm2.h
  99. +3 −0 deps/icu-small/source/common/unicode/urename.h
  100. +8 −1 deps/icu-small/source/common/unicode/uscript.h
  101. +0 −10 deps/icu-small/source/common/unicode/ustring.h
  102. +1 −1 deps/icu-small/source/common/unicode/utext.h
  103. +17 −17 deps/icu-small/source/common/unicode/utf.h
  104. +126 −6 deps/icu-small/source/common/unicode/utf16.h
  105. +88 −60 deps/icu-small/source/common/unicode/utf8.h
  106. +17 −2 deps/icu-small/source/common/unicode/utf_old.h
  107. +5 −10 deps/icu-small/source/common/unicode/uvernum.h
  108. +2 −2 deps/icu-small/source/common/unifiedcache.cpp
  109. +1 −1 deps/icu-small/source/common/unifiedcache.h
  110. +8 −13 deps/icu-small/source/common/uniset_props.cpp
  111. +9 −12 deps/icu-small/source/common/unisetspan.cpp
  112. +0 −2 deps/icu-small/source/common/unistr.cpp
  113. +21 −1 deps/icu-small/source/common/unistr_case.cpp
  114. +13 −23 deps/icu-small/source/common/unistr_titlecase_brkiter.cpp
  115. +8 −0 deps/icu-small/source/common/uprops.cpp
  116. +5 −4 deps/icu-small/source/common/uprops.h
  117. +1 −0 deps/icu-small/source/common/uresbund.cpp
  118. +10 −7 deps/icu-small/source/common/uscript_props.cpp
  119. +59 −0 deps/icu-small/source/common/ustr_imp.h
  120. +165 −34 deps/icu-small/source/common/ustr_titlecase_brkiter.cpp
  121. +33 −37 deps/icu-small/source/common/ustrcase.cpp
  122. +234 −435 deps/icu-small/source/common/ustrtrns.cpp
  123. +6 −10 deps/icu-small/source/common/utext.cpp
  124. +138 −145 deps/icu-small/source/common/utf_impl.cpp
  125. +2 −2 deps/icu-small/source/common/utrie2.cpp
  126. +17 −21 deps/icu-small/source/common/utrie2.h
  127. +7 −7 deps/icu-small/source/common/uts46.cpp
  128. BIN deps/icu-small/source/data/in/{icudt59l.dat → icudt60l.dat}
  129. +7 −7 deps/icu-small/source/i18n/affixpatternparser.cpp
  130. +9 −5 deps/icu-small/source/i18n/anytrans.cpp
  131. +9 −6 deps/icu-small/source/i18n/calendar.cpp
  132. +2 −0 deps/icu-small/source/i18n/coll.cpp
  133. +2 −1 deps/icu-small/source/i18n/collationdatareader.cpp
  134. +1 −1 deps/icu-small/source/i18n/collationdatawriter.cpp
  135. +1 −1 deps/icu-small/source/i18n/collationfastlatinbuilder.cpp
  136. +53 −53 deps/icu-small/source/i18n/collationfcd.cpp
  137. +17 −2 deps/icu-small/source/i18n/collationiterator.h
  138. +1 −1 deps/icu-small/source/i18n/collationweights.cpp
  139. +22 −2 deps/icu-small/source/i18n/currunit.cpp
  140. +2 −1 deps/icu-small/source/i18n/datefmt.cpp
  141. +1 −1 deps/icu-small/source/i18n/dayperiodrules.cpp
  142. +24 −17 deps/icu-small/source/i18n/dcfmtsym.cpp
  143. +14 −14 deps/icu-small/source/i18n/decNumber.cpp
  144. +2 −2 deps/icu-small/source/i18n/decfmtst.cpp
  145. +13 −11 deps/icu-small/source/i18n/decimfmt.cpp
  146. +4 −3 deps/icu-small/source/i18n/decimfmtimpl.cpp
  147. +1 −1 deps/icu-small/source/i18n/digitformatter.cpp
  148. +4 −3 deps/icu-small/source/i18n/digitlst.cpp
  149. +13 −9 deps/icu-small/source/i18n/dtfmtsym.cpp
  150. +187 −71 deps/icu-small/source/i18n/dtptngen.cpp
  151. +5 −3 deps/icu-small/source/i18n/dtptngen_impl.h
  152. +5 −0 deps/icu-small/source/i18n/gregoimp.cpp
  153. +11 −0 deps/icu-small/source/i18n/gregoimp.h
  154. +26 −21 deps/icu-small/source/i18n/measfmt.cpp
  155. +83 −70 deps/icu-small/source/i18n/measunit.cpp
  156. +4 −1 deps/icu-small/source/i18n/msgfmt.cpp
  157. +8 −5 deps/icu-small/source/i18n/nfrs.cpp
  158. +3 −1 deps/icu-small/source/i18n/nfrs.h
  159. +4 −4 deps/icu-small/source/i18n/nfsubs.cpp
  160. +42 −0 deps/icu-small/source/i18n/nounit.cpp
  161. +403 −0 deps/icu-small/source/i18n/number_affixutils.cpp
  162. +224 −0 deps/icu-small/source/i18n/number_affixutils.h
  163. +326 −0 deps/icu-small/source/i18n/number_compact.cpp
  164. +91 −0 deps/icu-small/source/i18n/number_compact.h
  165. +1,011 −0 deps/icu-small/source/i18n/number_decimalquantity.cpp
  166. +438 −0 deps/icu-small/source/i18n/number_decimalquantity.h
  167. +102 −0 deps/icu-small/source/i18n/number_decimfmtprops.cpp
  168. +108 −0 deps/icu-small/source/i18n/number_decimfmtprops.h
  169. +369 −0 deps/icu-small/source/i18n/number_fluent.cpp
  170. +464 −0 deps/icu-small/source/i18n/number_formatimpl.cpp
  171. +125 −0 deps/icu-small/source/i18n/number_formatimpl.h
  172. +55 −0 deps/icu-small/source/i18n/number_grouping.cpp
  173. +48 −0 deps/icu-small/source/i18n/number_integerwidth.cpp
  174. +165 −0 deps/icu-small/source/i18n/number_longnames.cpp
  175. +48 −0 deps/icu-small/source/i18n/number_longnames.h
  176. +303 −0 deps/icu-small/source/i18n/number_modifiers.cpp
  177. +254 −0 deps/icu-small/source/i18n/number_modifiers.h
  178. +75 −0 deps/icu-small/source/i18n/number_notation.cpp
  179. +84 −0 deps/icu-small/source/i18n/number_padding.cpp
  180. +351 −0 deps/icu-small/source/i18n/number_patternmodifier.cpp
  181. +259 −0 deps/icu-small/source/i18n/number_patternmodifier.h
  182. +839 −0 deps/icu-small/source/i18n/number_patternstring.cpp
  183. +266 −0 deps/icu-small/source/i18n/number_patternstring.h
  184. +347 −0 deps/icu-small/source/i18n/number_rounding.cpp
  185. +141 −0 deps/icu-small/source/i18n/number_roundingutils.h
  186. +138 −0 deps/icu-small/source/i18n/number_scientific.cpp
  187. +62 −0 deps/icu-small/source/i18n/number_scientific.h
  188. +460 −0 deps/icu-small/source/i18n/number_stringbuilder.cpp
  189. +135 −0 deps/icu-small/source/i18n/number_stringbuilder.h
  190. +293 −0 deps/icu-small/source/i18n/number_types.h
  191. +130 −0 deps/icu-small/source/i18n/number_utils.h
  192. +18 −0 deps/icu-small/source/i18n/numfmt.cpp
  193. +7 −0 deps/icu-small/source/i18n/numsys.cpp
  194. +1 −1 deps/icu-small/source/i18n/persncal.cpp
  195. +58 −19 deps/icu-small/source/i18n/plurrule.cpp
  196. +82 −6 deps/icu-small/source/i18n/plurrule_impl.h
  197. +2 −2 deps/icu-small/source/i18n/precision.cpp
  198. +73 −13 deps/icu-small/source/i18n/rbnf.cpp
  199. 0 deps/icu-small/source/i18n/regexcst.pl
  200. +1 −1 deps/icu-small/source/i18n/reldatefmt.cpp
  201. +1 −1 deps/icu-small/source/i18n/rematch.cpp
  202. +22 −8 deps/icu-small/source/i18n/smpdtfmt.cpp
  203. +98 −55 deps/icu-small/source/i18n/transreg.cpp
  204. +6 −4 deps/icu-small/source/i18n/transreg.h
  205. +35 −10 deps/icu-small/source/i18n/tzfmt.cpp
  206. +5 −5 deps/icu-small/source/i18n/tzgnames.cpp
  207. +22 −12 deps/icu-small/source/i18n/tznames_impl.cpp
  208. +5 −0 deps/icu-small/source/i18n/tznames_impl.h
  209. +1 −0 deps/icu-small/source/i18n/ucln_in.h
  210. +5 −5 deps/icu-small/source/i18n/ucol_res.cpp
  211. +9 −2 deps/icu-small/source/i18n/ucol_sit.cpp
  212. +2 −4 deps/icu-small/source/i18n/umsg.cpp
  213. +2 −2 deps/icu-small/source/i18n/unicode/calendar.h
  214. +2 −2 deps/icu-small/source/i18n/unicode/coll.h
  215. +18 −0 deps/icu-small/source/i18n/unicode/currunit.h
  216. +25 −1 deps/icu-small/source/i18n/unicode/dcfmtsym.h
  217. +0 −24 deps/icu-small/source/i18n/unicode/decimfmt.h
  218. +0 −2 deps/icu-small/source/i18n/unicode/dtitvinf.h
  219. +19 −18 deps/icu-small/source/i18n/unicode/dtptngen.h
  220. +10 −2 deps/icu-small/source/i18n/unicode/fpositer.h
  221. +1 −3 deps/icu-small/source/i18n/unicode/measfmt.h
  222. +9 −4 deps/icu-small/source/i18n/unicode/measunit.h
  223. +111 −0 deps/icu-small/source/i18n/unicode/nounit.h
  224. +1,998 −0 deps/icu-small/source/i18n/unicode/numberformatter.h
  225. +48 −1 deps/icu-small/source/i18n/unicode/numfmt.h
  226. +3 −3 deps/icu-small/source/i18n/unicode/plurrule.h
  227. +16 −1 deps/icu-small/source/i18n/unicode/rbnf.h
  228. 0 deps/icu-small/source/i18n/unicode/selfmt.h
  229. +10 −0 deps/icu-small/source/i18n/unicode/smpdtfmt.h
  230. +2 −4 deps/icu-small/source/i18n/unicode/tznames.h
  231. +2 −2 deps/icu-small/source/i18n/unicode/ucoleitr.h
  232. +9 −4 deps/icu-small/source/i18n/unicode/unum.h
  233. +22 −27 deps/icu-small/source/i18n/unicode/uspoof.h
  234. +72 −25 deps/icu-small/source/i18n/unum.cpp
  235. +4 −4 deps/icu-small/source/i18n/uspoof.cpp
  236. +1 −0 deps/icu-small/source/i18n/uspoof_conf.cpp
  237. +2 −0 deps/icu-small/source/i18n/uspoof_conf.h
  238. +33 −35 deps/icu-small/source/i18n/utf8collationiterator.cpp
  239. +8 −18 deps/icu-small/source/i18n/vtzone.cpp
  240. +3 −3 deps/icu-small/source/i18n/windtfmt.cpp
  241. +4 −4 deps/icu-small/source/i18n/winnmfmt.cpp
  242. +14 −14 deps/icu-small/source/i18n/wintzimpl.cpp
  243. +1 −2 deps/icu-small/source/i18n/zonemeta.cpp
  244. +2 −7 deps/icu-small/source/tools/escapesrc/escapesrc.cpp
  245. +2 −2 deps/icu-small/source/tools/genrb/parse.cpp
  246. +17 −5 deps/icu-small/source/tools/genrb/wrtjava.cpp
  247. +5 −0 deps/icu-small/source/tools/genrb/wrtxml.cpp
  248. +1 −1 deps/icu-small/source/tools/toolutil/package.cpp
  249. +2 −1 deps/icu-small/source/tools/toolutil/pkg_genc.cpp
  250. +41 −2 deps/icu-small/source/tools/toolutil/ppucd.cpp
  251. +6 −1 deps/icu-small/source/tools/toolutil/ppucd.h
  252. +1 −0 deps/icu-small/source/tools/toolutil/swapimpl.cpp
  253. +2 −2 deps/icu-small/source/tools/toolutil/uparse.cpp
  254. +1 −0 tools/icu/icu-generic.gyp
View
@@ -230,7 +230,7 @@ The externally maintained libraries used by Node.js are:
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
View
@@ -1092,8 +1092,8 @@ def glob_to_var(dir_base, dir_sub, patch_dir):
def configure_intl(o):
icus = [
{
'url': 'https://ssl.icu-project.org/files/icu4c/59.1/icu4c-59_1-src.zip',
'md5': '29a41f9bb576b06c7eef0487a84a7674',
'url': 'https://ssl.icu-project.org/files/icu4c/60.1/icu4c-60_1-src.zip',
'md5': 'e6cb990ac2a3161d31a3def8435f80cb',
},
]
def icu_download(path):
View
@@ -131,7 +131,7 @@ property of their respective owners.
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyrighy (c) 1999 TaBE Project.
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
@@ -1,8 +1,8 @@
Small ICU sources - auto generated by shrink-icu-src.py
This directory contains the ICU subset used by --with-intl=small-icu (the default)
It is a strict subset of ICU 59 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt59l.dat : Reduced-size data file
It is a strict subset of ICU 60 source files with the following exception(s):
* deps/icu-small/source/data/in/icudt60l.dat : Reduced-size data file
To rebuild this directory, see ../../tools/icu/README.md
@@ -28,7 +28,7 @@ U_NAMESPACE_BEGIN
BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list(parentList), listLength(parentListLength) {
uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
uprv_memset(latin1Contains, 0, sizeof(latin1Contains));
uprv_memset(table7FF, 0, sizeof(table7FF));
uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
@@ -45,14 +45,16 @@ BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
}
list4kStarts[0x11]=listLength-1;
containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]);
initBits();
overrideIllegal();
}
BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
containsFFFD(otherBMPSet.containsFFFD),
list(newParentList), listLength(newParentListLength) {
uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains));
uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
@@ -120,21 +122,38 @@ void BMPSet::initBits() {
UChar32 start, limit;
int32_t listIndex=0;
// Set asciiBytes[].
// Set latin1Contains[].
do {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(start>=0x80) {
if(start>=0x100) {
break;
}
do {
asciiBytes[start++]=1;
} while(start<limit && start<0x80);
} while(limit<=0x80);
latin1Contains[start++]=1;
} while(start<limit && start<0x100);
} while(limit<=0x100);
// Find the first range overlapping with (or after) 80..FF again,
// to include them in table7FF as well.
for(listIndex=0;;) {
start=list[listIndex++];
if(listIndex<listLength) {
limit=list[listIndex++];
} else {
limit=0x110000;
}
if(limit>0x80) {
if(start<0x80) {
start=0x80;
}
break;
}
}
// Set table7FF[].
while(start<0x800) {
@@ -204,19 +223,14 @@ void BMPSet::initBits() {
* for faster validity checking at runtime.
* No need to set 0 values where they were reset to 0 in the constructor
* and not modified by initBits().
* (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* (table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
* Need to set 0 values for surrogates D800..DFFF.
*/
void BMPSet::overrideIllegal() {
uint32_t bits, mask;
int32_t i;
if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
// contains(FFFD)==TRUE
for(i=0x80; i<0xc0; ++i) {
asciiBytes[i]=1;
}
if(containsFFFD) {
bits=3; // Lead bytes 0xC0 and 0xC1.
for(i=0; i<64; ++i) {
table7FF[i]|=bits;
@@ -233,7 +247,6 @@ void BMPSet::overrideIllegal() {
bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
}
} else {
// contains(FFFD)==FALSE
mask=~(0x10001<<0xd); // Lead byte 0xED.
for(i=32; i<64; ++i) { // Second half of 4k block.
bmpBlockBits[i]&=mask;
@@ -277,8 +290,8 @@ int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
UBool
BMPSet::contains(UChar32 c) const {
if((uint32_t)c<=0x7f) {
return (UBool)asciiBytes[c];
if((uint32_t)c<=0xff) {
return (UBool)latin1Contains[c];
} else if((uint32_t)c<=0x7ff) {
return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
} else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
@@ -314,8 +327,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span
do {
c=*s;
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@@ -354,8 +367,8 @@ BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition
// span not
do {
c=*s;
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@@ -403,8 +416,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(!asciiBytes[c]) {
if(c<=0xff) {
if(!latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@@ -446,8 +459,8 @@ BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondi
// span not
for(;;) {
c=*(--limit);
if(c<=0x7f) {
if(asciiBytes[c]) {
if(c<=0xff) {
if(latin1Contains[c]) {
break;
}
} else if(c<=0x7ff) {
@@ -497,22 +510,22 @@ const uint8_t *
BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
const uint8_t *limit=s+length;
uint8_t b=*s;
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// Initial all-ASCII span.
if(spanCondition) {
do {
if(!asciiBytes[b] || ++s==limit) {
if(!latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b] || ++s==limit) {
if(latin1Contains[b] || ++s==limit) {
return s;
}
b=*s;
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
length=(int32_t)(limit-s);
}
@@ -540,20 +553,20 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// single trail byte, check for preceding 3- or 4-byte lead byte
if(length>=2 && (b=*(limit-2))>=0xe0) {
limit-=2;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
} else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
// 4-byte lead byte with only two trail bytes
limit-=3;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
} else {
// lead byte with no trail bytes
--limit;
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
limit0=limit;
}
}
@@ -563,26 +576,26 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
while(s<limit) {
b=*s;
if(b<0xc0) {
// ASCII; or trail bytes with the result of contains(FFFD).
if(U8_IS_SINGLE(b)) {
// ASCII
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return s;
} else if(++s==limit) {
return limit0;
}
b=*s;
} while(b<0xc0);
} while(U8_IS_SINGLE(b));
}
}
++s; // Advance past the lead byte.
@@ -619,16 +632,17 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
if( ( (0x10000<=c && c<=0x10ffff) ?
containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
asciiBytes[0x80]
containsFFFD
) != spanCondition
) {
return s-1;
}
s+=3;
continue;
}
} else /* 0xc0<=b<0xe0 */ {
} else {
if( /* handle U+0000..U+07FF inline */
b>=0xc0 &&
(t1=(uint8_t)(*s-0x80)) <= 0x3f
) {
if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
@@ -642,7 +656,7 @@ BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanConditi
// Give an illegal sequence the same value as the result of contains(FFFD).
// Handle each byte of an illegal sequence separately to simplify the code;
// no need to optimize error handling.
if(asciiBytes[0x80]!=spanCondition) {
if(containsFFFD!=spanCondition) {
return s-1;
}
}
@@ -667,26 +681,26 @@ BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCon
do {
b=s[--length];
if((int8_t)b>=0) {
if(U8_IS_SINGLE(b)) {
// ASCII sub-span
if(spanCondition) {
do {
if(!asciiBytes[b]) {
if(!latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
} else {
do {
if(asciiBytes[b]) {
if(latin1Contains[b]) {
return length+1;
} else if(length==0) {
return 0;
}
b=s[--length];
} while((int8_t)b>=0);
} while(U8_IS_SINGLE(b));
}
}
@@ -28,11 +28,12 @@ U_NAMESPACE_BEGIN
* Helper class for frozen UnicodeSets, implements contains() and span()
* optimized for BMP code points. Structured to be UTF-8-friendly.
*
* ASCII: Look up bytes.
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF,
* with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
* Supplementary characters: Binary search over
* the supplementary part of the parent set's inversion list.
*/
class BMPSet : public UMemory {
public:
@@ -96,12 +97,12 @@ class BMPSet : public UMemory {
inline UBool containsSlow(UChar32 c, int32_t lo, int32_t hi) const;
/*
* One byte per ASCII character, or trail byte in lead position.
* 0 or 1 for ASCII characters.
* The value for trail bytes is the result of contains(FFFD)
* for faster validity checking at runtime.
* One byte 0 or 1 per Latin-1 character.
*/
UBool asciiBytes[0xc0];
UBool latin1Contains[0x100];
/* TRUE if contains(U+FFFD). */
UBool containsFFFD;
/*
* One bit per code point from U+0000..U+07FF.
[The remaining file diffs failed to load and are not shown here.]

0 comments on commit 44d3e17

Please sign in to comment.