Skip to content

Commit

Permalink
Combine control into one character group
Browse files Browse the repository at this point in the history
As with punctuation (P), we currently have no need to distinguish
between Cc and Cf, so only their union is stored.
  • Loading branch information
nikic committed Aug 24, 2021
1 parent d0897b3 commit 425c2e3
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 56 deletions.
62 changes: 29 additions & 33 deletions ext/mbstring/php_unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,39 +40,38 @@
#define UC_ZS 6 /* Separator, Space */
#define UC_ZL 7 /* Separator, Line */
#define UC_ZP 8 /* Separator, Paragraph */
#define UC_CC 9 /* Other, Control */
#define UC_CF 10 /* Other, Format */
#define UC_OS 11 /* Other, Surrogate */
#define UC_CO 12 /* Other, Private Use */
#define UC_CN 13 /* Other, Not Assigned */
#define UC_LU 14 /* Letter, Uppercase */
#define UC_LL 15 /* Letter, Lowercase */
#define UC_LT 16 /* Letter, Titlecase */
#define UC_LM 17 /* Letter, Modifier */
#define UC_LO 18 /* Letter, Other */
#define UC_SM 19 /* Symbol, Math */
#define UC_SC 20 /* Symbol, Currency */
#define UC_SK 21 /* Symbol, Modifier */
#define UC_SO 22 /* Symbol, Other */
#define UC_L 23 /* Left-To-Right */
#define UC_R 24 /* Right-To-Left */
#define UC_EN 25 /* European Number */
#define UC_ES 26 /* European Number Separator */
#define UC_ET 27 /* European Number Terminator */
#define UC_AN 28 /* Arabic Number */
#define UC_CS 29 /* Common Number Separator */
#define UC_B 30 /* Block Separator */
#define UC_S 31 /* Segment Separator */
#define UC_WS 32 /* Whitespace */
#define UC_ON 33 /* Other Neutrals */
#define UC_AL 34 /* Arabic Letter */
#define UC_OS 9 /* Other, Surrogate */
#define UC_CO 10 /* Other, Private Use */
#define UC_CN 11 /* Other, Not Assigned */
#define UC_LU 12 /* Letter, Uppercase */
#define UC_LL 13 /* Letter, Lowercase */
#define UC_LT 14 /* Letter, Titlecase */
#define UC_LM 15 /* Letter, Modifier */
#define UC_LO 16 /* Letter, Other */
#define UC_SM 17 /* Symbol, Math */
#define UC_SC 18 /* Symbol, Currency */
#define UC_SK 19 /* Symbol, Modifier */
#define UC_SO 20 /* Symbol, Other */
#define UC_L 21 /* Left-To-Right */
#define UC_R 22 /* Right-To-Left */
#define UC_EN 23 /* European Number */
#define UC_ES 24 /* European Number Separator */
#define UC_ET 25 /* European Number Terminator */
#define UC_AN 26 /* Arabic Number */
#define UC_CS 27 /* Common Number Separator */
#define UC_B 28 /* Block Separator */
#define UC_S 29 /* Segment Separator */
#define UC_WS 30 /* Whitespace */
#define UC_ON 31 /* Other Neutrals */
#define UC_AL 32 /* Arabic Letter */

/* Merged property categories */
#define UC_P 35
#define UC_C 33 /* Other, Control and Format (union of Cc and Cf) */

This comment has been minimized.

Copy link
@alexdowad

alexdowad Aug 24, 2021

Contributor

This line could also benefit from a comment.

#define UC_P 34 /* Punctuation (union of Pc, Pd, Ps, Pe, Po, Pi, Pf) */

/* Derived properties from DerivedCoreProperties.txt */
#define UC_CASED 36
#define UC_CASE_IGNORABLE 37
#define UC_CASED 35
#define UC_CASE_IGNORABLE 36


MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...);
Expand Down Expand Up @@ -113,7 +112,7 @@ static inline int php_unicode_is_upper(unsigned long code) {
#define php_unicode_is_alpha(cc) php_unicode_is_prop(cc, UC_LU, UC_LL, UC_LM, UC_LO, UC_LT, -1)
#define php_unicode_is_digit(cc) php_unicode_is_prop1(cc, UC_ND)
#define php_unicode_is_alnum(cc) php_unicode_is_prop(cc, UC_LU, UC_LL, UC_LM, UC_LO, UC_LT, UC_ND, -1)
#define php_unicode_is_cntrl(cc) php_unicode_is_prop(cc, UC_CC, UC_CF, -1)
/* Tests the merged control group UC_C, which covers both Cc and Cf. */
#define php_unicode_is_cntrl(cc) php_unicode_is_prop1(cc, UC_C)
#define php_unicode_is_blank(cc) php_unicode_is_prop1(cc, UC_ZS)
#define php_unicode_is_punct(cc) php_unicode_is_prop1(cc, UC_P)
#define php_unicode_is_graph(cc) php_unicode_is_prop(cc, \
Expand All @@ -126,9 +125,6 @@ static inline int php_unicode_is_upper(unsigned long code) {
UC_SM, UC_SM, UC_SC, UC_SK, UC_SO, UC_ZS, -1)
#define php_unicode_is_title(cc) php_unicode_is_prop1(cc, UC_LT)

#define php_unicode_is_isocntrl(cc) php_unicode_is_prop1(cc, UC_CC)
#define php_unicode_is_fmtcntrl(cc) php_unicode_is_prop1(cc, UC_CF)

#define php_unicode_is_symbol(cc) php_unicode_is_prop(cc, UC_SM, UC_SC, UC_SO, UC_SK, -1)
#define php_unicode_is_number(cc) php_unicode_is_prop(cc, UC_ND, UC_NO, UC_NL, -1)
#define php_unicode_is_nonspacing(cc) php_unicode_is_prop1(cc, UC_MN)
Expand Down
16 changes: 10 additions & 6 deletions ext/mbstring/ucgendat/ucgendat.php
Original file line number Diff line number Diff line change
Expand Up @@ -101,12 +101,12 @@ public function __construct() {
*/
$this->propIndexes = array_flip([
"Mn", "Mc", "Me", "Nd", "Nl", "No",
"Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
"Co", "Cn", "Lu", "Ll", "Lt", "Lm",
"Lo", "Sm", "Sc", "Sk", "So", "L",
"R", "EN", "ES", "ET", "AN", "CS",
"B", "S", "WS", "ON", "AL",
"P", "Cased", "Case_Ignorable"
"Zs", "Zl", "Zp", "Cs", "Co", "Cn",
"Lu", "Ll", "Lt", "Lm", "Lo", "Sm",
"Sc", "Sk", "So", "L", "R", "EN",
"ES", "ET", "AN", "CS", "B", "S",
"WS", "ON", "AL",
"C", "P", "Cased", "Case_Ignorable"
]);
$this->numProps = count($this->propIndexes);

Expand Down Expand Up @@ -135,6 +135,10 @@ function propToIndex(string $prop) : int {
if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"])) {
$prop = "P";
}
/* Same for control. */
if (in_array($prop, ["Cc", "Cf"])) {
$prop = "C";
}

if (!isset($this->propIndexes[$prop])) {
throw new Exception("Unknown property $prop");
Expand Down
34 changes: 17 additions & 17 deletions ext/mbstring/unicode_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
* the project's page doesn't seem to be live anymore, so you can use
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */

static const unsigned short _ucprop_size = 38;
static const unsigned short _ucprop_size = 37;

static const unsigned short _ucprop_offsets[] = {
0x0000, 0x028e, 0x03ec, 0x03f6, 0x0470, 0x0488, 0x0516, 0x0524,
0x0526, 0x0528, 0x052c, 0x0554, 0x0556, 0x055c, 0x055c, 0x0a58,
0x0f62, 0x0f76, 0x0ff0, 0x13c2, 0x1442, 0x146c, 0x14a8, 0x1614,
0x1b90, 0x1c1e, 0x1c38, 0x1c4a, 0x1c7a, 0x1c88, 0x1ca2, 0x1cac,
0x1cb2, 0x1cc0, 0x20ae, 0x212a, 0x229c, 0x23b6, 0x26ea, 0x0000
0x0526, 0x0528, 0x052a, 0x0530, 0x0530, 0x0a2c, 0x0f36, 0x0f4a,
0x0fc4, 0x1396, 0x1416, 0x1440, 0x147c, 0x15e8, 0x1b64, 0x1bf2,
0x1c0c, 0x1c1e, 0x1c4e, 0x1c5c, 0x1c76, 0x1c80, 0x1c86, 0x1c94,
0x2082, 0x20fe, 0x212a, 0x229c, 0x23b6, 0x26ea, 0x0000, 0x0000
};

static const unsigned int _ucprop_ranges[] = {
Expand Down Expand Up @@ -351,17 +351,6 @@ static const unsigned int _ucprop_ranges[] = {
0x00002000, 0x0000200a, 0x0000202f, 0x0000202f,
0x0000205f, 0x0000205f, 0x00003000, 0x00003000,
0x00002028, 0x00002028, 0x00002029, 0x00002029,
0x00000000, 0x0000001f, 0x0000007f, 0x0000009f,
0x000000ad, 0x000000ad, 0x00000600, 0x00000605,
0x0000061c, 0x0000061c, 0x000006dd, 0x000006dd,
0x0000070f, 0x0000070f, 0x000008e2, 0x000008e2,
0x0000180e, 0x0000180e, 0x0000200b, 0x0000200f,
0x0000202a, 0x0000202e, 0x00002060, 0x00002064,
0x00002066, 0x0000206f, 0x0000feff, 0x0000feff,
0x0000fff9, 0x0000fffb, 0x000110bd, 0x000110bd,
0x000110cd, 0x000110cd, 0x00013430, 0x00013438,
0x0001bca0, 0x0001bca3, 0x0001d173, 0x0001d17a,
0x000e0001, 0x000e0001, 0x000e0020, 0x000e007f,
0x0000d800, 0x0000dfff, 0x0000e000, 0x0000f8ff,
0x000f0000, 0x000ffffd, 0x00100000, 0x0010fffd,
0x00000041, 0x0000005a, 0x000000c0, 0x000000d6,
Expand Down Expand Up @@ -2143,7 +2132,18 @@ static const unsigned int _ucprop_ranges[] = {
0x0001ee79, 0x0001ee7c, 0x0001ee7e, 0x0001ee7e,
0x0001ee80, 0x0001ee89, 0x0001ee8b, 0x0001ee9b,
0x0001eea1, 0x0001eea3, 0x0001eea5, 0x0001eea9,
0x0001eeab, 0x0001eebb, 0x00000021, 0x00000023,
0x0001eeab, 0x0001eebb, 0x00000000, 0x0000001f,
0x0000007f, 0x0000009f, 0x000000ad, 0x000000ad,
0x00000600, 0x00000605, 0x0000061c, 0x0000061c,
0x000006dd, 0x000006dd, 0x0000070f, 0x0000070f,
0x000008e2, 0x000008e2, 0x0000180e, 0x0000180e,
0x0000200b, 0x0000200f, 0x0000202a, 0x0000202e,
0x00002060, 0x00002064, 0x00002066, 0x0000206f,
0x0000feff, 0x0000feff, 0x0000fff9, 0x0000fffb,
0x000110bd, 0x000110bd, 0x000110cd, 0x000110cd,
0x00013430, 0x00013438, 0x0001bca0, 0x0001bca3,
0x0001d173, 0x0001d17a, 0x000e0001, 0x000e0001,
0x000e0020, 0x000e007f, 0x00000021, 0x00000023,
0x00000025, 0x0000002a, 0x0000002c, 0x0000002f,
0x0000003a, 0x0000003b, 0x0000003f, 0x00000040,
0x0000005b, 0x0000005d, 0x0000005f, 0x0000005f,
Expand Down

0 comments on commit 425c2e3

Please sign in to comment.