diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 46bddd17a7365..c34477bcf2000 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -225,7 +225,9 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf if (c < 0x80) { *out++ = c; - } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ + } else if (c < 0xC2) { + *out++ = MBFL_BAD_INPUT; + } else if (c <= 0xDF) { /* 2 byte character */ if (p < e) { unsigned char c2 = *p++; if ((c2 & 0xC0) != 0x80) { @@ -237,7 +239,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf } else { *out++ = MBFL_BAD_INPUT; } - } else if (c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ + } else if (c <= 0xEF) { /* 3 byte character */ if ((e - p) >= 2) { unsigned char c2 = *p++; unsigned char c3 = *p++; @@ -262,7 +264,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf } } } - } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ + } else if (c <= 0xF4) { /* 4 byte character */ if ((e - p) >= 3) { unsigned char c2 = *p++; unsigned char c3 = *p++; diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3fd4d43f3bf24..51e279acbdc1f 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1715,13 +1715,85 @@ PHP_FUNCTION(mb_str_split) } } +#ifdef __SSE2__ +/* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r) + * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector + * Takes a 128-bit XMM register, treats each byte as an 8-bit integer, and sums up all + * 16 of them, returning the sum in an ordinary scalar register */ +static inline uint32_t _mm_sum_epu8(const __m128i v) +{ + /* We don't have any dedicated instruction to sum up 8-bit values from a 128-bit register + * _mm_sad_epu8 takes the differences 
between corresponding bytes of two different XMM registers, + * sums up those differences, and stores them as two 16-bit integers in the top and bottom + * halves of the destination XMM register + * By using a zeroed-out XMM register as one operand, we ensure the "differences" which are + * summed up will actually just be the 8-bit values from `v` */ + __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128()); + /* If _mm_sad_epu8 had stored the sum of those bytes as a single integer, we would just have + * to extract it here; but it stored the sum as two different 16-bit values + * _mm_cvtsi128_si32 extracts one of those values into a scalar register + * _mm_extract_epi16 extracts the other one into another scalar register; then we just add them */ + return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4); +} +#endif + +/* This assumes that the `len` bytes starting at `p` are valid UTF-8 + * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes) + * Interpreted as signed integers, those are all byte values less than -64 + * A fast way to get the length of a UTF-8 string is to start with its byte length, + * then subtract off the number of continuation bytes */ +static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len) +{ + unsigned char *e = p + len; + +#ifdef __SSE2__ + if (len >= sizeof(__m128i)) { + const __m128i threshold = _mm_set1_epi8(-64); + const __m128i delta = _mm_set1_epi8(1); + __m128i counter = _mm_set1_epi8(0); /* Vector of 16 continuation-byte counters */ + + int reset_counter = 255; + do { + __m128i operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */ + __m128i lt = _mm_cmplt_epi8(operand, threshold); /* Find all which are continuation bytes */ + counter = _mm_add_epi8(counter, _mm_and_si128(lt, delta)); /* Update the 16 counters */ + + /* The counters can only go up to 255, so every 255 iterations, fold them into `len` + * and reset them to zero */ + if (--reset_counter == 0) { + len -= _mm_sum_epu8(counter); + counter 
= _mm_set1_epi8(0); + reset_counter = 255; + } + + p += sizeof(__m128i); + } while (p + sizeof(__m128i) <= e); + + len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */ + } +#endif + + /* Check for continuation bytes in the 0-15 remaining bytes at the end of the string */ + while (p < e) { + signed char c = *p++; + if (c < -64) { + len--; + } + } + + return len; +} + static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding) { unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4); if (char_len) { return ZSTR_LEN(string) / char_len; + } else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) { + return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string)); } + uint32_t wchar_buf[128]; unsigned char *in = (unsigned char*)ZSTR_VAL(string); size_t in_len = ZSTR_LEN(string); @@ -1789,14 +1861,7 @@ static unsigned char* offset_to_pointer_utf8(unsigned char *str, unsigned char * } static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) { - size_t result = 0; - while (pos > start) { - unsigned char c = *--pos; - if (c < 0x80 || (c & 0xC0) != 0x80) { - result++; - } - } - return result; + return mb_fast_strlen_utf8(start, pos - start); } static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse) diff --git a/ext/mbstring/tests/mb_strlen.phpt b/ext/mbstring/tests/mb_strlen.phpt index e32e9e9370580..b3fb28309bcbe 100644 --- a/ext/mbstring/tests/mb_strlen.phpt +++ b/ext/mbstring/tests/mb_strlen.phpt @@ -62,10 +62,26 @@ mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n"); print strlen($jis) . "\n"; echo "== UTF-8 ==\n"; -$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP'); -print mb_strlen($utf8,'UTF-8') . 
"\n"; -mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n"); -print strlen($utf8) . "\n"; +$utf8 = mb_convert_encoding($euc_jp, 'UTF-8', 'EUC-JP'); +print mb_strlen($utf8,'UTF-8') . " codepoints\n"; +mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n"); +print strlen($utf8) . " bytes\n"; + +$utf8 = "abcde あいうえお 汉字 ελληνικά"; +$long_utf8 = str_repeat($utf8, 100); +print mb_strlen($utf8, 'UTF-8') . "\n"; +print mb_strlen($long_utf8, 'UTF-8') . "\n"; + +echo "== UTF-8 with performance optimizations ==\n"; +// Optimized mb_strlen can be used on UTF-8 strings after they are checked for validity +mb_check_encoding($utf8); +mb_check_encoding($long_utf8); +print mb_strlen($utf8, 'UTF-8') . "\n"; +print mb_strlen($long_utf8, 'UTF-8') . "\n"; + +$str = str_repeat('Σ', 2048); // 2-byte UTF-8 character +mb_check_encoding($str, 'UTF-8'); +print mb_strlen($str, 'UTF-8') . "\n"; // Wrong Parameters echo "== WRONG PARAMETERS ==\n"; @@ -110,7 +126,13 @@ try { 43 90 == UTF-8 == -43 -101 +43 codepoints +101 bytes +23 +2300 +== UTF-8 with performance optimizations == +23 +2300 +2048 == WRONG PARAMETERS == mb_strlen(): Argument #2 ($encoding) must be a valid encoding, "BAD_NAME" given