From 092ad3e4624f86a43648e6afb1c22ae7e3406699 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Thu, 5 Jan 2023 22:41:23 +0200 Subject: [PATCH 1/2] Optimize branch structure of UTF-8 decoder routine I like the asm which gcc -O3 generates on this modified code... and guess what: my CPU likes it too! (The asm is noticeably tighter, without any extra operations in the path which dispatches to the code for decoding a 1-byte, 2-byte, 3-byte, or 4-byte character. It's just CMP, conditional jump, CMP, conditional jump, CMP, conditional jump. ...Though I was admittedly impressed to see gcc could implement the boolean expression `c >= 0xC2 && c <= 0xDF` with just 3 instructions: add, CMP, then conditional jump. Pretty slick stuff there, guys.) Benchmark results: UTF-8, short - to UTF-16LE faster by 7.36% (0.0001 vs 0.0002) UTF-8, short - to UTF-16BE faster by 6.24% (0.0001 vs 0.0002) UTF-8, medium - to UTF-16BE faster by 4.56% (0.0003 vs 0.0003) UTF-8, medium - to UTF-16LE faster by 4.00% (0.0003 vs 0.0003) UTF-8, long - to UTF-16BE faster by 1.02% (0.0215 vs 0.0217) UTF-8, long - to UTF-16LE faster by 1.01% (0.0209 vs 0.0211) --- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 46bddd17a7365..c34477bcf2000 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -225,7 +225,9 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf if (c < 0x80) { *out++ = c; - } else if (c >= 0xC2 && c <= 0xDF) { /* 2 byte character */ + } else if (c < 0xC2) { + *out++ = MBFL_BAD_INPUT; + } else if (c <= 0xDF) { /* 2 byte character */ if (p < e) { unsigned char c2 = *p++; if ((c2 & 0xC0) != 0x80) { @@ -237,7 +239,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf } else { *out++ = MBFL_BAD_INPUT; } - } else if 
(c >= 0xE0 && c <= 0xEF) { /* 3 byte character */ + } else if (c <= 0xEF) { /* 3 byte character */ if ((e - p) >= 2) { unsigned char c2 = *p++; unsigned char c3 = *p++; @@ -262,7 +264,7 @@ static size_t mb_utf8_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf } } } - } else if (c >= 0xF0 && c <= 0xF4) { /* 4 byte character */ + } else if (c <= 0xF4) { /* 4 byte character */ if ((e - p) >= 3) { unsigned char c2 = *p++; unsigned char c3 = *p++; From 92105da742bd6ba3a74353f5715d5f496611b040 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Fri, 30 Dec 2022 11:11:26 +0200 Subject: [PATCH 2/2] Add fast SSE2-based implementation of mb_strlen for known-valid UTF-8 strings One small piece of this was obtained from Stack Overflow. According to Stack Overflow's Terms of Service, all user-contributed code on SO is provided under a Creative Commons license. I believe this license is compatible with the code being included in PHP. Benchmarking results (UTF-8 only, for strings which have already been checked using mb_check_encoding): For very short (0-5 byte) strings, mb_strlen is 12% faster. The speedup gets greater and greater on longer input strings; for strings around 100KB, mb_strlen is 23 times faster. Currently the 'fast' code is gated behind a GC flag check which ensures it is only used on strings which have already been checked for UTF-8 validity. This is because the accelerated code will return different results on some invalid UTF-8 strings. 
--- ext/mbstring/mbstring.c | 81 ++++++++++++++++++++++++++++--- ext/mbstring/tests/mb_strlen.phpt | 34 ++++++++++--- 2 files changed, 101 insertions(+), 14 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 3fd4d43f3bf24..51e279acbdc1f 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1715,13 +1715,85 @@ PHP_FUNCTION(mb_str_split) } } +#ifdef __SSE2__ +/* Thanks to StackOverflow user 'Paul R' (https://stackoverflow.com/users/253056/paul-r) + * From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector + * Takes a 128-bit XMM register, treats each byte as an 8-bit integer, and sums up all + * 16 of them, returning the sum in an ordinary scalar register */ +static inline uint32_t _mm_sum_epu8(const __m128i v) +{ + /* We don't have any dedicated instruction to sum up 8-bit values from a 128-bit register + * _mm_sad_epu8 takes the differences between corresponding bytes of two different XMM registers, + * sums up those differences, and stores them as two 16-bit integers in the top and bottom + * halves of the destination XMM register + * By using a zeroed-out XMM register as one operand, we ensure the "differences" which are + * summed up will actually just be the 8-bit values from `v` */ + __m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128()); + /* If _mm_sad_epu8 had stored the sum of those bytes as a single integer, we would just have + * to extract it here; but it stored the sum as two different 16-bit values + * _mm_cvtsi128_si32 extracts one of those values into a scalar register + * _mm_extract_epi16 extracts the other one into another scalar register; then we just add them */ + return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4); +} +#endif + +/* This assumes that `string` is valid UTF-8 + * In UTF-8, the only bytes which do not start a new codepoint are 0x80-0xBF (continuation bytes) + * Interpreted as signed integers, those are all byte values less 
than -64 + * A fast way to get the length of a UTF-8 string is to start with its byte length, + * then subtract off the number of continuation bytes */ +static size_t mb_fast_strlen_utf8(unsigned char *p, size_t len) +{ + unsigned char *e = p + len; + +#ifdef __SSE2__ + if (len >= sizeof(__m128i)) { + const __m128i threshold = _mm_set1_epi8(-64); + const __m128i delta = _mm_set1_epi8(1); + __m128i counter = _mm_set1_epi8(0); /* Vector of 16 continuation-byte counters */ + + int reset_counter = 255; + do { + __m128i operand = _mm_loadu_si128((__m128i*)p); /* Load 16 bytes */ + __m128i lt = _mm_cmplt_epi8(operand, threshold); /* Find all which are continuation bytes */ + counter = _mm_add_epi8(counter, _mm_and_si128(lt, delta)); /* Update the 16 counters */ + + /* The counters can only go up to 255, so every 255 iterations, fold them into `len` + * and reset them to zero */ + if (--reset_counter == 0) { + len -= _mm_sum_epu8(counter); + counter = _mm_set1_epi8(0); + reset_counter = 255; + } + + p += sizeof(__m128i); + } while (p + sizeof(__m128i) <= e); + + len -= _mm_sum_epu8(counter); /* Fold in any remaining non-zero values in the 16 counters */ + } +#endif + + /* Check for continuation bytes in the 0-15 remaining bytes at the end of the string */ + while (p < e) { + signed char c = *p++; + if (c < -64) { + len--; + } + } + + return len; +} + static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding) { unsigned int char_len = encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2 | MBFL_ENCTYPE_WCS4); if (char_len) { return ZSTR_LEN(string) / char_len; + } else if (php_mb_is_no_encoding_utf8(encoding->no_encoding) && GC_FLAGS(string) & IS_STR_VALID_UTF8) { + return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string)); } + uint32_t wchar_buf[128]; unsigned char *in = (unsigned char*)ZSTR_VAL(string); size_t in_len = ZSTR_LEN(string); @@ -1789,14 +1861,7 @@ static unsigned char* offset_to_pointer_utf8(unsigned char *str, 
unsigned char * } static size_t pointer_to_offset_utf8(unsigned char *start, unsigned char *pos) { - size_t result = 0; - while (pos > start) { - unsigned char c = *--pos; - if (c < 0x80 || (c & 0xC0) != 0x80) { - result++; - } - } - return result; + return mb_fast_strlen_utf8(start, pos - start); } static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const mbfl_encoding *enc, ssize_t offset, bool reverse) diff --git a/ext/mbstring/tests/mb_strlen.phpt b/ext/mbstring/tests/mb_strlen.phpt index e32e9e9370580..b3fb28309bcbe 100644 --- a/ext/mbstring/tests/mb_strlen.phpt +++ b/ext/mbstring/tests/mb_strlen.phpt @@ -62,10 +62,26 @@ mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n"); print strlen($jis) . "\n"; echo "== UTF-8 ==\n"; -$utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP'); -print mb_strlen($utf8,'UTF-8') . "\n"; -mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n"); -print strlen($utf8) . "\n"; +$utf8 = mb_convert_encoding($euc_jp, 'UTF-8', 'EUC-JP'); +print mb_strlen($utf8,'UTF-8') . " codepoints\n"; +mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n"); +print strlen($utf8) . " bytes\n"; + +$utf8 = "abcde あいうえお 汉字 ελληνικά"; +$long_utf8 = str_repeat($utf8, 100); +print mb_strlen($utf8, 'UTF-8') . "\n"; +print mb_strlen($long_utf8, 'UTF-8') . "\n"; + +echo "== UTF-8 with performance optimizations ==\n"; +// Optimized mb_strlen can be used on UTF-8 strings after they are checked for validity +mb_check_encoding($utf8); +mb_check_encoding($long_utf8); +print mb_strlen($utf8, 'UTF-8') . "\n"; +print mb_strlen($long_utf8, 'UTF-8') . "\n"; + +$str = str_repeat('Σ', 2048); // 2-byte UTF-8 character +mb_check_encoding($str, 'UTF-8'); +print mb_strlen($str, 'UTF-8') . 
"\n"; // Wrong Parameters echo "== WRONG PARAMETERS ==\n"; @@ -110,7 +126,13 @@ try { 43 90 == UTF-8 == -43 -101 +43 codepoints +101 bytes +23 +2300 +== UTF-8 with performance optimizations == +23 +2300 +2048 == WRONG PARAMETERS == mb_strlen(): Argument #2 ($encoding) must be a valid encoding, "BAD_NAME" given