From 950a7db9fec125c666d9485e4db79c364fe4c810 Mon Sep 17 00:00:00 2001
From: Alex Dowad
Date: Mon, 2 May 2022 22:50:20 +0200
Subject: [PATCH] Use fast text conversion filters to implement mb_check_encoding

Benchmarking reveals that this is about 8% slower for UTF-8 strings
which have a bad codepoint at the very beginning of the string.
For good strings, or those where the first bad codepoint is much later
in the string, it is significantly faster (2-3 times faster in many
cases).
---
 ext/mbstring/mbstring.c | 41 ++++++++++++++++++++++-------------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index c9aabfa957bb5..7e51d4555468f 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -3860,31 +3860,34 @@ PHP_FUNCTION(mb_get_info)
 }
 /* }}} */
 
-static int mbfl_filt_check_errors(int c, void* data)
-{
-	if (c == MBFL_BAD_INPUT) {
-		(*((mbfl_convert_filter**)data))->num_illegalchar++;
-	}
-	return 0;
-}
-
 MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
 {
-	mbfl_convert_filter *filter = mbfl_convert_filter_new(encoding, &mbfl_encoding_wchar, mbfl_filt_check_errors, NULL, &filter);
-
-	while (length--) {
-		unsigned char c = *input++;
-		(filter->filter_function)(c, filter);
-		if (filter->num_illegalchar) {
-			mbfl_convert_filter_delete(filter);
+	uint32_t wchar_buf[128];
+	unsigned char *in = (unsigned char*)input;
+	unsigned int state = 0;
+
+	/* If the input string is not encoded in the given encoding, there is a significant chance
+	 * that this will be seen in the first bytes. Therefore, rather than converting an entire
+	 * buffer of 128 codepoints, convert and check just a few codepoints first */
+	size_t out_len = encoding->to_wchar(&in, &length, wchar_buf, 8, &state);
+	ZEND_ASSERT(out_len <= 8);
+	for (int i = 0; i < out_len; i++) {
+		if (wchar_buf[i] == MBFL_BAD_INPUT) {
 			return 0;
 		}
 	}
 
-	(filter->filter_flush)(filter);
-	int result = !filter->num_illegalchar;
-	mbfl_convert_filter_delete(filter);
-	return result;
+	while (length) {
+		out_len = encoding->to_wchar(&in, &length, wchar_buf, 128, &state);
+		ZEND_ASSERT(out_len <= 128);
+		for (int i = 0; i < out_len; i++) {
+			if (wchar_buf[i] == MBFL_BAD_INPUT) {
+				return 0;
+			}
+		}
+	}
+
+	return 1;
 }
 
 static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)