Skip to content

Commit

Permalink
Use fast conversion filters to implement php_mb_ord
Browse files Browse the repository at this point in the history
Even for single-character strings, the new implementation is about 50%
faster for ASCII, UTF-8, and UTF-16. For long strings, the performance
gain is enormous, because the old code would convert the ENTIRE string
just to pick out the first codepoint.
  • Loading branch information
alexdowad committed Jun 12, 2022
1 parent 9468fa7 commit 880803a
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 21 deletions.
4 changes: 4 additions & 0 deletions ext/mbstring/libmbfl/mbfl/mbfl_encoding.h
Expand Up @@ -143,6 +143,10 @@ typedef struct {
/* Function type for decoding encoded bytes into wchars (32-bit values).
 * `in` / `in_len` are passed by pointer and `out` has room for `out_len` entries;
 * `state` is caller-owned storage passed by pointer.
 * NOTE(review): presumably the return value is the number of wchars written to `out`,
 * and `*in` / `*in_len` are advanced past the consumed input; confirm against an implementation. */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
/* Function type for the opposite direction: takes `in_len` wchars from `in` and
 * writes the result through an mb_convert_buf.
 * NOTE(review): `end` presumably marks the last chunk of input; confirm. */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);

/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
* the buffer must have room for at least this many wchars (to work with all supported text encodings) */
#define MBSTRING_MIN_WCHAR_BUFSIZE 5

static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uint32_t repl_char, unsigned int err_mode)
{
buf->state = buf->errors = 0;
Expand Down
30 changes: 9 additions & 21 deletions ext/mbstring/mbstring.c
Expand Up @@ -3993,29 +3993,17 @@ static inline zend_long php_mb_ord(const char *str, size_t str_len, zend_string
return -2;
}

{
mbfl_wchar_device dev;
mbfl_convert_filter *filter;
zend_long cp;

mbfl_wchar_device_init(&dev);
filter = mbfl_convert_filter_new(enc, &mbfl_encoding_wchar, mbfl_wchar_device_output, 0, &dev);
/* If this assertion fails this means some memory allocation failure which is a bug */
ZEND_ASSERT(filter != NULL);

mbfl_convert_filter_feed_string(filter, (unsigned char*)str, str_len);
mbfl_convert_filter_flush(filter);

if (dev.pos < 1 || filter->num_illegalchar || dev.buffer[0] == MBFL_BAD_INPUT) {
cp = -1;
} else {
cp = dev.buffer[0];
}
/* Some legacy text encodings have a minimum required wchar buffer size;
* the ones which need the most are SJIS-Mac, UTF-7, and UTF7-IMAP */
uint32_t wchar_buf[MBSTRING_MIN_WCHAR_BUFSIZE];
unsigned int state = 0;
size_t out_len = enc->to_wchar((unsigned char**)&str, &str_len, wchar_buf, MBSTRING_MIN_WCHAR_BUFSIZE, &state);
ZEND_ASSERT(out_len <= MBSTRING_MIN_WCHAR_BUFSIZE);

mbfl_convert_filter_delete(filter);
mbfl_wchar_device_clear(&dev);
return cp;
if (!out_len || wchar_buf[0] == MBFL_BAD_INPUT) {
return -1;
}
return wchar_buf[0];
}


Expand Down

0 comments on commit 880803a

Please sign in to comment.