diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index 8772d8d5be4a3..190e9608d4806 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -34,6 +34,60 @@ #include "mbfilter.h" #include "mbfilter_utf16.h" +/* Macros to create char length table: Bk(n) expands to k comma-separated copies of n */ +#define B2(n) n,n +#define B4(n) B2(n),B2(n) +#define B8(n) B4(n),B4(n) +#define B16(n) B8(n),B8(n) +#define B32(n) B16(n),B16(n) +#define B64(n) B32(n),B32(n) +#define B128(n) B64(n),B64(n) +#define B256(n) B128(n),B128(n) +#define B512(n) B256(n),B256(n) +#define B1024(n) B512(n),B512(n) +#define B2048(n) B1024(n),B1024(n) +#define B4096(n) B2048(n),B2048(n) +#define B8192(n) B4096(n),B4096(n) +#define B16384(n) B8192(n),B8192(n) + +/* UTF-16 character length table, indexed by the 16-bit code unit value; each entry is a length in bytes (4 for a high surrogate, 2 for everything else) */ +const char unsigned mblen_table_utf16_le[65536] = { + B16384(2), + B16384(2), + B16384(2), + B4096(2), + B2048(2), + B1024(4), /* surrogate pairs: 0xD800-0xDFFF. 
High surrogate first: 0xD800, last: 0xDBFF */ + B1024(2), /* Low surrogate first: 0xDC00, last: 0xDFFF */ + B8192(2), +}; + +/* macro to make swapped length table: 256 entries selected by the low byte of the index, 4 for 0xD8-0xDB and 2 otherwise */ +#define BY B128(2),B64(2),B16(2),B8(2),B4(4),B4(2),B32(2) + +/* swapped bytes table: BY repeated 256 times, for code units whose two bytes are read in swapped order */ +const char unsigned mblen_table_utf16_be[65536] = { + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, + BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY,BY, +}; + static const char *mbfl_encoding_utf16_aliases[] = {"utf16", NULL}; const mbfl_encoding mbfl_encoding_utf16 = { @@ -41,7 +95,7 @@ const mbfl_encoding mbfl_encoding_utf16 = { "UTF-16", "UTF-16", (const char *(*)[])&mbfl_encoding_utf16_aliases, - NULL, + mblen_table_utf16_be, MBFL_ENCTYPE_MWC2BE, &vtbl_utf16_wchar, &vtbl_wchar_utf16 @@ -52,7 +106,7 @@ const mbfl_encoding mbfl_encoding_utf16be = { "UTF-16BE", "UTF-16BE", NULL, - NULL, + mblen_table_utf16_be, MBFL_ENCTYPE_MWC2BE, &vtbl_utf16be_wchar, &vtbl_wchar_utf16be @@ -63,7 +117,7 @@ const mbfl_encoding mbfl_encoding_utf16le = { "UTF-16LE", "UTF-16LE", NULL, - NULL, + mblen_table_utf16_le, MBFL_ENCTYPE_MWC2LE, &vtbl_utf16le_wchar, &vtbl_wchar_utf16le diff --git a/ext/mbstring/libmbfl/mbfl/brg_endian.h b/ext/mbstring/libmbfl/mbfl/brg_endian.h new file mode 100644 index 0000000000000..7226eb3bec51a --- 
/dev/null +++ b/ext/mbstring/libmbfl/mbfl/brg_endian.h @@ -0,0 +1,142 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The redistribution and use of this software (with or without changes) + is allowed without the payment of fees or royalties provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 20/12/2007 + Changes for ARM 9/9/2010 +*/ + +#ifndef _BRG_ENDIAN_H +#define _BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +#if 0 +/* Include files where endian defines and byteswap functions may reside (group disabled by the enclosing #if 0) */ +#if defined( __sun ) +# include <sys/isa_defs.h> +#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) +# include <sys/endian.h> +#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ + defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) +# include <machine/endian.h> +#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) +# if !defined( __MINGW32__ ) && !defined( _AIX ) +# include <endian.h> +# if !defined( __BEOS__ ) +# include <byteswap.h> +# endif +# endif +#endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms 
SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) +# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) +# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( _BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( _LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) +# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) +# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif defined( __BIG_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined( __LITTLE_ENDIAN__ ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this 
define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + +#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ + defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ + defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ + defined( vax ) || defined( vms ) || defined( VMS ) || \ + defined( __VMS ) || defined( _M_X64 ) +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + +#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ + defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ + defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ + defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ + defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ + defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ + defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + +#elif defined(__arm__) +# ifdef __BIG_ENDIAN +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +# else +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +# endif +#elif 1 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 /* **** EDIT HERE IF NECESSARY **** */ +# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#else +# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order +#endif + +#endif + +#endif diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 331ce4941c818..a4cef23ea10b8 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -1,7 +1,3 @@ -/* - * charset=UTF-8 - */ - /* * "streamable kanji code filter and converter" * @@ -86,7 +82,9 @@ #include #include +#include +#include "brg_endian.h" #include "mbfilter.h" #include "mbfl_filter_output.h" #include "mbfilter_8bit.h" @@ -163,8 +161,7 @@ mbfl_buffer_converter_new( } -void 
-mbfl_buffer_converter_delete(mbfl_buffer_converter *convd) +void mbfl_buffer_converter_delete(mbfl_buffer_converter *convd) { if (convd != NULL) { if (convd->filter1) { @@ -178,14 +175,12 @@ mbfl_buffer_converter_delete(mbfl_buffer_converter *convd) } } -void -mbfl_buffer_converter_reset(mbfl_buffer_converter *convd) +void mbfl_buffer_converter_reset(mbfl_buffer_converter *convd) { mbfl_memory_device_reset(&convd->device); } -int -mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode) +int mbfl_buffer_converter_illegal_mode(mbfl_buffer_converter *convd, int mode) { if (convd != NULL) { if (convd->filter2 != NULL) { @@ -280,8 +275,7 @@ mbfl_buffer_converter_feed2(mbfl_buffer_converter *convd, mbfl_string *string, s } -int -mbfl_buffer_converter_flush(mbfl_buffer_converter *convd) +int mbfl_buffer_converter_flush(mbfl_buffer_converter *convd) { if (convd == NULL) { return -1; @@ -404,8 +398,7 @@ mbfl_encoding_detector_new(const mbfl_encoding **elist, int elistsz, int strict) } -void -mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) +void mbfl_encoding_detector_delete(mbfl_encoding_detector *identd) { int i; @@ -493,8 +486,7 @@ const mbfl_encoding *mbfl_encoding_detector_judge(mbfl_encoding_detector *identd /* * encoding converter */ -mbfl_string * -mbfl_convert_encoding( +mbfl_string *mbfl_convert_encoding( mbfl_string *string, mbfl_string *result, const mbfl_encoding *toenc) @@ -648,39 +640,60 @@ mbfl_identify_encoding(mbfl_string *string, const mbfl_encoding **elist, int eli /* * strlen */ -static int -filter_count_output(int c, void *data) +static int filter_count_output(int c, void *data) { (*(size_t *)data)++; return c; } -size_t -mbfl_strlen(mbfl_string *string) +size_t mbfl_strlen(mbfl_string *string) { size_t len, n, k; unsigned char *p; - const mbfl_encoding *encoding = string->encoding; + const mbfl_encoding *mbfl_encoding = string->encoding; len = 0; - if (encoding->flag & MBFL_ENCTYPE_SBCS) { + if (mbfl_encoding->flag & 
MBFL_ENCTYPE_SBCS) { len = string->len; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { len = string->len/2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { len = string->len/4; - } else if (encoding->mblen_table != NULL) { - const unsigned char *mbtab = encoding->mblen_table; + } else if (mbfl_encoding->mblen_table != NULL) { + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + n = 0; p = string->val; k = string->len; + /* count */ if (p != NULL) { - while (n < k) { - unsigned m = mbtab[*p]; - n += m; - p += m; - len++; + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + while (n < k) { + unsigned m = mbtab[*(uint16_t *)p]; + n += m; + p += m; + len++; + } + } else { + while (n < k) { + unsigned m = mbtab[*p]; + n += m; + p += m; + len++; + } } } } else { @@ -722,8 +735,7 @@ struct collector_strpos_data { size_t matched_pos; }; -static int -collector_strpos(int c, void* data) +static int collector_strpos(int c, void* data) { int *p, *h, *m; ssize_t n; @@ -774,33 +786,55 @@ collector_strpos(int c, void* data) /* * oddlen */ -size_t -mbfl_oddlen(mbfl_string *string) +size_t mbfl_oddlen(mbfl_string *string) { size_t len, n, k; unsigned char *p; - const mbfl_encoding *encoding = string->encoding; + const mbfl_encoding *mbfl_encoding = string->encoding; len = 0; - if (encoding->flag & MBFL_ENCTYPE_SBCS) { + if 
(mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { return 0; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { return len % 2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { return len % 4; - } else if (encoding->mblen_table != NULL) { - const unsigned char *mbtab = encoding->mblen_table; + } else if (mbfl_encoding->mblen_table != NULL) { + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + n = 0; p = string->val; k = string->len; + /* count */ if (p != NULL) { - while (n < k) { - unsigned m = mbtab[*p]; - n += m; - p += m; - }; + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + while (n < k) { + unsigned char m = mbtab[*(uint16_t *)p]; + n += m; + p += m; + } + } else { + while (n < k) { + unsigned char m = mbtab[*p]; + n += m; + p += m; + } + } } + return n-k; } else { /* how can i do ? 
*/ @@ -809,8 +843,7 @@ mbfl_oddlen(mbfl_string *string) /* NOT REACHED */ } -size_t -mbfl_strpos( +size_t mbfl_strpos( mbfl_string *haystack, mbfl_string *needle, ssize_t offset, @@ -1019,12 +1052,7 @@ mbfl_strpos( * substr_count */ -size_t -mbfl_substr_count( - mbfl_string *haystack, - mbfl_string *needle - ) -{ +size_t mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle) { size_t n, result = 0; unsigned char *p; mbfl_convert_filter *filter; @@ -1102,8 +1130,7 @@ struct collector_substr_data { size_t output; }; -static int -collector_substr(int c, void* data) +static int collector_substr(int c, void* data) { struct collector_substr_data *pc = (struct collector_substr_data*)data; @@ -1120,14 +1147,8 @@ collector_substr(int c, void* data) return c; } -mbfl_string * -mbfl_substr( - mbfl_string *string, - mbfl_string *result, - size_t from, - size_t length) -{ - const mbfl_encoding *encoding = string->encoding; +mbfl_string *mbfl_substr(mbfl_string *string, mbfl_string *result, size_t from, size_t length) { + const mbfl_encoding *mbfl_encoding = string->encoding; size_t n, k, len, start, end; unsigned m; unsigned char *p, *w; @@ -1136,58 +1157,110 @@ mbfl_substr( result->no_language = string->no_language; result->encoding = string->encoding; - if ((encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) || - encoding->mblen_table != NULL) { + if ((mbfl_encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) || + mbfl_encoding->mblen_table != NULL) { len = string->len; - if (encoding->flag & MBFL_ENCTYPE_SBCS) { + if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { start = from; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { start = from*2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + 
} else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { start = from*4; } else { - const unsigned char *mbtab = encoding->mblen_table; + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + start = 0; n = 0; k = 0; p = string->val; /* search start position */ - while (k <= from) { - start = n; - if (n >= len) { - break; + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + while (k <= from) { + start = n; + if (n >= len) { + break; + } + m = mbtab[*(uint16_t *)p]; + n += m; + p += m; + k++; + } + } else { + while (k <= from) { + start = n; + if (n >= len) { + break; + } + m = mbtab[*p]; + n += m; + p += m; + k++; } - m = mbtab[*p]; - n += m; - p += m; - k++; } } if (length == MBFL_SUBSTR_UNTIL_END) { end = len; - } else if (encoding->flag & MBFL_ENCTYPE_SBCS) { + } else if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { end = start + length; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { end = start + length*2; - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { end = start + length*4; } else { - const unsigned char *mbtab = encoding->mblen_table; + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if 
(mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + end = start; n = start; k = 0; p = string->val + start; /* detect end position */ - while (k <= length) { - end = n; - if (n >= len) { - break; + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + while (k <= length) { + end = n; + if (n >= len) { + break; + } + m = mbtab[*(uint16_t *)p]; + n += m; + p += m; + k++; + } + } else { + while (k <= length) { + end = n; + if (n >= len) { + break; + } + m = mbtab[*p]; + n += m; + p += m; + k++; } - m = mbtab[*p]; - n += m; - p += m; - k++; } } @@ -1271,14 +1344,13 @@ mbfl_substr( /* * strcut */ -mbfl_string * -mbfl_strcut( +mbfl_string *mbfl_strcut( mbfl_string *string, mbfl_string *result, size_t from, size_t length) { - const mbfl_encoding *encoding = string->encoding; + const mbfl_encoding *mbfl_encoding = string->encoding; mbfl_memory_device device; if (from >= string->len) { @@ -1289,18 +1361,18 @@ mbfl_strcut( result->no_language = string->no_language; result->encoding = string->encoding; - if ((encoding->flag & (MBFL_ENCTYPE_SBCS + if ((mbfl_encoding->flag & (MBFL_ENCTYPE_SBCS | MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE | MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) - || encoding->mblen_table != NULL) { + || mbfl_encoding->mblen_table != NULL) { const unsigned char *start = NULL; const unsigned char *end = NULL; unsigned char *w; size_t sz; - if (encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { + if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { from &= -2; if (length >= string->len - from) { @@ -1309,7 +1381,7 @@ mbfl_strcut( start = string->val + from; end = start + (length & -2); - } else if (encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { from &= 
-4; if (length >= string->len - from) { @@ -1318,21 +1390,37 @@ mbfl_strcut( start = string->val + from; end = start + (length & -4); - } else if ((encoding->flag & MBFL_ENCTYPE_SBCS)) { + } else if ((mbfl_encoding->flag & MBFL_ENCTYPE_SBCS)) { if (length >= string->len - from) { length = string->len - from; } start = string->val + from; end = start + length; - } else if (encoding->mblen_table != NULL) { - const unsigned char *mbtab = encoding->mblen_table; + } else if (mbfl_encoding->mblen_table != NULL) { + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + const unsigned char *p, *q; int m; /* search start position */ - for (m = 0, p = string->val, q = p + from; - p < q; p += (m = mbtab[*p])); + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + for (m = 0, p = string->val, q = p + from; p < q; p += (m = mbtab[*(uint16_t *)p])); + } else { + for (m = 0, p = string->val, q = p + from; p < q; p += (m = mbtab[*p])); + } if (p > q) { p -= m; @@ -1344,7 +1432,11 @@ mbfl_strcut( if (length >= string->len - (start - string->val)) { end = string->val + string->len; } else { - for (q = p + length; p < q; p += (m = mbtab[*p])); + if (mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + for (q = p + length; p < q; p += (m = mbtab[*(uint16_t *)p])); + } else { + for (q = p + length; p < q; p += (m = mbtab[*p])); + } if (p > q) { p -= m; @@ -1569,15 +1661,13 @@ static size_t is_fullwidth(int c) return 0; } -static int -filter_count_width(int c, void* data) +static int filter_count_width(int c, void* data) { (*(size_t *)data) 
+= (is_fullwidth(c) ? 2: 1); return c; } -size_t -mbfl_strwidth(mbfl_string *string) +size_t mbfl_strwidth(mbfl_string *string) { size_t len, n; unsigned char *p; @@ -1626,8 +1716,7 @@ struct collector_strimwidth_data { int status; }; -static int -collector_strimwidth(int c, void* data) +static int collector_strimwidth(int c, void* data) { struct collector_strimwidth_data *pc = (struct collector_strimwidth_data*)data; @@ -1658,8 +1747,7 @@ collector_strimwidth(int c, void* data) return c; } -mbfl_string * -mbfl_strimwidth( +mbfl_string *mbfl_strimwidth( mbfl_string *string, mbfl_string *marker, mbfl_string *result, @@ -1759,8 +1847,7 @@ mbfl_strimwidth( return result; } -mbfl_string * -mbfl_ja_jp_hantozen( +mbfl_string *mbfl_ja_jp_hantozen( mbfl_string *string, mbfl_string *result, int mode) @@ -1877,8 +1964,7 @@ struct mime_header_encoder_data { char lwsp[16]; }; -static int -mime_header_encoder_block_collector(int c, void *data) +static int mime_header_encoder_block_collector(int c, void *data) { size_t n; struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data; @@ -1919,9 +2005,7 @@ mime_header_encoder_block_collector(int c, void *data) return c; } -static int -mime_header_encoder_collector(int c, void *data) -{ +static int mime_header_encoder_collector(int c, void *data){ static int qp_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */ @@ -2015,8 +2099,7 @@ mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *res return mbfl_memory_device_result(&pe->outdev, result); } -struct mime_header_encoder_data* -mime_header_encoder_new( +struct mime_header_encoder_data *mime_header_encoder_new( const mbfl_encoding *incode, const mbfl_encoding *outcode, const mbfl_encoding *transenc) @@ -2103,8 +2186,7 @@ mime_header_encoder_new( return pe; } -void -mime_header_encoder_delete(struct mime_header_encoder_data *pe) +void 
mime_header_encoder_delete(struct mime_header_encoder_data *pe) { if (pe) { mbfl_convert_filter_delete(pe->conv1_filter); @@ -2119,14 +2201,12 @@ mime_header_encoder_delete(struct mime_header_encoder_data *pe) } } -int -mime_header_encoder_feed(int c, struct mime_header_encoder_data *pe) +int mime_header_encoder_feed(int c, struct mime_header_encoder_data *pe) { return (*pe->conv1_filter->filter_function)(c, pe->conv1_filter); } -mbfl_string * -mbfl_mime_header_encode( +mbfl_string *mbfl_mime_header_encode( mbfl_string *string, mbfl_string *result, const mbfl_encoding *outcode, @@ -2190,8 +2270,7 @@ struct mime_header_decoder_data { const mbfl_encoding *outcode; }; -static int -mime_header_decoder_collector(int c, void* data) +static int mime_header_decoder_collector(int c, void* data) { const mbfl_encoding *encoding; struct mime_header_decoder_data *pd = (struct mime_header_decoder_data*)data; @@ -2406,8 +2485,7 @@ mime_header_decoder_new(const mbfl_encoding *outcode) return pd; } -void -mime_header_decoder_delete(struct mime_header_decoder_data *pd) +void mime_header_decoder_delete(struct mime_header_decoder_data *pd) { if (pd) { mbfl_convert_filter_delete(pd->conv2_filter); @@ -2419,14 +2497,12 @@ mime_header_decoder_delete(struct mime_header_decoder_data *pd) } } -int -mime_header_decoder_feed(int c, struct mime_header_decoder_data *pd) +int mime_header_decoder_feed(int c, struct mime_header_decoder_data *pd) { return mime_header_decoder_collector(c, pd); } -mbfl_string * -mbfl_mime_header_decode( +mbfl_string *mbfl_mime_header_decode( mbfl_string *string, mbfl_string *result, const mbfl_encoding *outcode) @@ -2472,8 +2548,7 @@ struct collector_htmlnumericentity_data { int mapsize; }; -static int -collector_encode_htmlnumericentity(int c, void *data) +static int collector_encode_htmlnumericentity(int c, void *data) { struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data; int f, n, s, r, d, size, *mapelm; @@ -2518,8 
+2593,7 @@ collector_encode_htmlnumericentity(int c, void *data) return c; } -static int -collector_decode_htmlnumericentity(int c, void *data) +static int collector_decode_htmlnumericentity(int c, void *data) { struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data; int f, n, s, r, d, size, *mapelm; @@ -2696,8 +2770,7 @@ collector_decode_htmlnumericentity(int c, void *data) return c; } -static int -collector_encode_hex_htmlnumericentity(int c, void *data) +static int collector_encode_hex_htmlnumericentity(int c, void *data) { struct collector_htmlnumericentity_data *pc = (struct collector_htmlnumericentity_data *)data; int f, n, s, r, d, size, *mapelm; @@ -2817,8 +2890,7 @@ int mbfl_filt_decode_htmlnumericentity_flush(mbfl_convert_filter *filter) } -mbfl_string * -mbfl_html_numeric_entity( +mbfl_string *mbfl_html_numeric_entity( mbfl_string *string, mbfl_string *result, int *convmap, diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.h b/ext/mbstring/libmbfl/mbfl/mbfilter.h index a6aed129fabba..b633af40f35e4 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.h +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.h @@ -225,26 +225,22 @@ mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle); /* * substr */ -MBFLAPI extern mbfl_string * -mbfl_substr(mbfl_string *string, mbfl_string *result, size_t from, size_t length); +MBFLAPI extern mbfl_string *mbfl_substr(mbfl_string *string, mbfl_string *result, size_t from, size_t length); /* * strcut */ -MBFLAPI extern mbfl_string * -mbfl_strcut(mbfl_string *string, mbfl_string *result, size_t from, size_t length); +MBFLAPI extern mbfl_string *mbfl_strcut(mbfl_string *string, mbfl_string *result, size_t from, size_t length); /* * strwidth */ -MBFLAPI extern size_t -mbfl_strwidth(mbfl_string *string); +MBFLAPI extern size_t mbfl_strwidth(mbfl_string *string); /* * strimwidth */ -MBFLAPI extern mbfl_string * -mbfl_strimwidth(mbfl_string *string, mbfl_string *marker, mbfl_string *result, size_t from, 
size_t width); +MBFLAPI extern mbfl_string *mbfl_strimwidth(mbfl_string *string, mbfl_string *marker, mbfl_string *result, size_t from, size_t width); /* * MIME header encode diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c index e13838f4ca212..b31a57534cc26 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_convert.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_convert.c @@ -34,6 +34,7 @@ #include +#include "brg_endian.h" #include "mbfl_encoding.h" #include "mbfl_allocators.h" #include "mbfl_filter_output.h" @@ -220,18 +221,14 @@ mbfl_convert_filter_new2( return filter; } -void -mbfl_convert_filter_delete(mbfl_convert_filter *filter) -{ +void mbfl_convert_filter_delete(mbfl_convert_filter *filter) { if (filter) { (*filter->filter_dtor)(filter); mbfl_free((void*)filter); } } -int -mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter) -{ +int mbfl_convert_filter_feed(int c, mbfl_convert_filter *filter) { return (*filter->filter_function)(c, filter); } @@ -246,16 +243,13 @@ mbfl_convert_filter_feed_string(mbfl_convert_filter *filter, const unsigned char return 0; } -int -mbfl_convert_filter_flush(mbfl_convert_filter *filter) -{ +int mbfl_convert_filter_flush(mbfl_convert_filter *filter) { (*filter->filter_flush)(filter); return (filter->flush_function ? 
(*filter->flush_function)(filter->data) : 0); } void mbfl_convert_filter_reset(mbfl_convert_filter *filter, - const mbfl_encoding *from, const mbfl_encoding *to) -{ + const mbfl_encoding *from, const mbfl_encoding *to) { const struct mbfl_convert_vtbl *vtbl; /* destruct old filter */ diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c index be861629d0e13..ace3549bd20a3 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.c @@ -205,8 +205,7 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = { }; /* encoding resolver */ -const mbfl_encoding * -mbfl_name2encoding(const char *name) +const mbfl_encoding *mbfl_name2encoding(const char *name) { const mbfl_encoding *encoding; int i, j; @@ -249,8 +248,7 @@ mbfl_name2encoding(const char *name) return NULL; } -const mbfl_encoding * -mbfl_no2encoding(enum mbfl_no_encoding no_encoding) +const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding) { const mbfl_encoding *encoding; int i; @@ -265,8 +263,7 @@ mbfl_no2encoding(enum mbfl_no_encoding no_encoding) return NULL; } -enum mbfl_no_encoding -mbfl_name2no_encoding(const char *name) +enum mbfl_no_encoding mbfl_name2no_encoding(const char *name) { const mbfl_encoding *encoding; @@ -278,9 +275,7 @@ mbfl_name2no_encoding(const char *name) } } -const char * -mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding) -{ +const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding) { const mbfl_encoding *encoding; encoding = mbfl_no2encoding(no_encoding); @@ -291,15 +286,11 @@ mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding) } } -const mbfl_encoding ** -mbfl_get_supported_encodings(void) -{ +const mbfl_encoding **mbfl_get_supported_encodings(void) { return mbfl_encoding_ptr_list; } -const char * -mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding) -{ +const char *mbfl_no2preferred_mime_name(enum mbfl_no_encoding no_encoding) { const mbfl_encoding 
*encoding; encoding = mbfl_no2encoding(no_encoding); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index cdca12b00ee0d..f040af299b9ec 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -35,6 +35,7 @@ #include "main/php_output.h" #include "ext/standard/info.h" +#include "libmbfl/mbfl/brg_endian.h" #include "libmbfl/mbfl/mbfl_allocators.h" #include "libmbfl/mbfl/mbfilter_8bit.h" #include "libmbfl/mbfl/mbfilter_pass.h" @@ -63,7 +64,7 @@ #if HAVE_ONIG #include "php_onig_compat.h" -#include +#include "oniguruma/oniguruma.h" #undef UChar #elif HAVE_PCRE || HAVE_BUNDLED_PCRE #include "ext/pcre/php_pcre.h" @@ -232,6 +233,12 @@ ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_output_handler, 0, 0, 2) ZEND_ARG_INFO(0, status) ZEND_END_ARG_INFO() +ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_str_split, 0, 0, 1) + ZEND_ARG_INFO(0, str) + ZEND_ARG_INFO(0, split_length) + ZEND_ARG_INFO(0, encoding) +ZEND_END_ARG_INFO() + ZEND_BEGIN_ARG_INFO_EX(arginfo_mb_strlen, 0, 0, 1) ZEND_ARG_INFO(0, str) ZEND_ARG_INFO(0, encoding) @@ -529,6 +536,7 @@ static const zend_function_entry mbstring_functions[] = { PHP_FE(mb_parse_str, arginfo_mb_parse_str) PHP_FE(mb_output_handler, arginfo_mb_output_handler) PHP_FE(mb_preferred_mime_name, arginfo_mb_preferred_mime_name) + PHP_FE(mb_str_split, arginfo_mb_str_split) PHP_FE(mb_strlen, arginfo_mb_strlen) PHP_FE(mb_strpos, arginfo_mb_strpos) PHP_FE(mb_strrpos, arginfo_mb_strrpos) @@ -2282,6 +2290,217 @@ PHP_FUNCTION(mb_output_handler) } /* }}} */ +/* {{{ proto array mb_str_split(string str [, int split_length] [, string encoding]) + Convert a multibyte string to an array. If split_length is specified, + break the string down into chunks each split_length characters long. 
*/ + +/* structure to pass split params to the callback */ +struct mbfl_split_params { + zval *return_value; /* php function return value structure pointer */ + mbfl_string *result_string; /* string to store result chunk */ + size_t mb_chunk_size; /* chunk size in chars */ + size_t split_length; /* defined chunk size in chars */ + mbfl_convert_filter *next_filter; /* widechar to encoding filter */ +}; + +/* callback function to fill split array */ +static int mbfl_split_output(int c, void *data) +{ + struct mbfl_split_params *params = (struct mbfl_split_params *)data; /* cast passed data */ + + (*params->next_filter->filter_function)(c, params->next_filter); /* decoder filter */ + + size_t mb_chunk_size = ++params->mb_chunk_size; /* increment chunk size */ + if(params->split_length == mb_chunk_size) { /* if current chunk size reached defined chunk size */ + mbfl_convert_filter_flush(params->next_filter);/* concatenate separate decoded chars to the solid string */ + mbfl_memory_device *device = (mbfl_memory_device *)params->next_filter->data; /* chars container */ + mbfl_string *chunk = params->result_string; + mbfl_memory_device_result(device, chunk); /* make chunk */ + add_next_index_stringl(params->return_value, chunk->val, chunk->len); /* add chunk to the array */ + efree(chunk->val); + params->mb_chunk_size = 0; /* reset mb_chunk size */ + } + return 0; +} + +PHP_FUNCTION(mb_str_split) +{ + zend_string *str, *encoding = NULL; + size_t mb_len, chunks, chunk_len; + char unsigned const *last, *p; + mbfl_string string, result_string; + zend_long split_length = 1; + + ZEND_PARSE_PARAMETERS_START(1, 3) + Z_PARAM_STR(str) + Z_PARAM_OPTIONAL + Z_PARAM_LONG(split_length) + Z_PARAM_STR(encoding) + ZEND_PARSE_PARAMETERS_END(); + + if (split_length <= 0) { + php_error_docref(NULL, E_WARNING, "The length of each segment must be greater than zero"); + RETURN_FALSE; + } + + /* fill mbfl_string structure */ + string.no_language = MBSTRG(language); + string.encoding = 
php_mb_get_encoding(encoding); + if (!string.encoding) { + RETURN_FALSE; + } + + p = (char unsigned *)ZSTR_VAL(str); /* string cursor pointer */ + string.val = (char unsigned *)p; + string.len = ZSTR_LEN(str); + last = string.val + string.len; + + + /* + * +----------------------------------------------------------------------+ + * | 3 scenarios | + * +----------------------------------------------------------------------+ + */ + const mbfl_encoding *mbfl_encoding = string.encoding; + + /* + * +----------------------------------------------------------------------+ + * | first scenario: "fixed width encodings" | + * +----------------------------------------------------------------------+ + */ + if (mbfl_encoding->flag & MBFL_ENCTYPE_SBCS) { /* 1 byte */ + mb_len = string.len; + chunk_len = (size_t)split_length; /* chunk length in bytes */ + + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE )) { /* 2 bytes */ + mb_len = string.len>>1; /* eq. string.len / 2 String length in chars */ + chunk_len = split_length<<1; /* eq. split_length * 2 chunk length in bytes */ + + } else if (mbfl_encoding->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE )) { /* 4 bytes */ + mb_len = string.len>>2; /* eq. string.len / 4 */ + chunk_len = split_length<<2; /* eq. 
split_length * 4 */ + /* + * +----------------------------------------------------------------------+ + * | second scenario: "variable width encodings with length table" | + * +----------------------------------------------------------------------+ + */ + } else if (mbfl_encoding->mblen_table != NULL) { + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (mbfl_encoding->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = mbfl_encoding->mblen_table; + } + } else { + mbtab = mbfl_encoding->mblen_table; + } + + /* assume that we have 2-bytes characters */ + array_init_size(return_value, (string.len>>1 + split_length) / split_length); + + if(mbfl_encoding->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + while (p < last) { /* split cycle work until the cursor has reached the last byte */ + char const *chunk_p = p; /* chunk first byte pointer */ + chunk_len = 0; /* chunk length in bytes */ + for (uint32_t char_count = 0; char_count < split_length; ++char_count) { + char unsigned const m = mbtab[*(uint16_t *)p]; /* single character length table */ + chunk_len += m; + p += m; + } + if (p > last) chunk_len -= p - last; /* check if chunk is in bounds */ + add_next_index_stringl(return_value, chunk_p, chunk_len); + } + } else { + while (p < last) { /* split cycle work until the cursor has reached the last byte */ + char const *chunk_p = p; /* chunk first byte pointer */ + chunk_len = 0; /* chunk length in bytes */ + for (uint32_t char_count = 0; char_count < split_length; ++char_count) { + char unsigned const m = mbtab[*p]; /* single character length table */ + chunk_len += m; + p += m; + } + if (p > last) chunk_len -= p - last; /* check if chunk is in bounds */ + add_next_index_stringl(return_value, chunk_p, 
chunk_len); + } + } + return; + + /* + * +----------------------------------------------------------------------+ + * | third scenario: "else multibyte encodings" | + * +----------------------------------------------------------------------+ + */ + } else { + mbfl_convert_filter *filter, *decoder; + + /* assume that we have 2-bytes characters */ + array_init_size(return_value, (string.len>>1 + split_length) / split_length); + p = (char const *)string.val; /* reset string cursor position */ + + /* decoder filter to decode wchar to encoding */ + mbfl_memory_device device; + mbfl_memory_device_init(&device, split_length + 1, 0); + + decoder = mbfl_convert_filter_new( + &mbfl_encoding_wchar, + string.encoding, + mbfl_memory_device_output, + 0, + &device); + if (decoder == NULL){ + mbfl_convert_filter_delete(decoder); + RETURN_FALSE; /* something wrong with the filter */ + } + + /* wchar filter */ + mbfl_string_init(&result_string); /* mbfl_string to store chunk in the callback */ + struct mbfl_split_params params = { /* init callback function params structure */ + .return_value = return_value, + .result_string = &result_string, + .mb_chunk_size = 0, + .split_length = (size_t)split_length, + .next_filter = decoder, + }; + + filter = mbfl_convert_filter_new( + string.encoding, + &mbfl_encoding_wchar, + mbfl_split_output, + 0, + ¶ms); + if (filter == NULL){ + mbfl_convert_filter_delete(decoder); + mbfl_convert_filter_delete(filter); + RETURN_FALSE; /* something wrong with the filter */ + } + + while (p < last) { /* cycle each byte with callback function */ + (*filter->filter_function)(*p++, filter); + } + + mbfl_convert_filter_delete(decoder); + mbfl_convert_filter_delete(filter); + return; + } + + + /* 1,2,4-bytes fixed width encodings tail part */ + chunks = (mb_len + split_length - 1) / split_length; // (round up idiom) + array_init_size(return_value, chunks); + if (chunks != 0) { + for (size_t i = 0; i < chunks - 1; p += chunk_len, ++i) { + 
add_next_index_stringl(return_value, p, chunk_len); + } + add_next_index_stringl(return_value, p, last - p); + } + +} +/* }}} */ + /* {{{ proto int mb_strlen(string str [, string encoding]) Get character numbers of a string */ PHP_FUNCTION(mb_strlen) @@ -5030,11 +5249,28 @@ static int php_mb_encoding_translation(void) /* {{{ MBSTRING_API size_t php_mb_mbchar_bytes_ex() */ MBSTRING_API size_t php_mb_mbchar_bytes_ex(const char *s, const mbfl_encoding *enc) { - if (enc != NULL) { - if (enc->flag & MBFL_ENCTYPE_MBCS) { - if (enc->mblen_table != NULL) { - if (s != NULL) return enc->mblen_table[*(unsigned char *)s]; + if (enc != NULL && s != NULL) { + if (enc->mblen_table != NULL) { + char unsigned const *mbtab; + /*swap UTF-16LE and UTF-16BE tables on big-endian platform */ + if(PLATFORM_BYTE_ORDER == IS_BIG_ENDIAN) { + if (enc->flag == MBFL_ENCTYPE_MWC2BE) { + mbtab = mbfl_name2encoding("UTF-16LE")->mblen_table; + } else if (enc->flag == MBFL_ENCTYPE_MWC2LE) { + mbtab = mbfl_name2encoding("UTF-16BE")->mblen_table; + } else { + mbtab = enc->mblen_table; + } + } else { + mbtab = enc->mblen_table; } + /* UTF-16LE or UTF-16BE */ + if(enc->flag & (MBFL_ENCTYPE_MWC2BE | MBFL_ENCTYPE_MWC2LE)) { + return mbtab[*(uint16_t *)s]; + } else{ + return mbtab[*(unsigned char *)s]; + } + } else if (enc->flag & (MBFL_ENCTYPE_WCS2BE | MBFL_ENCTYPE_WCS2LE)) { return 2; } else if (enc->flag & (MBFL_ENCTYPE_WCS4BE | MBFL_ENCTYPE_WCS4LE)) { diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h index d6fdea9ff0248..7077b01ce1fd0 100644 --- a/ext/mbstring/mbstring.h +++ b/ext/mbstring/mbstring.h @@ -78,6 +78,7 @@ PHP_FUNCTION(mb_substitute_character); PHP_FUNCTION(mb_preferred_mime_name); PHP_FUNCTION(mb_parse_str); PHP_FUNCTION(mb_output_handler); +PHP_FUNCTION(mb_str_split); PHP_FUNCTION(mb_strlen); PHP_FUNCTION(mb_strpos); PHP_FUNCTION(mb_strrpos); diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt new file mode 100644 index 
0000000000000..e8d7cffe0acc9 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_jp.phpt @@ -0,0 +1,73 @@ +--TEST-- +mb_str_split() tests for the japanese language +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +BIG-5: a4e9 a5bb +EUC-JP: c6fc cbdc +ISO-2022-JP: 1b2442467c1b2842 1b24424b5c1b2842 +SJIS: 93fa 967b +UTF-16BE: 65e5 672c +UTF-16LE: e565 2c67 +UTF-32BE: 000065e5 0000672c +UTF-32LE: e5650000 2c670000 +UTF-8: e697a5 e69cac diff --git a/ext/mbstring/tests/mb_str_split_ru.phpt b/ext/mbstring/tests/mb_str_split_ru.phpt new file mode 100644 index 0000000000000..3c54b89464511 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_ru.phpt @@ -0,0 +1,72 @@ +--TEST-- +mb_str_split() tests for the russian language +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +EUC-JP: a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 a7e2 a7d1 a7db 20 +CP866: e0 a0 a9 20 e0 a0 a9 20 e0 a0 a9 20 +KOI8-R: d2 c1 ca 20 d2 c1 ca 20 d2 c1 ca 20 +UTF-16BE: 0440 0430 0439 0020 0440 0430 0439 0020 0440 0430 0439 0020 +UTF-16LE: 4004 3004 3904 2000 4004 3004 3904 2000 4004 3004 3904 2000 +UTF-32BE: 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 00000440 00000430 00000439 00000020 +UTF-32LE: 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 40040000 30040000 39040000 20000000 +UTF-8: d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 d180 d0b0 d0b9 20 diff --git a/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt new file mode 100644 index 0000000000000..47b79368530c4 --- /dev/null +++ b/ext/mbstring/tests/mb_str_split_utf8_utf16.phpt @@ -0,0 +1,81 @@ +--TEST-- +mb_str_split() tests UTF-8 illegal chars & UTF-16 surrogate pairs +--SKIPIF-- + +--INI-- +output_handler= +mbstring.func_overload=0 +--FILE-- + +--EXPECT-- +UTF-8: l:2 v:3132 l:5 v:33f09280a9 +BAD UTF-8: l:2 v:3132 l:3 v:33f092 +UTF-16BE: l:4 v:d800dc00 l:4 v:dbffdfff +UTF-16LE: l:4 
v:00d800dc l:4 v:ffdbffdf +BAD UTF-16BE: l:4 v:d800dc00 l:2 v:dc00 l:2 v:dc00 +BAD UTF-16LE: l:4 v:00d800dc l:2 v:00dc l:2 v:00dc + + diff --git a/ext/mbstring/tests/mb_substr_basic.phpt b/ext/mbstring/tests/mb_substr_basic.phpt index 2edb91291f09b..2789ee321b687 100644 --- a/ext/mbstring/tests/mb_substr_basic.phpt +++ b/ext/mbstring/tests/mb_substr_basic.phpt @@ -24,6 +24,10 @@ $string_ascii = 'ABCDEF'; //Japanese string in UTF-8 $string_mb = base64_decode('5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII='); +//Japanese string in UTF-16 +$string_utf16 = mb_convert_encoding($string_mb, "UTF-16", "UTF-8"); + + echo "\n-- ASCII string 1 --\n"; var_dump(mb_substr($string_ascii, 3)); @@ -38,6 +42,11 @@ echo "\n-- Multibyte string 2 --\n"; $result_2 = mb_substr($string_mb, 2, 7, 'utf-8'); var_dump(base64_encode($result_2)); +echo "\n-- Multibyte string utf-16 --\n"; +$result_3 = mb_substr($string_utf16, 2, 7, 'utf-16'); +var_dump(base64_encode(mb_convert_encoding($result_3, "UTF-8", "UTF-16"))); + + echo "Done"; ?> --EXPECT-- @@ -54,4 +63,7 @@ string(12) "peacrOiqng==" -- Multibyte string 2 -- string(28) "6Kqe44OG44Kt44K544OI44Gn44GZ" + +-- Multibyte string utf-16 -- +string(28) "6Kqe44OG44Kt44K544OI44Gn44GZ" Done