Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Major overhaul of mbstring (part 28) #10099

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
132 changes: 0 additions & 132 deletions ext/mbstring/libmbfl/mbfl/mbfilter.c
Expand Up @@ -429,138 +429,6 @@ const mbfl_encoding *mbfl_identify_encoding(mbfl_string *string, const mbfl_enco
return enc;
}

/*
 * strpos
 *
 * Matching state shared between a search routine and the collector_strpos()
 * callback, which is fed the haystack one decoded character at a time.
 */
struct collector_strpos_data {
/* Not read or written by the code visible here -- NOTE(review): presumably a
 * downstream filter used by other callers; confirm before relying on it */
mbfl_convert_filter *next_filter;
/* Needle decoded to wchar; needle.buffer is indexed by needle_pos */
mbfl_wchar_device needle;
/* Number of characters in the decoded needle */
size_t needle_len;
/* Characters whose index (see `output`) is below this are counted but not compared */
size_t start;
/* Number of haystack characters already passed to the collector */
size_t output;
/* Haystack index at which the match currently in progress began */
size_t found_pos;
/* Number of needle characters matched so far (index of the next one to compare) */
size_t needle_pos;
/* Set to found_pos once the whole needle has been matched */
size_t matched_pos;
};

/*
 * Incremental needle matcher, called once per haystack character.
 *
 * c:    the haystack character to consume
 * data: pointer to a struct collector_strpos_data holding the needle and
 *       the matching state
 *
 * When the last needle character is matched, pc->matched_pos is set to the
 * haystack index where that match started. The caller is expected to inspect
 * (and, if it wants further matches, reset) matched_pos between calls.
 *
 * Always returns 0.
 */
static int
collector_strpos(int c, void* data)
{
int *p, *h, *m;
ssize_t n;
struct collector_strpos_data *pc = (struct collector_strpos_data*)data;

/* Characters located before pc->start are only counted, never compared */
if (pc->output >= pc->start) {
if (c == (int)pc->needle.buffer[pc->needle_pos]) {
if (pc->needle_pos == 0) {
pc->found_pos = pc->output; /* a new candidate match starts here */
}
pc->needle_pos++; /* advance to the next needle character */
if (pc->needle_pos >= pc->needle_len) {
pc->matched_pos = pc->found_pos; /* whole needle matched: record where it began */
/* Step back one character and fall into the shift logic below, so the
 * state is prepared for a possible overlapping match */
pc->needle_pos--;
goto retry;
}
} else if (pc->needle_pos != 0) {
/* Mismatch after a partial match (or continuation after a full match):
 * slide the candidate start forward one character at a time, looking for
 * a shift at which the already-consumed characters still line up with
 * the beginning of the needle */
retry:
h = (int *)pc->needle.buffer;
h++;
for (;;) {
pc->found_pos++;
p = h;
m = (int *)pc->needle.buffer;
n = pc->needle_pos - 1;
/* Compare the needle, shifted to start at h, against its own prefix
 * for needle_pos - 1 characters */
while (n > 0 && *p == *m) {
n--;
p++;
m++;
}
if (n <= 0) {
/* The shifted text agrees with the needle prefix; m now points at the
 * needle character that c has to equal. If it does not, matching
 * restarts from scratch (shorter shifts are not tried). */
if (*m != c) {
pc->needle_pos = 0;
}
break;
} else {
/* This shift does not line up; try the next one with one fewer
 * character considered matched */
h++;
pc->needle_pos--;
}
}
}
}

pc->output++;
return 0;
}

/*
 * substr_count
 *
 * Count how many times `needle` occurs in `haystack`. Both strings are
 * decoded to wchar using their own `encoding` members and compared
 * character by character through collector_strpos().
 *
 * After every hit the matcher state is reset, so the search for the next
 * occurrence starts with the character following the previous match.
 *
 * Returns:
 *   - MBFL_ERROR_ENCODING if the decoded needle has no buffer
 *   - MBFL_ERROR_EMPTY    if the needle decodes to zero characters
 *   - otherwise the number of occurrences found; if the haystack filter
 *     reports a failure, scanning stops and the count so far is returned
 */
size_t
mbfl_substr_count(
	mbfl_string *haystack,
	mbfl_string *needle
)
{
	struct collector_strpos_data pc;
	mbfl_convert_filter *conv;
	size_t count = 0;

	/* Decode the needle into a wchar buffer owned by pc.needle */
	mbfl_wchar_device_init(&pc.needle);
	conv = mbfl_convert_filter_new(
		needle->encoding,
		&mbfl_encoding_wchar,
		mbfl_wchar_device_output, 0, &pc.needle);
	ZEND_ASSERT(conv);
	mbfl_convert_filter_feed_string(conv, needle->val, needle->len);
	mbfl_convert_filter_flush(conv);
	mbfl_convert_filter_delete(conv);

	pc.needle_len = pc.needle.pos;
	if (pc.needle.buffer == NULL) {
		return MBFL_ERROR_ENCODING;
	}
	if (pc.needle_len == 0) {
		mbfl_wchar_device_clear(&pc.needle);
		return MBFL_ERROR_EMPTY;
	}

	/* Route every decoded haystack character into the matcher */
	conv = mbfl_convert_filter_new(
		haystack->encoding,
		&mbfl_encoding_wchar,
		collector_strpos, 0, &pc);
	ZEND_ASSERT(conv);

	/* Fresh matcher state: search from the very first character */
	pc.start = 0;
	pc.output = 0;
	pc.needle_pos = 0;
	pc.found_pos = 0;
	pc.matched_pos = MBFL_ERROR_NOT_FOUND;

	/* Push the haystack through the filter one byte at a time, checking for
	 * a completed match after each byte */
	if (haystack->val != NULL) {
		unsigned char *cursor = haystack->val;
		size_t remaining;

		for (remaining = haystack->len; remaining > 0; remaining--) {
			if ((*conv->filter_function)(*cursor++, conv) < 0) {
				pc.matched_pos = MBFL_ERROR_ENCODING;
				break;
			}
			if (pc.matched_pos == MBFL_ERROR_NOT_FOUND) {
				continue;
			}
			/* A full match was reported: count it and rearm the matcher */
			++count;
			pc.matched_pos = MBFL_ERROR_NOT_FOUND;
			pc.needle_pos = 0;
		}
	}

	mbfl_convert_filter_flush(conv);
	mbfl_convert_filter_delete(conv);
	mbfl_wchar_device_clear(&pc.needle);

	return count;
}

/*
* strcut
*/
Expand Down
10 changes: 1 addition & 9 deletions ext/mbstring/libmbfl/mbfl/mbfilter.h
Expand Up @@ -112,13 +112,11 @@
#define MBFL_VERSION_MINOR 3
#define MBFL_VERSION_TEENY 2

/*
* convert filter
*/
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY 3
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 4 /* For internal use only; deliberately uses invalid UTF-8 byte sequence as error marker */

/*
* convenience macros
Expand Down Expand Up @@ -195,12 +193,6 @@ static inline int mbfl_is_error(size_t len) {
#define MBFL_ERROR_EMPTY ((size_t) -8)
#define MBFL_ERROR_OFFSET ((size_t) -16)

/*
* substr_count
*/
MBFLAPI extern size_t
mbfl_substr_count(mbfl_string *haystack, mbfl_string *needle);

/*
* If specified as length, the substr until the end of the string is taken.
*/
Expand Down
20 changes: 18 additions & 2 deletions ext/mbstring/libmbfl/mbfl/mbfl_convert.c
Expand Up @@ -394,9 +394,15 @@ static size_t mb_illegal_marker(uint32_t bad_cp, uint32_t *out, unsigned int err
{
uint32_t *start = out;

if (bad_cp == MBFL_BAD_INPUT && err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
*out++ = replacement_char;
if (bad_cp == MBFL_BAD_INPUT) {
/* Input string contained a byte sequence which was invalid in the 'from' encoding
* Unless the error handling mode is set to NONE, insert the replacement character */
if (err_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
*out++ = replacement_char;
}
} else {
/* Input string contained a byte sequence which was valid in the 'from' encoding,
* but decoded to a Unicode codepoint which cannot be represented in the 'to' encoding */
switch (err_mode) {
case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
*out++ = replacement_char;
Expand Down Expand Up @@ -427,6 +433,16 @@ void mb_illegal_output(uint32_t bad_cp, mb_from_wchar_fn fn, mb_convert_buf* buf
uint32_t repl_char = buf->replacement_char;
unsigned int err_mode = buf->error_mode;

if (err_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
/* This mode is for internal use only, when converting a string to
* UTF-8 before searching it; it uses a byte which is illegal in
* UTF-8 as an error marker. This ensures that error markers will
* never 'accidentally' match valid text, as could happen when a
* character like '?' is used as an error marker. */
buf->out = mb_convert_buf_add(buf->out, 0xFF);
return;
}

size_t len = mb_illegal_marker(bad_cp, temp, err_mode, repl_char);

/* Avoid infinite loop if `fn` is not able to handle `repl_char` */
Expand Down
91 changes: 69 additions & 22 deletions ext/mbstring/mbstring.c
Expand Up @@ -1961,10 +1961,10 @@ static size_t mb_find_strpos(zend_string *haystack, zend_string *needle, const m
}

out:
if (haystack_u8 != NULL && haystack_u8 != haystack) {
if (haystack_u8 != haystack) {
zend_string_free(haystack_u8);
}
if (needle_u8 != NULL && needle_u8 != needle) {
if (needle_u8 != needle) {
zend_string_free(needle_u8);
}
return result;
Expand Down Expand Up @@ -2263,42 +2263,89 @@ PHP_FUNCTION(mb_strrichr)
#undef MB_STRISTR
#undef MB_STRRICHR

/* {{{ Count the number of substring occurrences */
PHP_FUNCTION(mb_substr_count)
{
mbfl_string haystack, needle;
char *haystack_val, *needle_val;
zend_string *enc_name = NULL;
zend_string *haystack, *needle, *enc_name = NULL, *haystack_u8 = NULL, *needle_u8 = NULL;

ZEND_PARSE_PARAMETERS_START(2, 3)
Z_PARAM_STRING(haystack_val, haystack.len)
Z_PARAM_STRING(needle_val, needle.len)
Z_PARAM_STR(haystack)
Z_PARAM_STR(needle)
Z_PARAM_OPTIONAL
Z_PARAM_STR_OR_NULL(enc_name)
ZEND_PARSE_PARAMETERS_END();

haystack.val = (unsigned char*)haystack_val;
needle.val = (unsigned char*)needle_val;

if (needle.len == 0) {
if (ZSTR_LEN(needle) == 0) {
zend_argument_value_error(2, "must not be empty");
RETURN_THROWS();
}

haystack.encoding = needle.encoding = php_mb_get_encoding(enc_name, 3);
if (!haystack.encoding) {
const mbfl_encoding *enc = php_mb_get_encoding(enc_name, 3);
if (!enc) {
RETURN_THROWS();
}

size_t n = mbfl_substr_count(&haystack, &needle);
/* An error can only occur if needle is empty,
* an encoding error happens (which should not happen at this stage and is a bug)
* or the haystack is more than sizeof(size_t) bytes
* If one of these things occur this is a bug and should be flagged as such */
ZEND_ASSERT(!mbfl_is_error(n));
RETVAL_LONG(n);
if (php_mb_is_no_encoding_utf8(enc->no_encoding)) {
/* No need to do any conversion if haystack/needle are already known-valid UTF-8
* (If they are not valid, then not passing them through conversion filters could affect output) */
if (GC_FLAGS(haystack) & IS_STR_VALID_UTF8) {
haystack_u8 = haystack;
} else {
unsigned int num_errors = 0;
haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
if (!num_errors && !ZSTR_IS_INTERNED(haystack)) {
GC_ADD_FLAGS(haystack, IS_STR_VALID_UTF8);
}
}

if (GC_FLAGS(needle) & IS_STR_VALID_UTF8) {
needle_u8 = needle;
} else {
unsigned int num_errors = 0;
needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
if (!num_errors && !ZSTR_IS_INTERNED(needle)) {
GC_ADD_FLAGS(needle, IS_STR_VALID_UTF8);
}
}
} else {
unsigned int num_errors = 0;
haystack_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(haystack), ZSTR_LEN(haystack), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
needle_u8 = mb_fast_convert((unsigned char*)ZSTR_VAL(needle), ZSTR_LEN(needle), enc, &mbfl_encoding_utf8, 0, MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8, &num_errors);
/* A string with >0 bytes may convert to 0 codepoints; for example, the contents
* may be only escape sequences */
if (ZSTR_LEN(needle_u8) == 0) {
zend_string_free(haystack_u8);
zend_string_free(needle_u8);
zend_argument_value_error(2, "must not be empty");
RETURN_THROWS();
}
}

size_t result = 0;

if (ZSTR_LEN(haystack_u8) < ZSTR_LEN(needle_u8)) {
goto out;
}

const char *p = ZSTR_VAL(haystack_u8), *e = p + ZSTR_LEN(haystack_u8);
while (true) {
p = zend_memnstr(p, ZSTR_VAL(needle_u8), ZSTR_LEN(needle_u8), e);
if (!p) {
break;
}
p += ZSTR_LEN(needle_u8);
result++;
}

out:
if (haystack_u8 != haystack) {
zend_string_free(haystack_u8);
}
if (needle_u8 != needle) {
zend_string_free(needle_u8);
}

RETVAL_LONG(result);
}
/* }}} */

/* {{{ Returns part of a string */
PHP_FUNCTION(mb_substr)
Expand Down