Skip to content

Commit

Permalink
Revert/fix substitution character fallback
Browse files Browse the repository at this point in the history
The introduced checks were not correct in two respects:
 * It was checked whether the source encoding of the string matches
   the internal encoding, while the actually relevant encoding is
   the *target* encoding.
 * Even if the correct encoding is used, the checks are still too
   conservative. Just because something is not a "Unicode-encoding"
   does not mean that it does not map any non-ASCII characters.

I've reverted the added checks and instead adjusted mbfl_convert
to first try to use the provided substitution character and if
that fails, perform the fallback to '?' at that point. This means
that any codepoint mapped in the target encoding should now be
correctly supported and anything else should fall back to '?'.
  • Loading branch information
nikic committed Aug 3, 2017
1 parent a8a9e93 commit fb9bf5b
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 39 deletions.
22 changes: 18 additions & 4 deletions ext/mbstring/libmbfl/mbfl/mbfl_convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -467,14 +467,26 @@ int mbfl_convert_filter_strcat(mbfl_convert_filter *filter, const unsigned char
int
mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
{
int mode_backup, ret, n, m, r;
int mode_backup, substchar_backup, ret, n, m, r;

ret = 0;

mode_backup = filter->illegal_mode;
filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
substchar_backup = filter->illegal_substchar;

/* The used substitution character may not be supported by the target character encoding.
* If that happens, first try to use "?" instead and if that also fails, silently drop the
* character. */
if (filter->illegal_mode == MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR
&& filter->illegal_substchar != 0x3f) {
filter->illegal_substchar = 0x3f;
} else {
filter->illegal_mode = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
}

switch (mode_backup) {
case MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR:
ret = (*filter->filter_function)(filter->illegal_substchar, filter);
ret = (*filter->filter_function)(substchar_backup, filter);
break;
case MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG:
if (c >= 0) {
Expand Down Expand Up @@ -560,14 +572,16 @@ mbfl_filt_conv_illegal_output(int c, mbfl_convert_filter *filter)
}
ret = mbfl_convert_filter_strcat(filter, (const unsigned char *)";");
} else {
ret = (*filter->filter_function)(filter->illegal_substchar, filter);
ret = (*filter->filter_function)(substchar_backup, filter);
}
}
break;
default:
break;
}

filter->illegal_mode = mode_backup;
filter->illegal_substchar = substchar_backup;
filter->num_illegalchar++;

return ret;
Expand Down
37 changes: 4 additions & 33 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,7 @@ static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_leng
if (convd == NULL) {
return -1;
}

mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));

Expand Down Expand Up @@ -3254,29 +3255,9 @@ MBSTRING_API char *php_mb_convert_encoding(const char *input, size_t length, con
php_error_docref(NULL, E_WARNING, "Unable to create character encoding converter");
return NULL;
}
mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));

if (string.no_encoding == MBSTRG(current_internal_encoding)->no_encoding) {
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
} else if (php_mb_is_no_encoding_unicode(string.no_encoding) && php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {

if (php_mb_is_no_encoding_utf8(string.no_encoding)) {

if (MBSTRG(current_filter_illegal_substchar) > 0xd7ff &&
0xe000 > MBSTRG(current_filter_illegal_substchar)
) {
mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
} else {
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
}

} else {
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));
}

} else {
mbfl_buffer_converter_illegal_substchar(convd, 0x3f);
}
mbfl_buffer_converter_illegal_mode(convd, MBSTRG(current_filter_illegal_mode));
mbfl_buffer_converter_illegal_substchar(convd, MBSTRG(current_filter_illegal_substchar));

/* do it */
ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
Expand Down Expand Up @@ -5199,17 +5180,7 @@ static inline char* php_mb_chr(zend_long cp, const char* enc, size_t *output_len
if (php_mb_is_no_encoding_utf8(no_enc)) {

if (0 > cp || cp > 0x10ffff || (cp > 0xd7ff && 0xe000 > cp)) {
if (php_mb_is_no_encoding_utf8(MBSTRG(current_internal_encoding)->no_encoding)) {
cp = MBSTRG(current_filter_illegal_substchar);
} else if (php_mb_is_no_encoding_unicode(MBSTRG(current_internal_encoding)->no_encoding)) {
if (0xd800 > MBSTRG(current_filter_illegal_substchar) || MBSTRG(current_filter_illegal_substchar) > 0xdfff) {
cp = MBSTRG(current_filter_illegal_substchar);
} else {
cp = 0x3f;
}
} else {
cp = 0x3f;
}
cp = MBSTRG(current_filter_illegal_substchar);
}

if (cp < 0x80) {
Expand Down
6 changes: 6 additions & 0 deletions ext/mbstring/tests/bug69086.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,13 @@ mb_substitute_character(0xfffd);
var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
mb_internal_encoding("UCS-4BE");
var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));

mb_internal_encoding("UTF-8");
mb_substitute_character(0xfffd);
var_dump("\u{fffd}" === mb_convert_encoding("\x80", "UTF-8", "EUC-JP-2004"));

?>
--EXPECT--
bool(true)
bool(true)
bool(true)
2 changes: 1 addition & 1 deletion ext/mbstring/tests/mb_chr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ var_dump(
mb_internal_encoding("EUC-JP");
mb_substitute_character(0xa4a2);
var_dump(
"?" === mb_chr(0xd800, "UTF-8")
"\u{a4a2}" === mb_chr(0xd800, "UTF-8")
);

// Invalid
Expand Down
2 changes: 1 addition & 1 deletion ext/mbstring/tests/mb_substitute_character_variation2.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ var_dump(bin2hex(mb_convert_encoding($string_mb, "ISO-8859-1", "UTF-8")));
string(14) "3f3f3f3f3f3f3f"
string(14) "42424242424242"
string(0) ""
string(0) ""
string(14) "3f3f3f3f3f3f3f"
===DONE===

0 comments on commit fb9bf5b

Please sign in to comment.