Skip to content

Commit

Permalink
Revert/fix mb_substitute_character() codepoint checks
Browse files Browse the repository at this point in the history
The introduced checks did not treat "non-Unicode" encodings correctly,
because they treated the passed integer as encoded in the internal
encoding in that case, while in actuality the substitute character
is always a Unicode codepoint.

Additionally checking the codepoint against the internal encoding
is not correct in any case, because the substitution character must
be mapped in the *target* encoding of the conversion, which does
not necessarily coincide with the internal encoding (the internal
encoding is the default *source* encoding, not *target* encoding).

This reverts the checks back to simple range checks, but in a way
that still resolves #69079: Characters outside the Basic
Multilingual Plane are now accepted and Surrogate Codepoints are
rejected. A distinction between UTF-8 and non-UTF-8 encodings is
not made for surrogate checks (as in the original patch), as
surrogates are always illegal on their own. Specifying a surrogate
as substitution character would only make sense if you could
specify a substitution string with more than one character --
however we do not support that.
  • Loading branch information
nikic committed Aug 3, 2017
1 parent 3557436 commit a8a9e93
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 92 deletions.
72 changes: 12 additions & 60 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -1978,69 +1978,21 @@ PHP_FUNCTION(mb_detect_order)

static inline int php_mb_check_code_point(long cp)
{
enum mbfl_no_encoding no_enc;
char* buf;
char buf_len;

no_enc = MBSTRG(current_internal_encoding)->no_encoding;

if (php_mb_is_no_encoding_utf8(no_enc)) {

if ((cp > 0 && 0xd800 > cp) || (cp > 0xdfff && 0x110000 > cp)) {
return 1;
}

if (cp <= 0 || cp >= 0x110000) {
/* Out of Unicode range */
return 0;
} else if (php_mb_is_no_encoding_unicode(no_enc)) {

if (0 > cp || cp > 0x10ffff) {
return 0;
}

return 1;

// backward compatibility
} else if (php_mb_is_unsupported_no_encoding(no_enc)) {
return cp < 0xffff && cp > 0x0;
}

if (cp < 0x100) {
buf_len = 1;
buf = (char *) safe_emalloc(buf_len, 1, 1);
buf[0] = cp;
buf[1] = 0;
} else if (cp < 0x10000) {
buf_len = 2;
buf = (char *) safe_emalloc(buf_len, 1, 1);
buf[0] = cp >> 8;
buf[1] = cp & 0xff;
buf[2] = 0;
} else if (cp < 0x1000000) {
buf_len = 3;
buf = (char *) safe_emalloc(buf_len, 1, 1);
buf[0] = cp >> 16;
buf[1] = (cp >> 8) & 0xff;
buf[2] = cp & 0xff;
buf[3] = 0;
} else {
buf_len = 4;
buf = (char *) safe_emalloc(buf_len, 1, 1);
buf[0] = cp >> 24;
buf[1] = (cp >> 16) & 0xff;
buf[2] = (cp >> 8) & 0xff;
buf[3] = cp & 0xff;
buf[4] = 0;
}

if (php_mb_check_encoding(buf, buf_len, NULL)) {
efree(buf);

return 1;
if (cp >= 0xd800 && cp <= 0xdfff) {
/* Surrogate code-point. These are never valid on their own and we only allow a single
* substitute character. */
return 0;
}

efree(buf);

return 0;
/* As the we do not know the target encoding of the conversion operation that is going to
* use the substitution character, we cannot check whether the codepoint is actually mapped
* in the given encoding at this point. Thus we have to accept everything. */
return 1;
}

/* {{{ proto mixed mb_substitute_character([mixed substchar])
Expand Down Expand Up @@ -2081,7 +2033,7 @@ PHP_FUNCTION(mb_substitute_character)
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
php_error_docref(NULL, E_WARNING, "Unknown character.");
php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
}
Expand All @@ -2092,7 +2044,7 @@ PHP_FUNCTION(mb_substitute_character)
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
php_error_docref(NULL, E_WARNING, "Unknown character.");
php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
break;
Expand Down
29 changes: 25 additions & 4 deletions ext/mbstring/tests/bug69079.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,32 @@ Bug #69079 (enhancement for mb_substitute_character)
<?php extension_loaded('mbstring') or die('skip mbstring not available'); ?>
--FILE--
<?php

mb_internal_encoding('UTF-8');
var_dump(mb_substitute_character(0x1f600));
var_dump(mb_substitute_character(0x1F600));
var_dump(bin2hex(mb_scrub("\xff")));
mb_substitute_character(0x3f); // Reset to '?', as the next call will fail
var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
var_dump(bin2hex(mb_scrub("\xff")));

mb_internal_encoding('EUC-JP-2004');
var_dump(mb_substitute_character(0x8fa1ef));

mb_substitute_character(0x63); // Reset to '?', as the next call will fail
mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
var_dump(bin2hex(mb_scrub("\x8d")));

mb_substitute_character(0x50aa);
var_dump(bin2hex(mb_scrub("\x8d")));

?>
--EXPECT--
--EXPECTF--
bool(true)
bool(true)
string(8) "f09f9880"

Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
string(2) "3f"

Warning: mb_substitute_character(): Unknown character in %s on line %d
string(2) "63"
string(6) "8fa1ef"
3 changes: 0 additions & 3 deletions ext/mbstring/tests/bug69086.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,7 @@ mb_substitute_character(0xfffd);
var_dump("?" === mb_convert_encoding("\x80", "Shift_JIS", "EUC-JP"));
mb_internal_encoding("UCS-4BE");
var_dump("\x00\x00\xff\xfd" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
mb_substitute_character(0xd800);
var_dump("\x00\x00\x00\x3f" === mb_convert_encoding("\x80", "UCS-4BE", "UTF-8"));
?>
--EXPECT--
bool(true)
bool(true)
bool(true)
13 changes: 6 additions & 7 deletions ext/mbstring/tests/mb_chr.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ mb_substitute_character(0xfffd);
var_dump(
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
);
mb_substitute_character(0xd800);
var_dump(
"?" === mb_chr(0xd800, "UTF-8")
"\u{fffd}" === mb_chr(0xd800, "UTF-8")
);

mb_internal_encoding("EUC-JP");
Expand All @@ -43,15 +42,15 @@ bool(true)
bool(true)
bool(true)

Warning: mb_chr(): Unknown encoding "typo" in %s on line 26
Warning: mb_chr(): Unknown encoding "typo" in %s on line %d

Warning: mb_chr(): Unsupported encoding "pass" in %s on line 27
Warning: mb_chr(): Unsupported encoding "pass" in %s on line %d

Warning: mb_chr(): Unsupported encoding "jis" in %s on line 28
Warning: mb_chr(): Unsupported encoding "jis" in %s on line %d

Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line 29
Warning: mb_chr(): Unsupported encoding "cp50222" in %s on line %d

Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line 30
Warning: mb_chr(): Unsupported encoding "utf-7" in %s on line %d
bool(false)
bool(false)
bool(false)
Expand Down
2 changes: 1 addition & 1 deletion ext/mbstring/tests/mb_substitute_character_basic.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@ int(1234)
bool(true)
string(4) "none"

Warning: mb_substitute_character(): Unknown character. in %s on line %d
Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
===DONE===
34 changes: 17 additions & 17 deletions ext/mbstring/tests/mb_substitute_character_variation1.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ fclose($fp);
*** Testing mb_substitute_character() : usage variation ***

--int 0--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--int 1--
Expand All @@ -133,30 +133,30 @@ bool(true)
bool(true)

--int -12345--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--float 10.5--
bool(true)

--float -10.5--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--float 12.3456789000e10--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--float -12.3456789000e10--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--float .5--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--empty array--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--int indexed array--
Expand All @@ -169,25 +169,25 @@ bool(true)
bool(true)

--uppercase NULL--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--lowercase null--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--lowercase true--
bool(true)

--lowercase false--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--uppercase TRUE--
bool(true)

--uppercase FALSE--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--empty string DQ--
Expand All @@ -197,19 +197,19 @@ bool(true)
bool(true)

--string DQ--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--string SQ--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--mixed case string--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--heredoc--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--instance of classWithToString--
Expand All @@ -221,11 +221,11 @@ Error: 8 - Object of class classWithoutToString could not be converted to int, %
bool(true)

--undefined var--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)

--unset var--
Error: 2 - mb_substitute_character(): Unknown character., %s(%d)
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
===DONE===

0 comments on commit a8a9e93

Please sign in to comment.