Skip to content

Commit

Permalink
Error handling for UTF-8 complies with WHATWG specification
Browse files Browse the repository at this point in the history
In 7502c86, I adjusted the number of error markers emitted on
invalid UTF-8 text to be more consistent with mbstring's behavior on
other text encodings (generally, it emits one error marker for one
unexpected byte). I didn't expect that anybody would actually care one
way or the other, but felt that it was better to be consistent than
not.

Later, Martin Auswöger kindly pointed out that the WHATWG encoding
specification, which governs how various text encodings are handled
by web browsers, does actually specify how many error markers should
be generated for any given piece of invalid UTF-8 text.

Until now, we have never really paid much attention to the WHATWG
specification, but we do want to comply with as many relevant
specifications as possible. And since PHP is commonly used for web
applications, compatibility with the behavior of web browsers is
obviously a good thing.
  • Loading branch information
alexdowad committed Apr 16, 2022
1 parent b0ab5d0 commit 04e59c9
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 29 deletions.
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ PHP NEWS
- Hash:
. Fixed bug #81714 (segfault when serializing finalized HashContext). (cmb)

- MBString:
. Number of error markers emitted for invalid UTF-8 text matches WHATWG specification.
This is a return to the behavior of PHP 8.0 and earlier. (alexdowad)

- MySQLi:
. Fixed bug GH-8267 (MySQLi uses unsupported format specifier on Windows).
(cmb)
Expand Down
16 changes: 4 additions & 12 deletions ext/mbstring/libmbfl/filters/mbfilter_utf8.c
Original file line number Diff line number Diff line change
Expand Up @@ -122,9 +122,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
CK((*filter->output_function)(s, filter->data));
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x20: /* 3byte code 2nd char: 0:0xa0-0xbf,D:0x80-9F,1-C,E-F:0x80-0x9f */
Expand All @@ -139,9 +137,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x30: /* 4byte code 2nd char: 0:0x90-0xbf,1-3:0x80-0xbf,4:0x80-0x8f */
Expand All @@ -156,9 +152,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
case 0x31: /* 4byte code 3rd char: 0x80-0xbf */
Expand All @@ -167,9 +161,7 @@ int mbfl_filt_conv_utf8_wchar(int c, mbfl_convert_filter *filter)
filter->status++;
} else {
CK(mbfl_filt_put_invalid_char(filter));
if (c < 0x80 || (c >= 0xc2 && c <= 0xf4)) {
goto retry;
}
goto retry;
}
break;
default:
Expand Down
18 changes: 9 additions & 9 deletions ext/mbstring/tests/illformed_utf_sequences.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -22,28 +22,28 @@ var_dump(chk_enc("\x31\x32\x33", 0));
var_dump(chk_enc("\x41\x42\x43", 0));
var_dump(chk_enc("\xc0\xb1\xc0\xb2\xc0\xb3", 6));
var_dump(chk_enc("\xc1\x81\xc1\x82\xc1\x83", 6));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 6));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 6));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 9));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 8));
var_dump(chk_enc("\xe0\x80\xb1\xe0\x80\xb2\xe0\x80\xb3", 9));
var_dump(chk_enc("\xe0\x81\x81\xe0\x81\x82\xe0\x81\x83", 9));
var_dump(chk_enc("\xf0\x80\x80\xb1\xf0\x80\x80\xb2\xf0\x80\x80\xb3", 12));
var_dump(chk_enc("\xf0\x80\x81\x81\xf0\x80\x81\x82\xf0\x81\x83", 11));
var_dump(chk_enc("\xf8\x80\x80\x80\xb1\xf8\x80\x80\x80\xb2\xf8\x80\x80\x80\xb3", 15));
var_dump(chk_enc("\xf8\x80\x80\x81\x81\xf8\x80\x80\x81\x82\xf8\x80\x80\x81\x83", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x80\xb1\xfc\x80\x80\x80\x80\xb2\xfc\x80\x80\x80\x80\xb3", 18));
var_dump(chk_enc("\xfc\x80\x80\x80\x81\x81\xfc\x80\x80\x80\x81\x82\xfc\x80\x80\x80\x81\x83", 18));

var_dump(chk_enc("\xc2\xa2\xc2\xa3\xc2\xa5", 0));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 6));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 9));
var_dump(chk_enc("\xe0\x82\xa2\xe0\x82\xa3\xe0\x82\xa5", 9));
var_dump(chk_enc("\xf0\x80\x82\xa2\xf0\x80\x82\xa3\xf0\x80\x82\xa5", 12));
var_dump(chk_enc("\xf8\x80\x80\x82\xa2\xf8\x80\x80\x82\xa3\xf8\x80\x80\x82\xa5", 15));
var_dump(chk_enc("\xfc\x80\x80\x80\x82\xa2\xfc\x80\x80\x80\x82\xa3\xfc\x80\x80\x80\x82\xa5", 18));

var_dump(chk_enc("\xc1\xbf", 2));
var_dump(chk_enc("\xc2\x80", 0));
var_dump(chk_enc("\xdf\xbf", 0));
var_dump(chk_enc("\xe0\x9f\xff", 2));
var_dump(chk_enc("\xe0\x9f\xff", 3));
var_dump(chk_enc("\xe0\xa0\x80", 2));
var_dump(chk_enc("\xef\xbf\xbf", 0));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 3));
var_dump(chk_enc("\xf0\x8f\xbf\xbf", 4));
var_dump(chk_enc("\xf0\x90\x80\x80", 0));
var_dump(chk_enc("\xf7\xbf\xbf\xbf", 4));
var_dump(chk_enc("\xf8\x87\xbf\xbf\xbf", 5));
Expand All @@ -58,7 +58,7 @@ echo "UTF-8 and surrogates area\n";
$out = '';
$cnt = 0;
for ($i = 0xd7ff; $i <= 0xe000; ++$i) {
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 2);
$s = chk_enc(pack('C3', 0xe0 | ($i >> 12), 0x80 | ($i >> 6) & 0x3f, 0x80 | $i & 0x3f), 3);
if ($s === false) {
$cnt++;
} else {
Expand Down
56 changes: 56 additions & 0 deletions ext/mbstring/tests/utf8_error_handling.phpt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
--TEST--
Confirm error handling for UTF-8 complies with WHATWG spec
--EXTENSIONS--
mbstring
--FILE--
<?php
/* The WHATWG specifies not just how web browsers should handle _valid_
* UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
* as how many error markers each invalid byte sequence should decode
* to).
* That specification is followed by the JavaScript Encoding API.
*
* The API documentation for mb_convert_encoding does not specify how
* many error markers we will emit for each possible invalid byte
* sequence, so we might as well comply with the WHATWG specification.
*
* Thanks to Martin Auswöger for pointing this out... and another big
* thanks for providing test cases!
*
* Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
*/
mb_substitute_character(0x25);

$testCases = [
["\x80", "%"],
["\xFF", "%"],
["\xC2\x7F", "%\x7F"],
["\xC2\x80", "\xC2\x80"],
["\xDF\xBF", "\xDF\xBF"],
["\xDF\xC0", "%%"],
["\xE0\xA0\x7F", "%\x7F"],
["\xE0\xA0\x80", "\xE0\xA0\x80"],
["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
["\xEF\xBF\xC0", "%%"],
["\xF0\x90\x80\x7F", "%\x7F"],
["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
["\xF4\x8F\xBF\xC0", "%%"],
["\xFA\x80\x80\x80\x80", "%%%%%"],
["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"]
];

foreach ($testCases as $testCase) {
$result = mb_convert_encoding($testCase[0], 'UTF-8', 'UTF-8');
if ($result !== $testCase[1]) {
die("Expected UTF-8 string " . bin2hex($testCase[0]) . " to convert to UTF-8 string " . bin2hex($testCase[1]) . "; got " . bin2hex($result));
}
}

echo "All done!\n";

?>
--EXPECT--
All done!
16 changes: 8 additions & 8 deletions ext/mbstring/tests/utf_encodings.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -761,14 +761,14 @@ testValidString('', '', 'UTF-8', 'UTF-32BE');

$invalid = array(
// Codepoints outside of valid 0-0x10FFFF range for Unicode
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0x110000
"\xF4\x90\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x110000
"\xF7\x80\x80\x80" => str_repeat("\x00\x00\x00%", 4), // CP 0x1C0000
"\xF7\xBF\xBF\xBF" => str_repeat("\x00\x00\x00%", 4), // CP 0x1FFFFF

// Reserved range for UTF-16 surrogate pairs
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 2), // CP 0xD800
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDBFF
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 2), // CP 0xDFFF
"\xED\xA0\x80" => str_repeat("\x00\x00\x00%", 3), // CP 0xD800
"\xED\xAF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDBFF
"\xED\xBF\xBF" => str_repeat("\x00\x00\x00%", 3), // CP 0xDFFF

// Truncated characters
"\xDF" => "\x00\x00\x00%", // should have been 2-byte
Expand All @@ -788,8 +788,8 @@ $invalid = array(

// Multi-byte characters which end too soon and go to a junk byte
// (Which isn't even valid to start a new character)
"\xF0\xBF\xBF\xFF" => "\x00\x00\x00%",
"\xF0\xBF\xFF" => "\x00\x00\x00%",
"\xF0\xBF\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),
"\xF0\xBF\xFF" => str_repeat("\x00\x00\x00%", 2),

// Continuation bytes which appear outside of a MB char
"\x80" => "\x00\x00\x00%",
Expand All @@ -799,8 +799,8 @@ $invalid = array(
// Overlong code units
// (Using more bytes than needed to encode a character)
"\xC1\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 2 bytes
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 2), // didn't need 3 bytes
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 3) // didn't need 4 bytes
"\xE0\x9F\xBF" => str_repeat("\x00\x00\x00%", 3), // didn't need 3 bytes
"\xF0\x8F\xBF\xBF" => str_repeat("\x00\x00\x00%", 4) // didn't need 4 bytes
);

testInvalidCodepoints($invalid, 'UTF-8');
Expand Down

0 comments on commit 04e59c9

Please sign in to comment.