Skip to content

Commit

Permalink
ISO-2022-JP-MS treats truncated multi-byte chars as error
Browse files Browse the repository at this point in the history
Sigh. I included tests which were intended to check this case in the
test suite for ISO-2022-JP-MS, but those tests were faulty and didn't
actually test what they were supposed to.

Fixing the tests revealed that there were still bugs in this area.
  • Loading branch information
alexdowad committed Aug 30, 2021
1 parent 57a81af commit 51e0d32
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 9 deletions.
31 changes: 28 additions & 3 deletions ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
#include "unicode_table_jis.h"
#include "cp932_table.h"

static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter);

static const char *mbfl_encoding_2022jpms_aliases[] = {"ISO2022JPMS", NULL};

const mbfl_encoding mbfl_encoding_2022jpms = {
Expand All @@ -53,7 +55,7 @@ const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
mbfl_filt_conv_common_ctor,
NULL,
mbfl_filt_conv_2022jpms_wchar,
mbfl_filt_conv_common_flush,
mbfl_filt_conv_2022jpms_wchar_flush,
NULL,
};

Expand Down Expand Up @@ -144,8 +146,7 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
} else {
if (c1 > 0x20 && c1 < 0x35) {
w = 0xE000 + ((c1 - 0x21) * 94) + c - 0x21;
}
if (w <= 0) {
} else {
w = (((c1 - 0x21) + 0x7f) << 8) | c | MBFL_WCSPLANE_JIS0208;
}
}
Expand Down Expand Up @@ -206,6 +207,30 @@ int mbfl_filt_conv_2022jpms_wchar(int c, mbfl_convert_filter *filter)
return c;
}


/*
 * Flush handler for the ISO-2022-JP-MS -> wchar conversion filter.
 *
 * When the low 4 bits of filter->status are non-zero at end of input, the
 * bytes consumed so far are reported downstream as a single value tagged
 * with MBFL_WCSGROUP_THROUGH:
 *   state 2 -> 0x1B      (ESC)
 *   state 3 -> 0x1B24    (ESC $)
 *   state 4 -> 0x1B2428  (ESC $ ()
 *   state 5 -> 0x1B28    (ESC ()
 *   other   -> filter->cache (presumably the buffered lead byte of a
 *              truncated 2-byte character; confirm against the main
 *              conversion function)
 *
 * Afterwards the downstream flush function, if any, is invoked.
 * Always returns 0.
 */
static int mbfl_filt_conv_2022jpms_wchar_flush(mbfl_convert_filter *filter)
{
	int pending = filter->status & 0xF;

	if (pending != 0) {
		int consumed;

		/* Pick the byte sequence that was left dangling when input ended */
		switch (pending) {
		case 2:
			consumed = 0x1B;
			break;
		case 3:
			consumed = 0x1B24;
			break;
		case 4:
			consumed = 0x1B2428;
			break;
		case 5:
			consumed = 0x1B28;
			break;
		default:
			consumed = filter->cache;
			break;
		}

		(*filter->output_function)(consumed | MBFL_WCSGROUP_THROUGH, filter->data);
	}

	/* Propagate the flush to the next stage of the filter chain */
	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}

static int cp932ext3_cp932ext2_jis(int c)
{
int idx;
Expand Down
18 changes: 12 additions & 6 deletions ext/mbstring/tests/iso2022jp_ms_encoding.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C
$udcChars = array();
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
$i = $cp - 0xE000;
$bytes = (( (int)($i / 94) + 0x7F - 0x5E) << 8) + (($i % 94) + 0x21);
$bytes = (((int)($i / 94) + 0x21) << 8) + (($i % 94) + 0x21);
$udcChars[pack('n', $bytes)] = pack('N', $cp);
}

Expand Down Expand Up @@ -175,18 +175,16 @@ foreach (array_keys($truncatedChars) as $truncated)
echo "JIS X 0208 (with MS extensions) support OK\n";

$validChars = $udcChars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
for ($i = 0; $i <= 0x7F; $i++)
$validChars[chr($i)] = chr($i);
for ($i = 0xA1; $i <= 0xDF; $i++)
$validChars[chr($i)] = $jisx0201Chars[chr($i)];
$lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);
findInvalidChars($validChars, $invalidChars, $truncatedChars, array_fill_keys(range(0x21, 0x7F), 2));

testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);

foreach (array_keys($invalidChars) as $invalid) {
$firstByte = ord($invalid[0]);
$firstByte = ord(substr($invalid, 0, 1));
if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
testInvalidString("\x1B\$(?" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
} else {
Expand All @@ -201,7 +199,15 @@ echo "UDC support OK\n";
// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "BAD+E0", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8"); // Invalid escape
// Invalid escapes:
convertInvalidString("\x1B", "BAD+1B", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B.", "BAD+1B2E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$", "BAD+1B24", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$.", "BAD+1B242E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(", "BAD+1B28", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(.", "BAD+1B282E", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(", "BAD+1B2428", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "BAD+242858", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$B\x9F", "BAD+9F", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character

echo "Done!\n";
Expand Down

0 comments on commit 51e0d32

Please sign in to comment.