Skip to content

Commit

Permalink
CP5022{0,1,2} supports 'IBM extension' codes from ku 115-119
Browse files
mbstring has always had the conversion tables to support CP932 codes
in ku 115-119, and the conversion code for CP5022x has an 'if' clause
specifically to handle such characters... but that 'if' clause was dead
code, since a guard clause earlier in the same function prevented it
from accepting 2-byte characters with a starting byte of 0x93-0x97
(ku = first byte - 0x20, so these are exactly the bytes for ku 115-119).

Adjust the guard clause so that these characters can be converted as
the original author apparently intended.

The code which handles ku 115-119 is the part which reads:

    } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
      w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
  • Loading branch information
alexdowad committed Aug 31, 2021
1 parent 671dcee commit e3f6a9f
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
CK((*filter->output_function)(0x203e, filter->data));
} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) { /* kana */
CK((*filter->output_function)(0xff40 + c, filter->data));
} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x93) { /* kanji first char */
} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) { /* kanji first char */
filter->cache = c;
filter->status += 1;
} else if (c >= 0 && c < 0x80) { /* latin, CTLs */
Expand Down
15 changes: 9 additions & 6 deletions ext/mbstring/tests/cp5022x_encoding.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ function shiftJISDecode($bytes) {

/* Read in table of all characters in CP932 charset */
$cp932Chars = array(); /* CP932 -> UTF-16BE */
$nonInvertible = array();
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r+');
while ($line = fgets($fp, 256)) {
if ($line[0] == '#')
Expand All @@ -41,9 +42,12 @@ while ($line = fgets($fp, 256)) {
if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
if ($bytes < 256)
continue;
if ($bytes > 0xFA00) // We don't handle these extra characters from ku 114 and above
continue;
$cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);

if ($bytes >= 0xFA00) {
$nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
} else {
$cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
}
}
}

Expand All @@ -61,7 +65,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {

/* There are 396 Unicode codepoints which are non-invertible in CP932
* (multiple CP932 byte sequences map to the same codepoint) */
$nonInvertible = array();
for ($i = 0xED00; $i <= 0xEEFF; $i++) {
$bytes = pack('n', shiftJISDecode($i));
if (isset($cp932Chars[$bytes])) {
Expand Down Expand Up @@ -194,7 +197,7 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
}

/* All invalid 2-byte CP932 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
for ($i = 0x21; $i <= 0x97; $i++) {
for ($j = 0; $j < 256; $j++) {
$testString = chr($i) . chr($j);
if (!isset($cp932Chars[$testString]) && !isset($nonInvertible[$testString])) {
Expand All @@ -206,7 +209,7 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
}

/* Try truncated 2-byte characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
for ($i = 0x21; $i <= 0x97; $i++) {
testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50220');
testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50221');
testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');
Expand Down

0 comments on commit e3f6a9f

Please sign in to comment.