CP5022{0,1,2} supports 'IBM extension' codes from ku 115-119

mbstring has always had the conversion tables to support CP932 codes in ku 115-119, and the conversion code for CP5022x has an 'if' clause specifically to handle such characters. However, that 'if' clause was dead code, because a guard clause earlier in the same function rejected any 2-byte character whose first byte was in the range 0x93-0x97. Adjust the guard clause so that these characters can be converted, as the original author apparently intended.

The code which handles ku 115-119 is the part which reads:

    } else if (s >= cp932ext3_ucs_table_min && s < cp932ext3_ucs_table_max) {
        w = cp932ext3_ucs_table[s - cp932ext3_ucs_table_min];
php · Aug 31, 2021 · e3f6a9f · e3f6a9f
1 parent 671dcee
commit e3f6a9f
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 7 deletions.
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c
@@ -165,7 +165,7 @@ int mbfl_filt_conv_cp5022x_wchar(int c, mbfl_convert_filter *filter)
 			CK((*filter->output_function)(0x203e, filter->data));
 		} else if (filter->status == 0x20 && c > 0x20 && c < 0x60) {		/* kana */
 			CK((*filter->output_function)(0xff40 + c, filter->data));
-		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c < 0x93) {		/* kanji first char */
+		} else if ((filter->status == 0x80 || filter->status == 0x90) && c > 0x20 && c <= 0x97) { /* kanji first char */
 			filter->cache = c;
 			filter->status += 1;
 		} else if (c >= 0 && c < 0x80) {		/* latin, CTLs */

diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt
@@ -33,6 +33,7 @@ function shiftJISDecode($bytes) {
 
 /* Read in table of all characters in CP932 charset */
 $cp932Chars = array(); /* CP932 -> UTF-16BE */
+$nonInvertible = array();
 $fp = fopen(__DIR__ . '/data/CP932.txt', 'r+');
 while ($line = fgets($fp, 256)) {
   if ($line[0] == '#')
@@ -41,9 +42,12 @@ while ($line = fgets($fp, 256)) {
   if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
     if ($bytes < 256)
       continue;
-    if ($bytes > 0xFA00) // We don't handle these extra characters from ku 114 and above
-      continue;
-    $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
+
+    if ($bytes >= 0xFA00) {
+      $nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
+    } else {
+      $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
+    }
   }
 }
 
@@ -61,7 +65,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
 
 /* There are 396 Unicode codepoints which are non-invertible in CP932
  * (multiple CP932 byte sequences map to the same codepoint) */
-$nonInvertible = array();
 for ($i = 0xED00; $i <= 0xEEFF; $i++) {
   $bytes = pack('n', shiftJISDecode($i));
   if (isset($cp932Chars[$bytes])) {
@@ -194,7 +197,7 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
 }
 
 /* All invalid 2-byte CP932 characters */
-for ($i = 0x21; $i <= 0x7E; $i++) {
+for ($i = 0x21; $i <= 0x97; $i++) {
   for ($j = 0; $j < 256; $j++) {
     $testString = chr($i) . chr($j);
     if (!isset($cp932Chars[$testString]) && !isset($nonInvertible[$testString])) {
@@ -206,7 +209,7 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
 }
 
 /* Try truncated 2-byte characters */
-for ($i = 0x21; $i <= 0x7E; $i++) {
+for ($i = 0x21; $i <= 0x97; $i++) {
   testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50220');
   testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50221');
   testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');