Add more tests for CP5022{0,1,2} text conversion

alexdowad · alexdowad · commit a0415b22ab81 · 2021-08-31T13:41:34.000+02:00
diff --git a/ext/mbstring/tests/cp5022x_encoding.phpt b/ext/mbstring/tests/cp5022x_encoding.phpt
@@ -34,6 +34,7 @@ function shiftJISDecode($bytes) {
 /* Read in table of all characters in CP932 charset */
 $cp932Chars = array(); /* CP932 -> UTF-16BE */
 $nonInvertible = array();
+$fromUnicode = array();
 $fp = fopen(__DIR__ . '/data/CP932.txt', 'r+');
 while ($line = fgets($fp, 256)) {
   if ($line[0] == '#')
@@ -43,10 +44,12 @@ while ($line = fgets($fp, 256)) {
     if ($bytes < 256)
       continue;
 
-    if ($bytes >= 0xFA00) {
+
+    if (isset($fromUnicode[$codepoint])) {
       $nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
     } else {
       $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
+      $fromUnicode[$codepoint] = $bytes;
     }
   }
 }
@@ -63,21 +66,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
   }
 }
 
-/* There are 396 Unicode codepoints which are non-invertible in CP932
- * (multiple CP932 byte sequences map to the same codepoint) */
-for ($i = 0xED00; $i <= 0xEEFF; $i++) {
-  $bytes = pack('n', shiftJISDecode($i));
-  if (isset($cp932Chars[$bytes])) {
-    $nonInvertible[$bytes] = $cp932Chars[$bytes];
-    unset($cp932Chars[$bytes]); // will test these separately
-  }
-}
-foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C] as $i) {
-  $bytes = pack('n', shiftJISDecode($i));
-  $nonInvertible[$bytes] = $cp932Chars[$bytes];
-  unset($cp932Chars[$bytes]); // will test these separately
-}
-
 /* Read in table of all characters in JISX-0201 charset */
 $jisx0201Chars = array(); /* JISX0201 -> UTF-16BE */
 $fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r+');
@@ -89,6 +77,18 @@ while ($line = fgets($fp, 256)) {
     $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
 }
 
+/* Read in table of all characters in JISX-0212 charset */
+$jisx0212Chars = array(); /* JISX0212 -> UTF-16BE */
+$fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r'); /* read-only: the table is only ever read, never written */
+while (($line = fgets($fp, 256)) !== false) { /* compare against false so a falsy line such as "0" cannot end the loop early */
+  if ($line[0] == '#')
+    continue;
+
+  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
+    $jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
+  }
+}
+
 /* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
  * differ in a number of places from the table provided by the Unicode Consortium */
 $cp932Chars["\x21\x41"] = "\x30\x1C"; /* WAVE DASH instead of FULLWIDTH TILDE */
@@ -151,6 +151,10 @@ for ($i = 0x80; $i < 256; $i++) {
   testInvalid("\x0F" . chr($i),   "\x00%", 'CP50222');
 }
 
+// Switch back to ASCII after a multibyte character
+convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50221', false);
+convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50222', false);
+
 echo "ASCII support OK\n";
 
 /* All valid JIS X 0201 characters
@@ -164,6 +168,7 @@ foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
     testValid($jisx0201, $utf16BE, 'CP50220', false);
     testValid($jisx0201, $utf16BE, 'CP50221', false);
     testValid($jisx0201, $utf16BE, 'CP50222', false);
+    convertValidString($utf16BE, "\x0E" . chr(ord($jisx0201) - 0x80) . "\x0F", 'UTF-16BE', 'CP50222', false);
   } else { /* Latin */
     testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50220', $utf16BE > "\x00\x80");
     testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50221', $utf16BE > "\x00\x80");
@@ -182,6 +187,11 @@ for ($i = 0x80; $i < 256; $i++) {
   testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50222');
 }
 
+/* Go from JIS X 0201 kana to ASCII, JIS X 0208 or JIS X 0201 Latin */
+convertValidString("\xFF\x61\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
+convertValidString("\xFF\x61\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
+convertValidString("\xFF\x61\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');
+
 echo "JIS X 0201 support OK\n";
 
 /* All valid CP932 characters */
@@ -196,6 +206,15 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
   testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50222', false);
 }
 
+/* There are some conversions we support from Unicode -> CP5022x, but not in the opposite direction */
+foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
+  convertValidString("\x22\x25", "\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', $encoding, false);
+  convertValidString("\xFF\x0D", "\x1B\$B\x21\x5D\x1B(B", 'UTF-16BE', $encoding, false);
+  convertValidString("\xFF\xE0", "\x1B\$B\x21\x71\x1B(B", 'UTF-16BE', $encoding, false);
+  convertValidString("\xFF\xE1", "\x1B\$B\x21\x72\x1B(B", 'UTF-16BE', $encoding, false);
+  convertValidString("\xFF\xE2", "\x1B\$B\x22\x4C\x1B(B", 'UTF-16BE', $encoding, false);
+}
+
 /* All invalid 2-byte CP932 characters */
 for ($i = 0x21; $i <= 0x97; $i++) {
   for ($j = 0; $j < 256; $j++) {
@@ -215,8 +234,36 @@ for ($i = 0x21; $i <= 0x97; $i++) {
   testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');
 }
 
+/* Test alternative escape sequence to select CP932 */
+testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);
+
 echo "CP932 support OK\n";
 
+foreach ($jisx0212Chars as $jisx0212 => $utf16BE) { // every character in the JIS X 0212 table must decode after ESC $ ( D
+  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50220', false);
+  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50221', false);
+  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50222', false);
+}
+
+for ($i = 0x21; $i <= 0x97; $i++) { // every 2-byte sequence absent from the JIS X 0212 table must be rejected
+  for ($j = 0; $j < 256; $j++) {
+    $testString = chr($i) . chr($j);
+    if (!isset($jisx0212Chars[$testString])) {
+      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50220');
+      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50221');
+      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50222');
+    }
+  }
+}
+
+for ($i = 0x21; $i <= 0x97; $i++) { // truncated input: a first byte with no second byte must be rejected
+  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50220');
+  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50221');
+  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50222');
+}
+
+echo "JIS X 0212 support OK\n";
+
 /* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana */
 $fullwidthKatakana = array(
   0xFF61 => 0x2123, /* Ideographic full stop */
@@ -310,6 +357,7 @@ echo "Long error markers OK\n";
 ASCII support OK
 JIS X 0201 support OK
 CP932 support OK
+JIS X 0212 support OK
 Folding of fullwidth katakana for CP50220 OK
 Invalid Unicode is flagged when converting to CP5022x
 Long error markers OK