Skip to content

Commit a0415b2

Browse files
committed
Add more tests for CP5022{0,1,2} text conversion
1 parent e3f6a9f, commit a0415b2

File tree

1 file changed

+64
-16
lines changed

1 file changed

+64
-16
lines changed

ext/mbstring/tests/cp5022x_encoding.phpt

Lines changed: 64 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ function shiftJISDecode($bytes) {
3434
/* Read in table of all characters in CP932 charset */
3535
$cp932Chars = array(); /* CP932 -> UTF-16BE */
3636
$nonInvertible = array();
37+
$fromUnicode = array();
3738
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r+');
3839
while ($line = fgets($fp, 256)) {
3940
if ($line[0] == '#')
@@ -43,10 +44,12 @@ while ($line = fgets($fp, 256)) {
4344
if ($bytes < 256)
4445
continue;
4546

46-
if ($bytes >= 0xFA00) {
47+
48+
if (isset($fromUnicode[$codepoint])) {
4749
$nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
4850
} else {
4951
$cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
52+
$fromUnicode[$codepoint] = $bytes;
5053
}
5154
}
5255
}
@@ -63,21 +66,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
6366
}
6467
}
6568

66-
/* There are 396 Unicode codepoints which are non-invertible in CP932
67-
* (multiple CP932 byte sequences map to the same codepoint) */
68-
for ($i = 0xED00; $i <= 0xEEFF; $i++) {
69-
$bytes = pack('n', shiftJISDecode($i));
70-
if (isset($cp932Chars[$bytes])) {
71-
$nonInvertible[$bytes] = $cp932Chars[$bytes];
72-
unset($cp932Chars[$bytes]); // will test these separately
73-
}
74-
}
75-
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C] as $i) {
76-
$bytes = pack('n', shiftJISDecode($i));
77-
$nonInvertible[$bytes] = $cp932Chars[$bytes];
78-
unset($cp932Chars[$bytes]); // will test these separately
79-
}
80-
8169
/* Read in table of all characters in JISX-0201 charset */
8270
$jisx0201Chars = array(); /* JISX0201 -> UTF-16BE */
8371
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r+');
@@ -89,6 +77,18 @@ while ($line = fgets($fp, 256)) {
8977
$jisx0201Chars[chr($byte)] = pack('n', $codepoint);
9078
}
9179

80+
/* Read in table of all characters in JISX-0212 charset */
81+
$jisx0212Chars = array();
82+
$fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r+');
83+
while ($line = fgets($fp, 256)) {
84+
if ($line[0] == '#')
85+
continue;
86+
87+
if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) == 2) {
88+
$jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
89+
}
90+
}
91+
9292
/* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
9393
* differ in a number of places from the table provided by the Unicode Consortium */
9494
$cp932Chars["\x21\x41"] = "\x30\x1C"; /* WAVE DASH instead of FULLWIDTH TILDE */
@@ -151,6 +151,10 @@ for ($i = 0x80; $i < 256; $i++) {
151151
testInvalid("\x0F" . chr($i), "\x00%", 'CP50222');
152152
}
153153

154+
// Switch back to ASCII after a multibyte character
155+
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50221', false);
156+
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50222', false);
157+
154158
echo "ASCII support OK\n";
155159

156160
/* All valid JIS X 0201 characters
@@ -164,6 +168,7 @@ foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
164168
testValid($jisx0201, $utf16BE, 'CP50220', false);
165169
testValid($jisx0201, $utf16BE, 'CP50221', false);
166170
testValid($jisx0201, $utf16BE, 'CP50222', false);
171+
convertValidString($utf16BE, "\x0E" . chr(ord($jisx0201) - 0x80) . "\x0F", 'UTF-16BE', 'CP50222', false);
167172
} else { /* Latin */
168173
testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50220', $utf16BE > "\x00\x80");
169174
testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50221', $utf16BE > "\x00\x80");
@@ -182,6 +187,11 @@ for ($i = 0x80; $i < 256; $i++) {
182187
testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50222');
183188
}
184189

190+
/* Go from JIS X 0201 to ASCII or JIS X 0208 */
191+
convertValidString("\xFF\x61\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
192+
convertValidString("\xFF\x61\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
193+
convertValidString("\xFF\x61\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');
194+
185195
echo "JIS X 0201 support OK\n";
186196

187197
/* All valid CP932 characters */
@@ -196,6 +206,15 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
196206
testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50222', false);
197207
}
198208

209+
/* There are some conversions we support from Unicode -> CP5022x, but not in the opposite direction */
210+
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
211+
convertValidString("\x22\x25", "\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', $encoding, false);
212+
convertValidString("\xFF\x0D", "\x1B\$B\x21\x5D\x1B(B", 'UTF-16BE', $encoding, false);
213+
convertValidString("\xFF\xE0", "\x1B\$B\x21\x71\x1B(B", 'UTF-16BE', $encoding, false);
214+
convertValidString("\xFF\xE1", "\x1B\$B\x21\x72\x1B(B", 'UTF-16BE', $encoding, false);
215+
convertValidString("\xFF\xE2", "\x1B\$B\x22\x4C\x1B(B", 'UTF-16BE', $encoding, false);
216+
}
217+
199218
/* All invalid 2-byte CP932 characters */
200219
for ($i = 0x21; $i <= 0x97; $i++) {
201220
for ($j = 0; $j < 256; $j++) {
@@ -215,8 +234,36 @@ for ($i = 0x21; $i <= 0x97; $i++) {
215234
testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');
216235
}
217236

237+
/* Test alternative escape sequence to select CP932 */
238+
testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);
239+
218240
echo "CP932 support OK\n";
219241

242+
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
243+
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50220', false);
244+
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50221', false);
245+
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50222', false);
246+
}
247+
248+
for ($i = 0x21; $i <= 0x97; $i++) {
249+
for ($j = 0; $j < 256; $j++) {
250+
$testString = chr($i) . chr($j);
251+
if (!isset($jisx0212Chars[$testString])) {
252+
testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50220');
253+
testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50221');
254+
testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50222');
255+
}
256+
}
257+
}
258+
259+
for ($i = 0x21; $i <= 0x97; $i++) {
260+
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50220');
261+
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50221');
262+
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50222');
263+
}
264+
265+
echo "JIS X 0212 support OK\n";
266+
220267
/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana */
221268
$fullwidthKatakana = array(
222269
0xFF61 => 0x2123, /* Ideographic full stop */
@@ -310,6 +357,7 @@ echo "Long error markers OK\n";
310357
ASCII support OK
311358
JIS X 0201 support OK
312359
CP932 support OK
360+
JIS X 0212 support OK
313361
Folding of fullwidth katakana for CP50220 OK
314362
Invalid Unicode is flagged when converting to CP5022x
315363
Long error markers OK

0 commit comments

Comments
 (0)