@@ -34,6 +34,7 @@ function shiftJISDecode($bytes) {
34
34
/* Read in table of all characters in CP932 charset */
35
35
$ cp932Chars = array (); /* CP932 -> UTF-16BE */
36
36
$ nonInvertible = array ();
37
+ $ fromUnicode = array ();
37
38
$ fp = fopen (__DIR__ . '/data/CP932.txt ' , 'r+ ' );
38
39
while ($ line = fgets ($ fp , 256 )) {
39
40
if ($ line [0 ] == '# ' )
@@ -43,10 +44,12 @@ while ($line = fgets($fp, 256)) {
43
44
if ($ bytes < 256 )
44
45
continue ;
45
46
46
- if ($ bytes >= 0xFA00 ) {
47
+
48
+ if (isset ($ fromUnicode [$ codepoint ])) {
47
49
$ nonInvertible [pack ('n ' , shiftJISDecode ($ bytes ))] = pack ('n ' , $ codepoint );
48
50
} else {
49
51
$ cp932Chars [pack ('n ' , shiftJISDecode ($ bytes ))] = pack ('n ' , $ codepoint );
52
+ $ fromUnicode [$ codepoint ] = $ bytes ;
50
53
}
51
54
}
52
55
}
@@ -63,21 +66,6 @@ for ($i = 0xF0; $i <= 0xF9; $i++) {
63
66
}
64
67
}
65
68
66
- /* There are 396 Unicode codepoints which are non-invertible in CP932
67
- * (multiple CP932 byte sequences map to the same codepoint) */
68
- for ($ i = 0xED00 ; $ i <= 0xEEFF ; $ i ++) {
69
- $ bytes = pack ('n ' , shiftJISDecode ($ i ));
70
- if (isset ($ cp932Chars [$ bytes ])) {
71
- $ nonInvertible [$ bytes ] = $ cp932Chars [$ bytes ];
72
- unset($ cp932Chars [$ bytes ]); // will test these separately
73
- }
74
- }
75
- foreach ([0x8790 , 0x8791 , 0x8792 , 0x8795 , 0x8796 , 0x8797 , 0x879A , 0x879B , 0x879C ] as $ i ) {
76
- $ bytes = pack ('n ' , shiftJISDecode ($ i ));
77
- $ nonInvertible [$ bytes ] = $ cp932Chars [$ bytes ];
78
- unset($ cp932Chars [$ bytes ]); // will test these separately
79
- }
80
-
81
69
/* Read in table of all characters in JISX-0201 charset */
82
70
$ jisx0201Chars = array (); /* JISX0201 -> UTF-16BE */
83
71
$ fp = fopen (__DIR__ . '/data/JISX0201.txt ' , 'r+ ' );
@@ -89,6 +77,18 @@ while ($line = fgets($fp, 256)) {
89
77
$ jisx0201Chars [chr ($ byte )] = pack ('n ' , $ codepoint );
90
78
}
91
79
80
+ /* Read in table of all characters in JISX-0212 charset */
81
+ $ jisx0212Chars = array ();
82
+ $ fp = fopen (__DIR__ . '/data/JISX0212.txt ' , 'r+ ' );
83
+ while ($ line = fgets ($ fp , 256 )) {
84
+ if ($ line [0 ] == '# ' )
85
+ continue ;
86
+
87
+ if (sscanf ($ line , "0x%x \t0x%x " , $ bytes , $ codepoint ) == 2 ) {
88
+ $ jisx0212Chars [pack ('n ' , $ bytes )] = pack ('n ' , $ codepoint );
89
+ }
90
+ }
91
+
92
92
/* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
93
93
* differ in a number of places from the table provided by the Unicode Consortium */
94
94
$ cp932Chars ["\x21\x41" ] = "\x30\x1C" ; /* WAVE DASH instead of FULLWIDTH TILDE */
@@ -151,6 +151,10 @@ for ($i = 0x80; $i < 256; $i++) {
151
151
testInvalid ("\x0F" . chr ($ i ), "\x00% " , 'CP50222 ' );
152
152
}
153
153
154
+ // Switch back to ASCII after a multibyte character
155
+ convertValidString ("\x30\x00\x00a \x00b \x00c " , "\x1B\$B \x21\x21\x1B(Babc " , 'UTF-16BE ' , 'CP50221 ' , false );
156
+ convertValidString ("\x30\x00\x00a \x00b \x00c " , "\x1B\$B \x21\x21\x1B(Babc " , 'UTF-16BE ' , 'CP50222 ' , false );
157
+
154
158
echo "ASCII support OK \n" ;
155
159
156
160
/* All valid JIS X 0201 characters
@@ -164,6 +168,7 @@ foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
164
168
testValid ($ jisx0201 , $ utf16BE , 'CP50220 ' , false );
165
169
testValid ($ jisx0201 , $ utf16BE , 'CP50221 ' , false );
166
170
testValid ($ jisx0201 , $ utf16BE , 'CP50222 ' , false );
171
+ convertValidString ($ utf16BE , "\x0E" . chr (ord ($ jisx0201 ) - 0x80 ) . "\x0F" , 'UTF-16BE ' , 'CP50222 ' , false );
167
172
} else { /* Latin */
168
173
testValid ("\x1B(J " . $ jisx0201 , $ utf16BE , 'CP50220 ' , $ utf16BE > "\x00\x80" );
169
174
testValid ("\x1B(J " . $ jisx0201 , $ utf16BE , 'CP50221 ' , $ utf16BE > "\x00\x80" );
@@ -182,6 +187,11 @@ for ($i = 0x80; $i < 256; $i++) {
182
187
testInvalid ("\x1B(J " . chr ($ i ), "\x00% " , 'CP50222 ' );
183
188
}
184
189
190
+ /* Go from JIS X 0201 to ASCII or JIS X 0208 */
191
+ convertValidString ("\xFF\x61\x00A " , "\x0E\x21\x0FA " , 'UTF-16BE ' , 'CP50222 ' , false );
192
+ convertValidString ("\xFF\x61\x22\x25" , "\x0E\x21\x0F\x1B\$B \x21\x42\x1B(B " , 'UTF-16BE ' , 'CP50222 ' , false );
193
+ convertValidString ("\xFF\x61\x20\x3E" , "\x0E\x21\x0F\x1B(J \x7E\x1B(B " , 'UTF-16BE ' , 'CP50222 ' );
194
+
185
195
echo "JIS X 0201 support OK \n" ;
186
196
187
197
/* All valid CP932 characters */
@@ -196,6 +206,15 @@ foreach ($nonInvertible as $cp932 => $utf16BE) {
196
206
testValid ("\x1B\$B " . $ cp932 , $ utf16BE , 'CP50222 ' , false );
197
207
}
198
208
209
+ /* There are some conversions we support from Unicode -> CP5022x, but not in the opposite direction */
210
+ foreach (['CP50220 ' , 'CP50221 ' , 'CP50222 ' ] as $ encoding ) {
211
+ convertValidString ("\x22\x25" , "\x1B\$B \x21\x42\x1B(B " , 'UTF-16BE ' , $ encoding , false );
212
+ convertValidString ("\xFF\x0D" , "\x1B\$B \x21\x5D\x1B(B " , 'UTF-16BE ' , $ encoding , false );
213
+ convertValidString ("\xFF\xE0" , "\x1B\$B \x21\x71\x1B(B " , 'UTF-16BE ' , $ encoding , false );
214
+ convertValidString ("\xFF\xE1" , "\x1B\$B \x21\x72\x1B(B " , 'UTF-16BE ' , $ encoding , false );
215
+ convertValidString ("\xFF\xE2" , "\x1B\$B \x22\x4C\x1B(B " , 'UTF-16BE ' , $ encoding , false );
216
+ }
217
+
199
218
/* All invalid 2-byte CP932 characters */
200
219
for ($ i = 0x21 ; $ i <= 0x97 ; $ i ++) {
201
220
for ($ j = 0 ; $ j < 256 ; $ j ++) {
@@ -215,8 +234,36 @@ for ($i = 0x21; $i <= 0x97; $i++) {
215
234
testInvalid ("\x1B\$B " . chr ($ i ), "\x00% " , 'CP50222 ' );
216
235
}
217
236
237
+ /* Test alternative escape sequence to select CP932 */
238
+ testValid ("\x1B\$(B \x21\x21" , "\x30\x00" , 'CP50220 ' , false );
239
+
218
240
echo "CP932 support OK \n" ;
219
241
242
+ foreach ($ jisx0212Chars as $ jisx0212 => $ utf16BE ) {
243
+ testValid ("\x1B\$(D " . $ jisx0212 , $ utf16BE , 'CP50220 ' , false );
244
+ testValid ("\x1B\$(D " . $ jisx0212 , $ utf16BE , 'CP50221 ' , false );
245
+ testValid ("\x1B\$(D " . $ jisx0212 , $ utf16BE , 'CP50222 ' , false );
246
+ }
247
+
248
+ for ($ i = 0x21 ; $ i <= 0x97 ; $ i ++) {
249
+ for ($ j = 0 ; $ j < 256 ; $ j ++) {
250
+ $ testString = chr ($ i ) . chr ($ j );
251
+ if (!isset ($ jisx0212Chars [$ testString ])) {
252
+ testInvalid ("\x1B\$(D " . $ testString , "\x00% " , 'CP50220 ' );
253
+ testInvalid ("\x1B\$(D " . $ testString , "\x00% " , 'CP50221 ' );
254
+ testInvalid ("\x1B\$(D " . $ testString , "\x00% " , 'CP50222 ' );
255
+ }
256
+ }
257
+ }
258
+
259
+ for ($ i = 0x21 ; $ i <= 0x97 ; $ i ++) {
260
+ testInvalid ("\x1B\$(D " . chr ($ i ), "\x00% " , 'CP50220 ' );
261
+ testInvalid ("\x1B\$(D " . chr ($ i ), "\x00% " , 'CP50221 ' );
262
+ testInvalid ("\x1B\$(D " . chr ($ i ), "\x00% " , 'CP50222 ' );
263
+ }
264
+
265
+ echo "JIS X 0212 support OK \n" ;
266
+
220
267
/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana */
221
268
$ fullwidthKatakana = array (
222
269
0xFF61 => 0x2123 , /* Ideographic full stop */
@@ -310,6 +357,7 @@ echo "Long error markers OK\n";
310
357
ASCII support OK
311
358
JIS X 0201 support OK
312
359
CP932 support OK
360
+ JIS X 0212 support OK
313
361
Folding of fullwidth katakana for CP50220 OK
314
362
Invalid Unicode is flagged when converting to CP5022x
315
363
Long error markers OK
0 commit comments