Skip to content

Restore backwards-compatible mappings of 0x5C and 0x7E in SJIS #8719

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 5 additions & 17 deletions ext/mbstring/libmbfl/filters/mbfilter_sjis.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter)

switch (filter->status) {
case 0:
if (c == 0x5C) {
CK((*filter->output_function)(0xA5, filter->data));
} else if (c == 0x7E) {
CK((*filter->output_function)(0x203E, filter->data));
} else if (c >= 0 && c < 0x80) { /* ASCII */
if (c >= 0 && c < 0x80) { /* ASCII */
CK((*filter->output_function)(c, filter->data));
} else if (c > 0xA0 && c < 0xE0) { /* Kana */
CK((*filter->output_function)(0xFEC0 + c, filter->data));
Expand Down Expand Up @@ -197,17 +193,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
int c1, c2, s1, s2;

s1 = 0;
if (c == 0x5C) {
/* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the
* Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */
s1 = 0x2140;
} else if (c == 0x7E) {
/* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or
* macron). JIS X 0208 kuten 0x2141 is 'WAVE DASH' */
s1 = 0x2141;
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
s1 = 0x7E; /* Halfwidth overline/macron */
} else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) {
s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min];
} else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) {
s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min];
Expand All @@ -218,7 +204,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter)
}
if (s1 <= 0) {
if (c == 0xA5) { /* YEN SIGN */
s1 = 0x5C;
s1 = 0x216F; /* FULLWIDTH YEN SIGN */
} else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */
s1 = 0x2131; /* FULLWIDTH MACRON */
} else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */
s1 = 0x2140;
} else if (c == 0xFF5E) { /* FULLWIDTH TILDE */
Expand Down
32 changes: 28 additions & 4 deletions ext/mbstring/tests/sjis_encoding.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) {
$fromUnicode["\x00" . chr($i)] = chr($i);
}

/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */
$fromUnicode["\x00\x7E"] = "\x81\x60";
/* According to the relevant Japan Industrial Standards Committee standards,
* SJIS 0x5C is a Yen sign, and 0x7E is an overline.
*
* However, this conflicts with the implementation of SJIS in various legacy
* software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken
* as equivalent to the same ASCII bytes.
*
* Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes
* compatibly with Microsoft products. This was changed in PHP 8.1.0, in an
* attempt to comply with the JISC specifications. However, after discussion
* with various concerned Japanese developers, it seems that the historical
* behavior was more useful in the majority of applications which process
* SJIS-encoded text. */
$validChars["\x5C"] = "\x00\x5C";
$validChars["\x7E"] = "\x00\x7E";
$fromUnicode["\x00\x5C"] = "\x5C";
$fromUnicode["\x00\x7E"] = "\x7E";

/* That means it does not make sense to convert U+203E (OVERLINE)
* to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */
$fromUnicode["\x20\x3E"] = "\x81\x50";
/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */
$fromUnicode["\x00\xAF"] = "\x81\x50";
/* Since we are treating 0x5C as equivalent to U+005C, it does not
* make sense to convert U+00A5 (YEN SIGN) to 0x5C.
* Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */
$fromUnicode["\x00\xA5"] = "\x81\x8F";

/* DEL character */
$validChars["\x7F"] = "\x00\x7F";
$fromUnicode["\x00\x7F"] = "\x7F";
/* U+00AF is MACRON; Shift-JIS 0x7E is overline */
$fromUnicode["\x00\xAF"] = "\x7E";
/* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */
$validChars["\x81\x5F"] = "\xFF\x3C";
$fromUnicode["\xFF\x3C"] = "\x81\x5F";
Expand Down