diff --git a/NEWS b/NEWS index ae3c521c60b55..482ccc4a5604d 100644 --- a/NEWS +++ b/NEWS @@ -21,6 +21,8 @@ PHP NEWS . mb_detect_encoding recognizes all letters in Czech alphabet (alexdowad) . mb_detect_encoding recognizes all letters in Hungarian alphabet (alexdowad) . Fixed bug GH-8685 (pcre not ready at mbstring startup). (Remi) + . Fixed bug GH-8281 (mb_convert_encoding "\" and "~" convert failed to + Shift_JIS). (alexdowad) - OPcache: . Fixed bug GH-8591 (tracing JIT crash after private instance method change). diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 188f162bf8785..96456b26e7eb6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -141,11 +141,7 @@ int mbfl_filt_conv_sjis_wchar(int c, mbfl_convert_filter *filter) switch (filter->status) { case 0: - if (c == 0x5C) { - CK((*filter->output_function)(0xA5, filter->data)); - } else if (c == 0x7E) { - CK((*filter->output_function)(0x203E, filter->data)); - } else if (c >= 0 && c < 0x80) { /* ASCII */ + if (c >= 0 && c < 0x80) { /* ASCII */ CK((*filter->output_function)(c, filter->data)); } else if (c > 0xA0 && c < 0xE0) { /* Kana */ CK((*filter->output_function)(0xFEC0 + c, filter->data)); @@ -197,17 +193,7 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) int c1, c2, s1, s2; s1 = 0; - if (c == 0x5C) { - /* Unicode 0x5C is a backslash; but Shift-JIS uses 0x5C for the - * Yen sign. JIS X 0208 kuten 0x2140 is a backslash. */ - s1 = 0x2140; - } else if (c == 0x7E) { - /* Unicode 0x7E is a tilde, but Shift-JIS uses 0x7E for overline (or - * macron). 
JIS X 0208 kuten 0x2141 is 'WAVE DASH' */ - s1 = 0x2141; - } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ - s1 = 0x7E; /* Halfwidth overline/macron */ - } else if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { + if (c >= ucs_a1_jis_table_min && c < ucs_a1_jis_table_max) { s1 = ucs_a1_jis_table[c - ucs_a1_jis_table_min]; } else if (c >= ucs_a2_jis_table_min && c < ucs_a2_jis_table_max) { s1 = ucs_a2_jis_table[c - ucs_a2_jis_table_min]; @@ -218,7 +204,9 @@ int mbfl_filt_conv_wchar_sjis(int c, mbfl_convert_filter *filter) } if (s1 <= 0) { if (c == 0xA5) { /* YEN SIGN */ - s1 = 0x5C; + s1 = 0x216F; /* FULLWIDTH YEN SIGN */ + } else if (c == 0xAF || c == 0x203E) { /* U+00AF is MACRON, U+203E is OVERLINE */ + s1 = 0x2131; /* FULLWIDTH MACRON */ } else if (c == 0xFF3C) { /* FULLWIDTH REVERSE SOLIDUS */ s1 = 0x2140; } else if (c == 0xFF5E) { /* FULLWIDTH TILDE */ diff --git a/ext/mbstring/tests/sjis_encoding.phpt b/ext/mbstring/tests/sjis_encoding.phpt index aece04b0f2221..c94cc464714c1 100644 --- a/ext/mbstring/tests/sjis_encoding.phpt +++ b/ext/mbstring/tests/sjis_encoding.phpt @@ -20,13 +20,37 @@ for ($i = 0; $i < 0x20; $i++) { $fromUnicode["\x00" . chr($i)] = chr($i); } -/* U+007E is TILDE; convert to Shift-JIS 0x8160 (WAVE DASH) */ -$fromUnicode["\x00\x7E"] = "\x81\x60"; +/* According to the relevant Japanese Industrial Standards Committee standards, + * SJIS 0x5C is a Yen sign, and 0x7E is an overline. + * + * However, this conflicts with the implementation of SJIS in various legacy + * software (notably Microsoft products), where SJIS 0x5C and 0x7E are taken + * as equivalent to the same ASCII bytes. + * + * Prior to PHP 8.1, mbstring's implementation of SJIS handled these bytes + * compatibly with Microsoft products. This was changed in PHP 8.1.0, in an + * attempt to comply with the JISC specifications. 
However, after discussion + * with various concerned Japanese developers, it seems that the historical + * behavior was more useful in the majority of applications which process + * SJIS-encoded text. */ +$validChars["\x5C"] = "\x00\x5C"; +$validChars["\x7E"] = "\x00\x7E"; +$fromUnicode["\x00\x5C"] = "\x5C"; +$fromUnicode["\x00\x7E"] = "\x7E"; + +/* That means it does not make sense to convert U+203E (OVERLINE) + * to 0x7E; convert it to JIS X 0208 FULLWIDTH MACRON instead */ +$fromUnicode["\x20\x3E"] = "\x81\x50"; +/* U+00AF is MACRON; convert that to FULLWIDTH MACRON as well */ +$fromUnicode["\x00\xAF"] = "\x81\x50"; +/* Since we are treating 0x5C as equivalent to U+005C, it does not + * make sense to convert U+00A5 (YEN SIGN) to 0x5C. + * Convert it to JIS X 0208 FULLWIDTH YEN SIGN instead */ +$fromUnicode["\x00\xA5"] = "\x81\x8F"; + /* DEL character */ $validChars["\x7F"] = "\x00\x7F"; $fromUnicode["\x00\x7F"] = "\x7F"; -/* U+00AF is MACRON; Shift-JIS 0x7E is overline */ -$fromUnicode["\x00\xAF"] = "\x7E"; /* Use fullwidth reverse solidus, not (halfwidth) backslash (0x5C) */ $validChars["\x81\x5F"] = "\xFF\x3C"; $fromUnicode["\xFF\x3C"] = "\x81\x5F";