Skip to content

Commit

Permalink
Fix mb_detect_encoding's recognition of Slavic names
Browse files Browse the repository at this point in the history
Thanks to Côme Chilliet for reporting that mb_detect_encoding was not
detecting the desired text encoding for strings containing š or Ž.
These characters are used in Czech, Serbian, Croatian, Bosnian,
Macedonian, and other Slavic names.
  • Loading branch information
alexdowad committed May 24, 2022
1 parent 5017240 commit 9bb97ee
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 1 deletion.
2 changes: 2 additions & 0 deletions ext/mbstring/common_codepoints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
0x0118 0x0119 # Polish
0x0141 0x0144 # Polish
0x015A 0x015B # Polish
0x0160 0x0161 # Used in Slavic names
0x0179 0x017C # Polish
0x017D 0x017E # Used in Slavic names
0x0300 0x030A # Diacritical marks
0x0370 0x0377 # Greek
0x037A 0x037F # Greek
Expand Down
2 changes: 1 addition & 1 deletion ext/mbstring/rare_cp_bitvec.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

static uint32_t rare_codepoint_bitvec[] = {
0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0xe1ffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xfcffff0f, 0xffffffff, 0xf3ffffe1, 0x81fffffc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
Expand Down
8 changes: 8 additions & 0 deletions ext/mbstring/tests/mb_detect_encoding.phpt
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ END:VCARD
';
echo mb_detect_encoding($test, ['UTF-8', 'UTF-16']), "\n";

$test = 'Dušan';
echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8

$test = 'Živko';
echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-8

// We once had a problem where all kinds of strings would be detected as 'UUENCODE'
echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
Expand Down Expand Up @@ -246,6 +252,8 @@ ISO-8859-1
UTF-8
UTF-8
UTF-8
UTF-8
UTF-8
SJIS
== DETECT ORDER ==
JIS: JIS
Expand Down

0 comments on commit 9bb97ee

Please sign in to comment.