Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6,994 changes: 6,994 additions & 0 deletions ext/mbstring/common_codepoints.txt

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions ext/mbstring/gen_rare_cp_bitvec.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#!/usr/bin/env php
<?php
/* Generates rare_cp_bitvec.h (next to this script) from a list of 'common'
 * codepoint ranges.
 *
 * Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt
 *
 * Input format, one range per line: two hexadecimal codepoints (first and
 * last, inclusive) separated by a tab. Anything after '#' is a comment and
 * blank lines are ignored. Every codepoint inside a listed range gets its
 * bit cleared; all other bits stay set, which marks those codepoints 'rare'.
 *
 * Exits with status 1 (message on stderr) on a usage error, an unreadable
 * input file, a malformed line, or a failed write. */

if ($argc < 2) {
    fwrite(STDERR, "Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt\n");
    exit(1);
}

/* The table covers U+0000..U+FFFF, packed 32 codepoints per word. Integer
 * arithmetic keeps the word count an exact int (2048); a float count would
 * have to be truncated implicitly when used as an array size. */
$maxCodepoint = 0xFFFF;
$wordCount = ($maxCodepoint + 1) >> 5;

$input = file_get_contents($argv[1]);
if ($input === false) {
    fwrite(STDERR, "Could not read " . $argv[1] . "\n");
    exit(1);
}

/* Start with every codepoint marked 'rare'; listed ranges clear their bits */
$bitvec = array_fill(0, $wordCount, 0xFFFFFFFF);

foreach (explode("\n", $input) as $lineIndex => $line) {
    /* Drop trailing comment, if any */
    $hashPos = strpos($line, '#');
    if ($hashPos !== false) {
        $line = substr($line, 0, $hashPos);
    }

    $line = trim($line);
    if ($line === '') {
        continue;
    }

    $range = explode("\t", $line);
    if (count($range) < 2) {
        /* Fail loudly rather than silently dropping the range */
        fwrite(STDERR, "Line " . ($lineIndex + 1) . ": expected two tab-separated codepoints\n");
        exit(1);
    }

    $start = hexdec($range[0]);
    /* Codepoints past the end of the table have no bit to clear */
    $end = min(hexdec($range[1]), $maxCodepoint);

    for ($i = $start; $i <= $end; $i++) {
        $bitvec[$i >> 5] &= ~(1 << ($i & 0x1F));
    }
}

$result = <<<'HEADER'
/* Machine-generated file; do not edit! See gen_rare_cp_bitvec.php.
*
* The below array has one bit for each Unicode codepoint from U+0000 to U+FFFF.
* The bit is 1 if the codepoint is considered 'rare' for the purpose of
* guessing the text encoding of a string.
*
* Each 'rare' codepoint which appears in a string when it is interpreted
* using a candidate encoding causes the candidate encoding to be treated
* as less likely to be the correct one.
*/

static uint32_t rare_codepoint_bitvec[] = {
HEADER;

/* Eight words per output line, each as a zero-padded 8-digit hex literal */
for ($i = 0; $i < $wordCount; $i++) {
    $result .= ($i % 8 === 0) ? "\n" : " ";
    $result .= "0x" . str_pad(dechex($bitvec[$i]), 8, '0', STR_PAD_LEFT) . ",";
}

$result .= "\n};\n";

if (file_put_contents(__DIR__ . '/rare_cp_bitvec.h', $result) === false) {
    fwrite(STDERR, "Could not write rare_cp_bitvec.h\n");
    exit(1);
}

echo "Done.\n";
?>
59 changes: 43 additions & 16 deletions ext/mbstring/libmbfl/mbfl/mbfilter.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
#include "filters/mbfilter_utf8.h"

#include "eaw_table.h"
#include "rare_cp_bitvec.h"

/* hex character table "0123456789ABCDEF" */
static char mbfl_hexchar_table[] = {
Expand Down Expand Up @@ -236,26 +237,52 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
/*
* encoding detector
*/
/* Per-codepoint scoring callback for the encoding detector.
*
* The first argument is one value obtained by decoding the input string
* under a candidate encoding; void_data points to the
* mbfl_encoding_detector_data being accumulated for that candidate.
* MBFL_BAD_INPUT increments num_illegalchars; any other value adds
* demerits to score. Always returns 0.
*
* NOTE(review): two signature lines and two generations of else-if
* branches appear together below; this looks like the pre- and
* post-change versions of the function shown side by side. Confirm which
* lines are live before relying on the exact demerit values. */
static int mbfl_estimate_encoding_likelihood(int c, void *void_data)
static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
{
mbfl_encoding_detector_data *data = void_data;

/* Receive wchars decoded from test string using candidate encoding
* If the test string was invalid in the candidate encoding, we assume
* it's the wrong one. */
/* Unsigned copy, so the range checks and the bit-vector index below
* never operate on a negative value */
unsigned int c = input_cp;

/* Receive wchars decoded from input string using candidate encoding.
* If the string was invalid in the candidate encoding, we assume
* it's the wrong one. Otherwise, give the candidate many 'demerits'
* for each 'rare' codepoint found, a smaller number for each ASCII
* punctuation character, and 1 for all other codepoints.
*
* The 'common' codepoints should cover the vast majority of
* codepoints we are likely to see in practice, while only covering
* a small minority of the entire Unicode encoding space. Why?
* Well, if the test string happens to be valid in an incorrect
* candidate encoding, the bogus codepoints which it decodes to will
* be more or less random. By treating the majority of codepoints as
* 'rare', we ensure that in almost all such cases, the bogus
* codepoints will include plenty of 'rares', thus giving the
* incorrect candidate encoding lots of demerits. See
* common_codepoints.txt for the actual list used.
*
* So, why give extra demerits for ASCII punctuation characters? It's
* because there are some text encodings, like UTF-7, HZ, and ISO-2022,
* which deliberately only use bytes in the ASCII range. When
* misinterpreted as ASCII/UTF-8, strings in these encodings will
* have an unusually high number of ASCII punctuation characters.
* So giving extra demerits for such characters will improve
* detection accuracy for UTF-7 and similar encodings.
*
* Finally, why 1 demerit for all other characters? That penalizes
* long strings, meaning we will tend to choose a candidate encoding
* in which the test string decodes to a smaller number of
* codepoints. That prevents single-byte encodings in which almost
* every possible input byte decodes to a 'common' codepoint from
* being favored too much. */
if (c == MBFL_BAD_INPUT) {
data->num_illegalchars++;
} else if (c < 0x9 || (c >= 0xE && c <= 0x1F) || (c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
/* Otherwise, count how many control characters and 'private use'
* codepoints we see. Those are rarely used and may indicate that
* the candidate encoding is not the right one. */
data->score += 10;
} else if ((c >= 0x21 && c <= 0x2F) || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60)) {
/* Punctuation is also less common than letters/digits; further, if
* text in ISO-2022 or similar encodings is mistakenly identified as
* ASCII or UTF-8, the misinterpreted string will tend to have an
* unusually high density of ASCII punctuation characters. */
data->score++;
} else if (c > 0xFFFF) {
/* rare_codepoint_bitvec only has bits for U+0000..U+FFFF, so anything
* above that gets a flat penalty instead of a table lookup */
data->score += 40;
} else if (c >= 0x21 && c <= 0x2F) {
/* ASCII punctuation in the range U+0021..U+002F */
data->score += 6;
} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
/* Word (c >> 5), bit (c & 0x1F): a set bit marks the codepoint 'rare' */
data->score += 30;
} else {
/* Everything else costs 1 demerit */
data->score += 1;
}
return 0;
}
Expand Down
25 changes: 25 additions & 0 deletions ext/mbstring/mbstring.c
Original file line number Diff line number Diff line change
Expand Up @@ -2662,6 +2662,23 @@ PHP_FUNCTION(mb_strtolower)
}
/* }}} */

/* Strip 'byte encodings' out of a list of candidate encodings.
 *
 * mbstring supports some 'text encodings' which aren't really text encodings
 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
 * These should never be returned by `mb_detect_encoding`.
 *
 * elist: array of *size encoding pointers. It is compacted in place; the
 *        entries that are kept stay in their original relative order.
 * size:  in/out. On return it holds the number of entries kept. Slots at or
 *        past the new *size are left unmodified. */
static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
{
    /* size_t counters match the type of *size, so the loop condition is not
     * a signed/unsigned comparison and cannot truncate on a large list */
    size_t kept = 0;

    for (size_t i = 0; i < *size; i++) {
        const mbfl_encoding *encoding = elist[i];

        /* Entries at or below mbfl_no_encoding_charset_min are dropped */
        if (encoding->no_encoding > mbfl_no_encoding_charset_min) {
            elist[kept++] = encoding;
        }
    }

    *size = kept;
}

/* {{{ Encodings of the given string is returned (as a string) */
PHP_FUNCTION(mb_detect_encoding)
{
Expand Down Expand Up @@ -2707,6 +2724,14 @@ PHP_FUNCTION(mb_detect_encoding)
RETURN_THROWS();
}

if (free_elist) {
remove_non_encodings_from_elist(elist, &size);
if (size == 0) {
efree(ZEND_VOIDP(elist));
RETURN_FALSE;
}
}

if (ZEND_NUM_ARGS() < 3) {
strict = MBSTRG(strict_detection);
}
Expand Down
Loading