php · alexdowad · Oct 9, 2021 · Oct 18, 2021
@@ -0,0 +1,61 @@
+#!/usr/bin/env php
+<?php
+
+if ($argc < 2) {
+    echo "Usage: php gen_rare_cp_bitvec.php ./common_codepoints.txt\n";
+    return;
+}
+
+$bitvec = array_fill(0, (0xFFFF / 32) + 1, 0xFFFFFFFF);
+
+$input = file_get_contents($argv[1]);
+foreach (explode("\n", $input) as $line) {
+    if (false !== $hashPos = strpos($line, '#')) {
+        $line = substr($line, 0, $hashPos);
+    }
+
+    $line = trim($line);
+    if ($line === '') {
+        continue;
+    }
+
+    $range = explode("\t", $line);
+    $start = hexdec($range[0]);
+    $end   = hexdec($range[1]);
+
+    for ($i = $start; $i <= $end; $i++) {
+        $bitvec[$i >> 5] &= ~(1 << ($i & 0x1F));
+    }
+}
+
+$result = <<<'HEADER'
+/* Machine-generated file; do not edit! See gen_rare_cp_bitvec.php.
+ *
+ * The below array has one bit for each Unicode codepoint from U+0000 to U+FFFF.
+ * The bit is 1 if the codepoint is considered 'rare' for the purpose of
+ * guessing the text encoding of a string.
+ *
+ * Each 'rare' codepoint which appears in a string when it is interpreted
+ * using a candidate encoding causes the candidate encoding to be treated
+ * as less likely to be the correct one.
+ */
+
+static uint32_t rare_codepoint_bitvec[] = {
+HEADER;
+
+for ($i = 0; $i < 0xFFFF / 32; $i++) {
+    if ($i % 8 === 0) {
+        $result .= "\n";
+    } else {
+        $result .= " ";
+    }
+
+    $result .= "0x" . str_pad(dechex($bitvec[$i]), 8, '0', STR_PAD_LEFT) . ",";
+}
+
+$result .= "\n};\n";
+
+file_put_contents(__DIR__ . '/rare_cp_bitvec.h', $result);
+
+echo "Done.\n";
+?>
@@ -95,6 +95,7 @@
 #include "filters/mbfilter_utf8.h"
 
 #include "eaw_table.h"
+#include "rare_cp_bitvec.h"
 
 /* hex character table "0123456789ABCDEF" */
 static char mbfl_hexchar_table[] = {
@@ -236,26 +237,52 @@ size_t mbfl_buffer_illegalchars(mbfl_buffer_converter *convd)
 /*
  * encoding detector
  */
-static int mbfl_estimate_encoding_likelihood(int c, void *void_data)
+static int mbfl_estimate_encoding_likelihood(int input_cp, void *void_data)
 {
 	mbfl_encoding_detector_data *data = void_data;
-
-	/* Receive wchars decoded from test string using candidate encoding
-	 * If the test string was invalid in the candidate encoding, we assume
-	 * it's the wrong one. */
+	unsigned int c = input_cp;
+
+	/* Receive wchars decoded from input string using candidate encoding.
+	 * If the string was invalid in the candidate encoding, we assume
+	 * it's the wrong one. Otherwise, give the candidate many 'demerits'
+	 * for each 'rare' codepoint found, a smaller number for each ASCII
+	 * punctuation character, and 1 for all other codepoints.
+	 *
+	 * The 'common' codepoints should cover the vast majority of
+	 * codepoints we are likely to see in practice, while only covering
+	 * a small minority of the entire Unicode encoding space. Why?
+	 * Well, if the test string happens to be valid in an incorrect
+	 * candidate encoding, the bogus codepoints which it decodes to will
+	 * be more or less random. By treating the majority of codepoints as
+	 * 'rare', we ensure that in almost all such cases, the bogus
+	 * codepoints will include plenty of 'rares', thus giving the
+	 * incorrect candidate encoding lots of demerits. See
+	 * common_codepoints.txt for the actual list used.
+	 *
+	 * So, why give extra demerits for ASCII punctuation characters? It's
+	 * because there are some text encodings, like UTF-7, HZ, and ISO-2022,
+	 * which deliberately only use bytes in the ASCII range. When
+	 * misinterpreted as ASCII/UTF-8, strings in these encodings will
+	 * have an unusually high number of ASCII punctuation characters.
+	 * So giving extra demerits for such characters will improve
+	 * detection accuracy for UTF-7 and similar encodings.
+	 *
+	 * Finally, why 1 demerit for all other characters? That penalizes
+	 * long strings, meaning we will tend to choose a candidate encoding
+	 * in which the test string decodes to a smaller number of
+	 * codepoints. That prevents single-byte encodings in which almost
+	 * every possible input byte decodes to a 'common' codepoint from
+	 * being favored too much. */
 	if (c == MBFL_BAD_INPUT) {
 		data->num_illegalchars++;
-	} else if (c < 0x9 || (c >= 0xE && c <= 0x1F) || (c >= 0xE000 && c <= 0xF8FF) || c >= 0xF0000) {
-		/* Otherwise, count how many control characters and 'private use'
-		 * codepoints we see. Those are rarely used and may indicate that
-		 * the candidate encoding is not the right one. */
-		data->score += 10;
-	} else if ((c >= 0x21 && c <= 0x2F) || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60)) {
-		/* Punctuation is also less common than letters/digits; further, if
-		 * text in ISO-2022 or similar encodings is mistakenly identified as
-		 * ASCII or UTF-8, the misinterpreted string will tend to have an
-		 * unusually high density of ASCII punctuation characters. */
-		data->score++;
+	} else if (c > 0xFFFF) {
+		data->score += 40;
+	} else if (c >= 0x21 && c <= 0x2F) {
+		data->score += 6;
+	} else if ((rare_codepoint_bitvec[c >> 5] >> (c & 0x1F)) & 1) {
+		data->score += 30;
+	} else {
+		data->score += 1;
 	}
 	return 0;
 }

@@ -2662,6 +2662,23 @@ PHP_FUNCTION(mb_strtolower)
 }
 /* }}} */
 
+static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
+{
+	/* mbstring supports some 'text encodings' which aren't really text encodings
+	 * at all, but really 'byte encodings', like Base64, QPrint, and so on.
+	 * These should never be returned by `mb_detect_encoding`. */
+	int shift = 0;
+	for (int i = 0; i < *size; i++) {
+		const mbfl_encoding *encoding = elist[i];
+		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
+			shift++; /* Remove this encoding from the list */
+		} else if (shift) {
+			elist[i - shift] = encoding;
+		}
+	}
+	*size -= shift;
+}
+
 /* {{{ Encodings of the given string is returned (as a string) */
 PHP_FUNCTION(mb_detect_encoding)
 {
@@ -2707,6 +2724,14 @@ PHP_FUNCTION(mb_detect_encoding)
 		RETURN_THROWS();
 	}
 
+	if (free_elist) {
+		remove_non_encodings_from_elist(elist, &size);
+		if (size == 0) {
+			efree(ZEND_VOIDP(elist));
+			RETURN_FALSE;
+		}
+	}
+
 	if (ZEND_NUM_ARGS() < 3) {
 		strict = MBSTRG(strict_detection);
 	}