Skip to content

Commit

Permalink
MDL-62042 core_search: Unicode non-characters cause indexing problems
Browse files Browse the repository at this point in the history
Unicode non-characters such as U+FFFE can be entered into Moodle data and
cause indexing failures. This change strips them out of search fields.
  • Loading branch information
sammarshallou committed Apr 20, 2018
1 parent 9a6b854 commit 5262ae8
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 0 deletions.
38 changes: 38 additions & 0 deletions lib/classes/text.php
Expand Up @@ -48,6 +48,11 @@
*/
class core_text {

/**
* @var string[] Array of strings representing Unicode non-characters
*/
protected static $noncharacters;

/**
* Return t3lib helper class, which is used for conversion between charsets
*
Expand Down Expand Up @@ -628,6 +633,39 @@ public static function trim_utf8_bom($str) {
return $str;
}

/**
 * Removes all Unicode non-characters from a UTF-8 string.
 *
 * There are 66 Unicode non-characters: the last two code points of each of the planes 0-16
 * (U+FFFE and U+FFFF, U+1FFFE and U+1FFFF, ... U+10FFFE and U+10FFFF) plus the range
 * U+FDD0 to U+FDEF. These can cause problems for some processing.
 *
 * Note that U+FFFE is what a byte-order mark looks like when it is read with the wrong byte
 * order. The byte-order mark itself (U+FEFF) is not a non-character and is not removed here.
 *
 * This function removes the characters using string replace, so every occurrence is removed
 * (they may appear multiple times in a string) and the rest of the string remains unchanged.
 *
 * @param string $value Input string (UTF-8)
 * @return string Cleaned string value
 * @since Moodle 3.4.3
 */
public static function remove_unicode_non_characters($value) {
    // Set up list of all Unicode non-characters once, then reuse it for fast replacing.
    if (self::$noncharacters === null) {
        // Flags and encoding are given explicitly so that the result does not depend on the
        // default_charset ini setting or on the PHP version's default flags. ENT_HTML401 is
        // chosen on purpose: stricter document types may refuse to decode these code points.
        $decode = function($codepoint) {
            return html_entity_decode('&#x' . dechex($codepoint) . ';',
                    ENT_QUOTES | ENT_HTML401, 'UTF-8');
        };

        $noncharacters = [];
        // This list of characters is based on the Unicode standard. It includes the last two
        // code points of each of the code planes 0-16 inclusive...
        for ($plane = 0; $plane <= 16; $plane++) {
            $planestart = $plane << 16;
            $noncharacters[] = $decode($planestart | 0xfffe);
            $noncharacters[] = $decode($planestart | 0xffff);
        }
        // ...and the character range U+FDD0 to U+FDEF.
        for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
            $noncharacters[] = $decode($char);
        }

        // Only publish the list once it is complete.
        self::$noncharacters = $noncharacters;
    }

    // Do character replacement.
    return str_replace(self::$noncharacters, '', $value);
}

/**
* Returns encoding options for select boxes, utf-8 and platform encoding first
*
Expand Down
21 changes: 21 additions & 0 deletions lib/tests/text_test.php
Expand Up @@ -412,6 +412,27 @@ public function test_trim_utf8_bom() {
$this->assertSame($str.$bom, core_text::trim_utf8_bom($bom.$str.$bom));
}

/**
 * Checks that core_text::remove_unicode_non_characters() strips non-characters and leaves
 * every other character alone.
 */
public function test_remove_unicode_non_characters() {
    // Ordinary text has nothing to strip, so it must come back identical.
    $plain = 'Frogs!';
    $this->assertSame($plain, core_text::remove_unicode_non_characters($plain));

    // Unusual characters that are not non-characters (U+FFFD, U+1D15F) must be kept too.
    $unusual = html_entity_decode('A&#xfffd;&#x1d15f;B');
    $this->assertSame($unusual, core_text::remove_unicode_non_characters($unusual));

    // Non-characters at the start, in the middle and at the end are all dropped, while the
    // surrounding characters stay in their original order.
    $mixed = html_entity_decode('&#xfffe;A&#xffff;B&#x8fffe;C&#xfdd0;D&#xfffd;E&#xfdd5;');
    $cleaned = core_text::remove_unicode_non_characters($mixed);
    $this->assertSame(html_entity_decode('ABCD&#xfffd;E'), $cleaned);

    // A string made up of a single non-character becomes the empty string.
    $lone = html_entity_decode('&#xfffe;');
    $this->assertSame('', core_text::remove_unicode_non_characters($lone));
}

/**
* Tests the static get_encodings method.
*/
Expand Down
3 changes: 3 additions & 0 deletions search/classes/document.php
Expand Up @@ -278,6 +278,9 @@ public function set($fieldname, $value) {
if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
$this->data[$fieldname] = intval($value);
} else {
// Remove disallowed Unicode characters.
$value = \core_text::remove_unicode_non_characters($value);

// Replace all groups of line breaks and spaces by single spaces.
$this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
if ($this->data[$fieldname] === null) {
Expand Down
54 changes: 54 additions & 0 deletions search/engine/solr/tests/engine_test.php
Expand Up @@ -758,4 +758,58 @@ public function test_manager_paged_search($fileindexing) {
$this->assertCount(10, $results->results);
$this->assertEquals(1, $results->actualpage);
}

/**
 * Tests with bogus content (that can be entered into Moodle) to see if it crashes.
 *
 * One search record is created per Unicode non-character, each with content made of that
 * non-character followed by the word 'frog'. After indexing, a search for 'frog' must return
 * every one of those records.
 */
public function test_bogus_content() {
    $generator = $this->getDataGenerator();
    $course1 = $generator->create_course(['fullname' => 'Course 1']);
    $course1context = \context_course::instance($course1->id);

    $entities = [
        // It is possible to enter into a Moodle database content containing these characters,
        // which are Unicode non-characters / byte order marks. If sent to Solr, these cause
        // failures.
        '&#xfffe;',
        '&#xffff;',
        // Unicode Standard Version 9.0 - Core Specification, section 23.7, lists 66
        // non-characters in total. Here are some of them - these work OK for me but it may
        // depend on platform.
        '&#xfdd0;',
        '&#xfdef;',
        '&#x1fffe;',
        '&#x10ffff;',
    ];

    foreach ($entities as $entity) {
        // Explicit flags and encoding keep the decoded bytes independent of default_charset.
        $boguscontent = html_entity_decode($entity, ENT_QUOTES | ENT_HTML401, 'UTF-8') . 'frog';
        $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    }

    // Do the indexing (this will check it doesn't throw warnings).
    $this->search->index();

    // Confirm that every document created above is found in search. The expected count is
    // taken from the list itself so the two cannot drift apart.
    $querydata = new \stdClass();
    $querydata->q = 'frog';
    $results = $this->search->search($querydata);
    $this->assertCount(count($entities), $results);
}

/**
 * Creates one record in the mock search area, so that the search engine can find it later.
 *
 * The record is handed to the test generator's create_record() with exactly four fields:
 * content, title, courseid and contextid.
 *
 * @param int $courseid Course id
 * @param int $contextid Context id
 * @param string $title Title for search index
 * @param string $content Content for search index
 */
protected function create_search_record($courseid, $contextid, $title, $content) {
    // Casting the array gives a plain stdClass with the fields in the listed order.
    $this->generator->create_record((object) [
        'content' => $content,
        'title' => $title,
        'courseid' => $courseid,
        'contextid' => $contextid,
    ]);
}
}

0 comments on commit 5262ae8

Please sign in to comment.