Permalink
Browse files

Merge branch 'MDL-62042-master' of https://github.com/sammarshallou/m…

  • Loading branch information...
dmonllao committed Apr 23, 2018
2 parents c5a8065 + ffa868a commit 590c774d37684f9c717793fa81049ae1374ba277
Showing with 99 additions and 0 deletions.
  1. +38 −0 lib/classes/text.php
  2. +21 −0 lib/tests/text_test.php
  3. +3 −0 search/classes/document.php
  4. +37 −0 search/engine/solr/tests/engine_test.php
@@ -48,6 +48,11 @@
*/
class core_text {
/**
* @var string[] Array of strings representing Unicode non-characters
*/
protected static $noncharacters;
/**
* Return t3lib helper class, which is used for conversion between charsets
*
@@ -628,6 +633,39 @@ public static function trim_utf8_bom($str) {
return $str;
}
/**
 * Removes all Unicode non-characters from a string.
 *
 * The Unicode standard reserves 66 code points as non-characters: the last two code points
 * of each of the 17 planes (U+FFFE and U+FFFF, U+1FFFE and U+1FFFF, ... U+10FFFE and
 * U+10FFFF) and the range U+FDD0 to U+FDEF. These can cause problems for some processing.
 *
 * Note that U+FFFE is the byte-swapped form of the byte-order mark; the byte-order mark
 * itself (U+FEFF) is a valid character and is NOT removed by this function.
 *
 * The non-characters are removed using plain string replacement on their UTF-8 byte
 * sequences (any number of occurrences, anywhere in the string), so the rest of the string
 * remains unchanged.
 *
 * @param string $value Input string (expected to be UTF-8)
 * @return string Cleaned string value
 * @since Moodle 3.5
 */
public static function remove_unicode_non_characters($value) {
    // Set up the list of all Unicode non-characters once, for fast replacing on later calls.
    if (!self::$noncharacters) {
        // Decode explicitly as UTF-8 with the HTML 4.01 rules. Without these arguments the
        // result depends on the default_charset ini setting and on the PHP version's default
        // flags; under a non-UTF-8 charset the entities would not decode to the intended UTF-8
        // sequences, and other document types (HTML5, XML) refuse to decode non-characters at all.
        $decode = function($codepoint) {
            return html_entity_decode('&#x' . dechex($codepoint) . ';', ENT_QUOTES | ENT_HTML401, 'UTF-8');
        };

        $list = [];
        // This list of characters is based on the Unicode standard. It includes the last two
        // code points of each of the code planes 0-16 inclusive...
        for ($plane = 0; $plane <= 16; $plane++) {
            $planestart = $plane << 16;
            $list[] = $decode($planestart | 0xfffe);
            $list[] = $decode($planestart | 0xffff);
        }
        // ...and the character range U+FDD0 to U+FDEF.
        for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
            $list[] = $decode($char);
        }

        // Only publish the table once it is complete.
        self::$noncharacters = $list;
    }

    // Do character replacement.
    return str_replace(self::$noncharacters, '', $value);
}
/**
* Returns encoding options for select boxes, utf-8 and platform encoding first
*
@@ -412,6 +412,27 @@ public function test_trim_utf8_bom() {
$this->assertSame($str.$bom, core_text::trim_utf8_bom($bom.$str.$bom));
}
/**
 * Exercises core_text::remove_unicode_non_characters().
 *
 * Checks that text without non-characters is returned unchanged, that non-characters are
 * stripped from any position while other characters survive, and that a string made up of
 * a single non-character becomes the empty string.
 */
public function test_remove_unicode_non_characters() {
    // Ordinary text has nothing to strip and must come back untouched.
    $plain = 'Frogs!';
    $this->assertSame($plain, core_text::remove_unicode_non_characters($plain));

    // U+FFFD and U+1D15F look alarming but are not non-characters, so they must be kept.
    $unusual = html_entity_decode('A&#xfffd;&#x1d15f;B');
    $this->assertSame($unusual, core_text::remove_unicode_non_characters($unusual));

    // Non-characters at the start, middle and end are all dropped; everything else stays.
    $mixed = html_entity_decode('&#xfffe;A&#xffff;B&#x8fffe;C&#xfdd0;D&#xfffd;E&#xfdd5;');
    $cleaned = core_text::remove_unicode_non_characters($mixed);
    $this->assertSame(html_entity_decode('ABCD&#xfffd;E'), $cleaned);

    // A string consisting solely of a non-character collapses to the empty string.
    $lonely = html_entity_decode('&#xfffe;');
    $this->assertSame('', core_text::remove_unicode_non_characters($lonely));
}
/**
* Tests the static get_encodings method.
*/
@@ -291,6 +291,9 @@ public function set($fieldname, $value) {
if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
$this->data[$fieldname] = intval($value);
} else {
// Remove disallowed Unicode characters.
$value = \core_text::remove_unicode_non_characters($value);
// Replace all groups of line breaks and spaces by single spaces.
$this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
if ($this->data[$fieldname] === null) {
@@ -1134,6 +1134,43 @@ public function test_ordering() {
$this->assertEquals('C1P', $results[0]->get('title'));
}
/**
 * Indexes records whose content starts with a Unicode non-character and checks that
 * indexing completes and every record can still be found by searching.
 */
public function test_bogus_content() {
    $generator = $this->getDataGenerator();
    $course1 = $generator->create_course(['fullname' => 'Course 1']);
    $course1context = \context_course::instance($course1->id);

    // Content containing these code points can be entered into a Moodle database. They are
    // Unicode non-characters (U+FFFE is also the byte-swapped form of the byte order mark)
    // and, if sent to Solr, the first two cause failures. Unicode Standard Version 9.0 -
    // Core Specification, section 23.7, lists 66 non-characters in total; the remaining
    // entries are a sample of the others, which were reported to work but may depend on
    // platform.
    $entities = [
        '&#xfffe;',
        '&#xffff;',
        '&#xfdd0;',
        '&#xfdef;',
        '&#x1fffe;',
        '&#x10ffff;',
    ];

    // One search record per non-character, each followed by a searchable word.
    foreach ($entities as $entity) {
        $boguscontent = html_entity_decode($entity) . 'frog';
        $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    }

    // Do the indexing (this will check it doesn't throw warnings).
    $this->search->index();

    // Confirm that all 6 documents are found in search.
    $querydata = new stdClass();
    $querydata->q = 'frog';
    $results = $this->search->search($querydata);
    $this->assertCount(6, $results);
}
/**
* Adds a record to the mock search area, so that the search engine can find it later.
*

0 comments on commit 590c774

Please sign in to comment.