Skip to content

Commit 590c774

Browse files
author
David Monllao
committed
Merge branch 'MDL-62042-master' of https://github.com/sammarshallou/moodle
2 parents c5a8065 + ffa868a commit 590c774

File tree

4 files changed

+99
-0
lines changed

4 files changed

+99
-0
lines changed

lib/classes/text.php

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,11 @@
4848
*/
4949
class core_text {
5050

51+
/**
52+
* @var string[] Array of strings representing Unicode non-characters
53+
*/
54+
protected static $noncharacters;
55+
5156
/**
5257
* Return t3lib helper class, which is used for conversion between charsets
5358
*
@@ -628,6 +633,39 @@ public static function trim_utf8_bom($str) {
628633
return $str;
629634
}
630635

636+
/**
 * Removes all Unicode non-characters from a UTF-8 string.
 *
 * There are a number of Unicode non-characters: the last two code points of each of the
 * code planes 0-16 (U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF) and the
 * range U+FDD0 to U+FDEF. They may appear multiple times in a string and can cause
 * problems for some processing.
 *
 * This function removes the characters using plain string replacement, so that the rest
 * of the string remains unchanged.
 *
 * @param string $value Input string
 * @return string Cleaned string value
 * @since Moodle 3.5
 */
public static function remove_unicode_non_characters($value) {
    // Set up the list of all Unicode non-characters once; later calls reuse the cached list.
    if (self::$noncharacters === null) {
        $codepoints = [];

        // This list is based on the Unicode standard. It includes the last two code points
        // of each of the code planes 0-16 inclusive...
        for ($plane = 0; $plane <= 16; $plane++) {
            $planestart = $plane << 16;
            $codepoints[] = $planestart | 0xfffe;
            $codepoints[] = $planestart | 0xffff;
        }

        // ...and the character range U+FDD0 to U+FDEF.
        for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
            $codepoints[] = $char;
        }

        $noncharacters = [];
        foreach ($codepoints as $codepoint) {
            // The document type and charset are passed explicitly so that the result does
            // not depend on the 'default_charset' ini setting or on PHP's default flags.
            // With a non-UTF-8 charset the references would not be decoded into UTF-8
            // non-characters, and the replacement below would then strip the wrong strings.
            // ENT_HTML401 is used because stricter document types (such as ENT_HTML5)
            // refuse to decode references to non-characters.
            $noncharacters[] = html_entity_decode('&#x' . dechex($codepoint) . ';',
                    ENT_NOQUOTES | ENT_HTML401, 'UTF-8');
        }

        // Assign only when the list is complete, so the cache is never left half-built.
        self::$noncharacters = $noncharacters;
    }

    // Do character replacement.
    return str_replace(self::$noncharacters, '', $value);
}
668+
631669
/**
632670
* Returns encoding options for select boxes, utf-8 and platform encoding first
633671
*

lib/tests/text_test.php

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,27 @@ public function test_trim_utf8_bom() {
412412
$this->assertSame($str.$bom, core_text::trim_utf8_bom($bom.$str.$bom));
413413
}
414414

415+
/**
 * Tests the static remove_unicode_non_characters method.
 *
 * Covers unchanged input, removal of non-characters in different positions, and the
 * boundaries of the non-character ranges (U+FDD0..U+FDEF and the last plane).
 */
public function test_remove_unicode_non_characters() {
    // Decode fixtures with an explicit document type and charset, so that the test data
    // does not depend on the 'default_charset' ini setting.
    $decode = function($html) {
        return html_entity_decode($html, ENT_NOQUOTES | ENT_HTML401, 'UTF-8');
    };

    // Confirm that texts which don't contain these characters are unchanged.
    $this->assertSame('Frogs!', core_text::remove_unicode_non_characters('Frogs!'));
    $this->assertSame('', core_text::remove_unicode_non_characters(''));

    // Even if they contain some very scary characters.
    $example = $decode('A&#xfffd;&#x1d15f;B');
    $this->assertSame($example, core_text::remove_unicode_non_characters($example));

    // Non-characters are removed wherever they may be, with other characters left.
    $example = $decode('&#xfffe;A&#xffff;B&#x8fffe;C&#xfdd0;D&#xfffd;E&#xfdd5;');
    $expected = $decode('ABCD&#xfffd;E');
    $this->assertSame($expected, core_text::remove_unicode_non_characters($example));

    // Both ends of the U+FDD0 to U+FDEF range are removed, but the code points
    // immediately outside the range (U+FDCF and U+FDF0) are kept.
    $example = $decode('&#xfdcf;A&#xfdd0;B&#xfdef;C&#xfdf0;');
    $expected = $decode('&#xfdcf;ABC&#xfdf0;');
    $this->assertSame($expected, core_text::remove_unicode_non_characters($example));

    // The last two code points of the highest plane are removed, the one before is kept.
    $example = $decode('A&#x10fffd;B&#x10fffe;C&#x10ffff;D');
    $expected = $decode('A&#x10fffd;BCD');
    $this->assertSame($expected, core_text::remove_unicode_non_characters($example));

    // The same non-character may appear multiple times; every occurrence is removed.
    $example = $decode('&#xfffe;&#xfffe;A&#xfffe;');
    $this->assertSame('A', core_text::remove_unicode_non_characters($example));

    // If you only have a non-character, you get empty string.
    $example = $decode('&#xfffe;');
    $this->assertSame('', core_text::remove_unicode_non_characters($example));
}
435+
415436
/**
416437
* Tests the static get_encodings method.
417438
*/

search/classes/document.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,9 @@ public function set($fieldname, $value) {
291291
if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
292292
$this->data[$fieldname] = intval($value);
293293
} else {
294+
// Remove disallowed Unicode characters.
295+
$value = \core_text::remove_unicode_non_characters($value);
296+
294297
// Replace all groups of line breaks and spaces by single spaces.
295298
$this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
296299
if ($this->data[$fieldname] === null) {

search/engine/solr/tests/engine_test.php

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,6 +1134,43 @@ public function test_ordering() {
11341134
$this->assertEquals('C1P', $results[0]->get('title'));
11351135
}
11361136

1137+
/**
 * Indexes records whose content starts with a Unicode non-character (content that can be
 * entered into Moodle) and checks that indexing and searching still work.
 */
public function test_bogus_content() {
    $generator = $this->getDataGenerator();
    $course1 = $generator->create_course(['fullname' => 'Course 1']);
    $course1context = \context_course::instance($course1->id);

    // It is possible to enter into a Moodle database content containing these characters,
    // which are Unicode non-characters / byte order marks. If sent to Solr, these cause
    // failures.
    $bogusentities = [
        '&#xfffe;',
        '&#xffff;',
        // Unicode Standard Version 9.0 - Core Specification, section 23.7, lists 66
        // non-characters in total. Here are some of them - these work OK for me but it
        // may depend on platform.
        '&#xfdd0;',
        '&#xfdef;',
        '&#x1fffe;',
        '&#x10ffff;',
    ];

    // Create one search record per bogus prefix, each followed by the searchable word.
    foreach ($bogusentities as $entity) {
        $boguscontent = html_entity_decode($entity) . 'frog';
        $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    }

    // Do the indexing (this will check it doesn't throw warnings).
    $this->search->index();

    // Confirm that all 6 documents are found in search.
    $querydata = new stdClass();
    $querydata->q = 'frog';
    $results = $this->search->search($querydata);
    $this->assertCount(6, $results);
}
1173+
11371174
/**
11381175
* Adds a record to the mock search area, so that the search engine can find it later.
11391176
*

0 commit comments

Comments
 (0)