From f25ad012c54f8659301f75134be4bee02a715bc1 Mon Sep 17 00:00:00 2001 From: Petr Skoda Date: Tue, 20 Jun 2023 09:30:46 +0200 Subject: [PATCH] MDL-78525 core: fix word and character counting --- lib/moodlelib.php | 16 +++++++++-- lib/tests/moodlelib_test.php | 35 ++++++++++++++++++++--- mod/forum/classes/local/entities/post.php | 4 +-- mod/forum/post.php | 7 ++--- 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/lib/moodlelib.php b/lib/moodlelib.php index d979952162968..d00252caba914 100644 --- a/lib/moodlelib.php +++ b/lib/moodlelib.php @@ -8381,9 +8381,10 @@ function moodle_setlocale($locale='') { * * @category string * @param string $string The text to be searched for words. May be HTML. + * @param int|null $format * @return int The count of words in the specified string */ -function count_words($string) { +function count_words($string, $format = null) { // Before stripping tags, add a space after the close tag of anything that is not obviously inline. // Also, br is a special case because it definitely delimits a word, but has no close tag. $string = preg_replace('~ @@ -8400,6 +8401,11 @@ function count_words($string) {
| # Special cases that are not close tags. ) ~x', '$1 ', $string); // Add a space after the close tag. + if ($format !== null && $format != FORMAT_PLAIN) { + // Match the usual text cleaning before display. + // Ideally we should apply multilang filter only here, other filters might add extra text. + $string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]); + } // Now remove HTML tags. $string = strip_tags($string); // Decode HTML entities. @@ -8421,9 +8427,15 @@ function count_words($string) { * * @category string * @param string $string The text to be searched for letters. May be HTML. + * @param int|null $format * @return int The count of letters in the specified text. */ -function count_letters($string) { +function count_letters($string, $format = null) { + if ($format !== null && $format != FORMAT_PLAIN) { + // Match the usual text cleaning before display. + // Ideally we should apply multilang filter only here, other filters might add extra text. + $string = format_text($string, $format, ['filter' => false, 'noclean' => false, 'para' => false]); + } $string = strip_tags($string); // Tags are out now. $string = html_entity_decode($string, ENT_COMPAT); $string = preg_replace('/[[:space:]]*/', '', $string); // Whitespace are out now. diff --git a/lib/tests/moodlelib_test.php b/lib/tests/moodlelib_test.php index 591b4e35ab9d1..bc8ac172a8e39 100644 --- a/lib/tests/moodlelib_test.php +++ b/lib/tests/moodlelib_test.php @@ -3958,9 +3958,11 @@ public function test_username_load_fields_from_object() { * @dataProvider count_words_testcases * @param int $expectedcount number of words in $string. * @param string $string the test string to count the words of. + * @param int|null $format */ - public function test_count_words(int $expectedcount, string $string): void { - $this->assertEquals($expectedcount, count_words($string)); + public function test_count_words(int $expectedcount, string $string, $format = null): void { + $this->assertEquals($expectedcount, count_words($string, $format), + "'$string' with format '$format' does not match count $expectedcount"); } /** @@ -3969,6 +3971,13 @@ public function test_count_words(int $expectedcount, string $string): void { * @return array of test cases. */ public function count_words_testcases(): array { + // Copy-pasting example from MDL-64240. + $copypasted = <<Snoot is booped

+ + Boop the Snoot. +EOT; + // The counts here should match MS Word and Libre Office. return [ [0, ''], @@ -4005,6 +4014,16 @@ public function count_words_testcases(): array { [1, "SO42-"], [6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'], [1, 'ab'], + [1, 'ab', FORMAT_PLAIN], + [1, 'ab', FORMAT_HTML], + [1, 'ab', FORMAT_MOODLE], + [1, 'ab', FORMAT_MARKDOWN], + [1, 'aa pokus'], + [2, 'aa pokus', FORMAT_HTML], + [6, $copypasted], + [6, $copypasted, FORMAT_PLAIN], + [3, $copypasted, FORMAT_HTML], + [3, $copypasted, FORMAT_MOODLE], ]; } @@ -4014,9 +4033,11 @@ public function count_words_testcases(): array { * @dataProvider count_letters_testcases * @param int $expectedcount number of characters in $string. * @param string $string the test string to count the letters of. + * @param int|null $format */ - public function test_count_letters(int $expectedcount, string $string): void { - $this->assertEquals($expectedcount, count_letters($string)); + public function test_count_letters(int $expectedcount, string $string, $format = null): void { + $this->assertEquals($expectedcount, count_letters($string, $format), + "'$string' with format '$format' does not match count $expectedcount"); } /** @@ -4030,6 +4051,12 @@ public function count_letters_testcases(): array { [1, 'x'], [1, '&'], [4, '

frog

'], + [4, '

frog

', FORMAT_PLAIN], + [4, '

frog

', FORMAT_MOODLE], + [4, '

frog

', FORMAT_HTML], + [4, '

frog

', FORMAT_MARKDOWN], + [2, 'aa pokus'], + [7, 'aa pokus', FORMAT_HTML], ]; } diff --git a/mod/forum/classes/local/entities/post.php b/mod/forum/classes/local/entities/post.php index 90cd0fc6ba888..119e1d02cd220 100644 --- a/mod/forum/classes/local/entities/post.php +++ b/mod/forum/classes/local/entities/post.php @@ -350,8 +350,8 @@ public function get_charcount() : ?int { */ public static function add_message_counts(\stdClass $record) : void { if (!empty($record->message)) { - $record->wordcount = count_words($record->message); - $record->charcount = count_letters($record->message); + $record->wordcount = count_words($record->message, $record->messageformat); + $record->charcount = count_letters($record->message, $record->messageformat); } } } diff --git a/mod/forum/post.php b/mod/forum/post.php index a3f2925cc0a70..9ea0f2aef3144 100644 --- a/mod/forum/post.php +++ b/mod/forum/post.php @@ -799,10 +799,9 @@ // WARNING: the $fromform->message array has been overwritten, do not use it anymore! $fromform->messagetrust = trusttext_trusted($modcontext); - // Clean message text, unless markdown which should be saved as it is, otherwise editing messes things up. - if ($fromform->messageformat != FORMAT_MARKDOWN) { - $fromform = trusttext_pre_edit($fromform, 'message', $modcontext); - } + // Do not clean text here, text cleaning can be done only after conversion to HTML. + // Word counting now uses text formatting, there is no need to abuse trusttext_pre_edit() here. + if ($fromform->edit) { // Updating a post. unset($fromform->groupid);