Skip to content

Commit

Permalink
MDL-46256 count_words: fix handling of paragraphs
Browse files Browse the repository at this point in the history
  • Loading branch information
timhunt committed Jan 21, 2021
1 parent 82a050d commit 6a62cbe
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 2 deletions.
19 changes: 18 additions & 1 deletion lib/moodlelib.php
Expand Up @@ -8352,10 +8352,27 @@ function moodle_setlocale($locale='') {
* Words are defined as things between whitespace.
*
* @category string
* @param string $string The text to be searched for words.
* @param string $string The text to be searched for words. May be HTML.
* @return int The count of words in the specified string
*/
function count_words($string) {
// Before stripping tags, add a space after the close tag of anything that is not obviously inline.
// Also, br is a special case because it definitely delimits a word, but has no close tag.
$string = preg_replace('~
( # Capture the tag we match.
</ # Start of close tag.
(?! # Do not match any of these specific close tag names.
a> | b> | del> | em> | i> |
ins> | s> | small> |
strong> | sub> | sup> | u>
)
\w+ # But, apart from those exceptions, match any tag name.
> # End of close tag.
|
<br> | <br\s*/> # Special cases that are not close tags.
)
~x', '$1 ', $string); // Add a space after the close tag.
// Now remove HTML tags.
$string = strip_tags($string);
// Decode HTML entities.
$string = html_entity_decode($string);
Expand Down
16 changes: 15 additions & 1 deletion lib/tests/moodlelib_test.php
Expand Up @@ -3792,7 +3792,7 @@ public function test_username_load_fields_from_object() {
}

/**
* Test function count_words().
* Test function {@see count_words()}.
*
* @dataProvider count_words_testcases
* @param int $expectedcount number of words in $string.
Expand All @@ -3809,16 +3809,30 @@ public function test_count_words(int $expectedcount, string $string): void {
*/
public function count_words_testcases(): array {
// Data provider for test_count_words(): each row is [expected word count, input string].
return [
// Empty string and plain space-separated text.
[0, ''],
[4, 'one two three four'],
// Punctuation inside plain text; the expected counts record which characters split a word and which do not.
[3, "one two three'four"],
[3, 'one+two three’four'],
[2, 'one"two three-four'],
[4, 'one@two three_four'],
[4, 'one\two three/four'],
// Line-break tags separate words, in every spelling of the tag.
[4, '<p>one two<br></br>three four</p>'],
[4, '<p>one two<br>three four</p>'],
[4, '<p>one two<br />three four</p>'], // XHTML style.
// Surrounding whitespace, runs of dots and an &nbsp; entity.
[4, ' one ... two &nbsp; three...four '],
// Digits, decimal/thousands separators and currency symbols.
[4, 'one.2 3,four'],
[4, '1³ £2 €3.45 $6,789'],
// Unicode dash and ellipsis characters, together with non-Latin text.
[4, 'one—two ブルース カンベッル'],
[4, 'one…two ブルース … カンベッル'],
// Block-level markup: the end of a paragraph or list item ends a word even with no whitespace in the source.
[4, '<p>one two</p><p>three four</p>'],
[4, '<p>one two</p><p><br/></p><p>three four</p>'],
[4, '<p>one</p><ul><li>two</li><li>three</li></ul><p>four.</p>'],
// Inline formatting tags in the middle of a word must not split it.
[1, '<p>em<b>phas</b>is.</p>'],
[1, '<p>em<i>phas</i>is.</p>'],
[1, '<p>em<strong>phas</strong>is.</p>'],
[1, '<p>em<em>phas</em>is.</p>'],
// A newline counts as whitespace between words.
[2, "one\ntwo"],
// Subscript and superscript tags are inline too, so the formula stays a single word.
[1, "SO<sub>4</sub><sup>2-</sup>"],
];
}

Expand Down

0 comments on commit 6a62cbe

Please sign in to comment.