Skip to content

Commit

Permalink
Merge branch 'MDL-70796_39' of https://github.com/timhunt/moodle into…
Browse files Browse the repository at this point in the history
… MOODLE_39_STABLE
  • Loading branch information
sarjona committed Feb 10, 2021
2 parents f418bf8 + 842e7aa commit 8e6d2ba
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 17 deletions.
16 changes: 8 additions & 8 deletions lib/moodlelib.php
Original file line number Diff line number Diff line change
Expand Up @@ -8358,14 +8358,14 @@ function count_words($string) {
$string = strip_tags($string);
// Decode HTML entities.
$string = html_entity_decode($string);
// Replace underscores (which are classed as word characters) with spaces.
$string = preg_replace('/_/u', ' ', $string);
// Remove any characters that shouldn't be treated as word boundaries.
$string = preg_replace('/[\'"’-]/u', '', $string);
// Remove dots and commas from within numbers only.
$string = preg_replace('/([0-9])[.,]([0-9])/u', '$1$2', $string);

return count(preg_split('/\w\b/u', $string)) - 1;

// Now, the word count is the number of blocks of characters separated
// by any sort of space. That seems to be the definition used by all other systems.
// To be precise about what is considered to separate words:
// * Anything that Unicode considers a 'Separator'
// * Anything that Unicode considers a 'Control character'
// * An em- or en- dash.
return count(preg_split('~[\p{Z}\p{Cc}—–]+~u', $string, -1, PREG_SPLIT_NO_EMPTY));
}

/**
Expand Down
28 changes: 19 additions & 9 deletions lib/tests/moodlelib_test.php
Original file line number Diff line number Diff line change
Expand Up @@ -3761,22 +3761,27 @@ public function test_count_words(int $expectedcount, string $string): void {
* @return array of test cases.
*/
public function count_words_testcases(): array {
// The counts here should match MS Word and LibreOffice.
return [
[0, ''],
[4, 'one two three four'],
[3, "one two three'four"],
[3, 'one+two three’four'],
[2, 'one"two three-four'],
[4, 'one@two three_four'],
[4, 'one\two three/four'],
[1, "a'b"],
[1, '1+1=2'],
[1, ' one-sided '],
[2, 'one two'],
[1, 'email@example.com'],
[2, 'first\part second/part'],
[4, '<p>one two<br></br>three four</p>'],
[4, '<p>one two<br>three four</p>'],
[4, '<p>one two<br />three four</p>'], // XHTML style.
[4, ' one ... two &nbsp; three...four '],
[4, 'one.2 3,four'],
[3, ' one ... three '],
[1, 'just...one'],
[3, ' one & three '],
[1, 'just&one'],
[2, 'em—dash'],
[2, 'en–dash'],
[4, '1³ £2 €3.45 $6,789'],
[4, 'one—two ブルース カンベッル'],
[4, 'one…two ブルース … カンベッル'],
[2, 'ブルース カンベッル'], // MS Word counts this as 11, but we don't handle that yet.
[4, '<p>one two</p><p>three four</p>'],
[4, '<p>one two</p><p><br/></p><p>three four</p>'],
[4, '<p>one</p><ul><li>two</li><li>three</li></ul><p>four.</p>'],
Expand All @@ -3785,7 +3790,12 @@ public function count_words_testcases(): array {
[1, '<p>em<strong>phas</strong>is.</p>'],
[1, '<p>em<em>phas</em>is.</p>'],
[2, "one\ntwo"],
[2, "one\rtwo"],
[2, "one\ttwo"],
[2, "one\vtwo"],
[2, "one\ftwo"],
[1, "SO<sub>4</sub><sup>2-</sup>"],
[6, '4+4=8 i.e. O(1) a,b,c,d I’m black&blue_really'],
];
}

Expand Down

0 comments on commit 8e6d2ba

Please sign in to comment.