Skip to content

Commit 8d53e4e

Browse files
committed
Bug 1987845 - Text Fragments: Correctly deal with punctuation when identifying the first/last word of the target range. r=smaug
Differential Revision: https://phabricator.services.mozilla.com/D264576
1 parent c08e2d2 commit 8d53e4e

File tree

4 files changed

+236
-31
lines changed

4 files changed

+236
-31
lines changed

dom/base/TextDirectiveCreator.cpp

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -380,20 +380,10 @@ void RangeBasedTextDirectiveCreator::CollectContextTermWordBoundaryDistances() {
380380
mStartWordEndDistances.Clear();
381381
TEXT_FRAGMENT_LOG("Start term cannot be extended.");
382382
} else {
383-
// Find the start position for the second word, which is used as the base
384-
// for the word end distance.
385-
auto [firstWordEndPos, secondWordBeginPos] =
386-
intl::WordBreaker::FindWord(mStartContent, mStartWordEndDistances[0]);
387-
MOZ_DIAGNOSTIC_ASSERT(firstWordEndPos == mStartWordEndDistances[0]);
388-
mStartFirstWordLengthIncludingWhitespace = secondWordBeginPos;
389-
mStartFoldCaseContent = Substring(mStartFoldCaseContent,
390-
mStartFirstWordLengthIncludingWhitespace);
391-
mStartWordEndDistances.RemoveElementAt(0);
392-
for (auto& distance : mStartWordEndDistances) {
393-
MOZ_DIAGNOSTIC_ASSERT(distance >=
394-
mStartFirstWordLengthIncludingWhitespace);
395-
distance = distance - mStartFirstWordLengthIncludingWhitespace;
396-
}
383+
mStartFirstWordLengthIncludingWhitespace =
384+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
385+
TextScanDirection::Right>(mStartFoldCaseContent,
386+
mStartWordEndDistances);
397387
TEXT_FRAGMENT_LOG(
398388
"Word end distances for start term, starting at the beginning of the "
399389
"second word: {}",
@@ -416,22 +406,10 @@ void RangeBasedTextDirectiveCreator::CollectContextTermWordBoundaryDistances() {
416406
mEndWordBeginDistances.Clear();
417407
TEXT_FRAGMENT_LOG("End term cannot be extended.");
418408
} else {
419-
// Find the end position of the second to last word, which is used as the
420-
// base for the word begin distances.
421-
auto [secondLastWordEndPos, lastWordBeginPos] = intl::WordBreaker::FindWord(
422-
mEndContent, mEndContent.Length() - mEndWordBeginDistances[0] - 1);
423-
MOZ_DIAGNOSTIC_ASSERT(lastWordBeginPos ==
424-
mEndContent.Length() - mEndWordBeginDistances[0]);
425409
mEndLastWordLengthIncludingWhitespace =
426-
mEndContent.Length() - secondLastWordEndPos;
427-
428-
mEndFoldCaseContent =
429-
Substring(mEndFoldCaseContent, 0, secondLastWordEndPos);
430-
mEndWordBeginDistances.RemoveElementAt(0);
431-
for (auto& distance : mEndWordBeginDistances) {
432-
MOZ_DIAGNOSTIC_ASSERT(distance >= mEndLastWordLengthIncludingWhitespace);
433-
distance = distance - mEndLastWordLengthIncludingWhitespace;
434-
}
410+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
411+
TextScanDirection::Left>(mEndFoldCaseContent,
412+
mEndWordBeginDistances);
435413
TEXT_FRAGMENT_LOG(
436414
"Word begin distances for end term, starting at the end of the second "
437415
"last word: {}",

dom/base/TextDirectiveUtil.h

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,30 @@ class TextDirectiveUtil final {
225225
static bool WordIsJustWhitespaceOrPunctuation(const nsAString& aString,
226226
uint32_t aWordBegin,
227227
uint32_t aWordEnd);
228+
229+
/**
230+
* @brief Finds the position of the beginning of the second word (in
231+
* `direction`), then removes everything up to that position from
232+
* `aString` and `aWordDistances`.
233+
*
234+
* This function modifies both `aString` and `aWordDistances`.
235+
* It expects `aString` to be non-empty, and to contain at least two words,
236+
* as indicated by `aWordDistances` containing at least two elements.
237+
*
238+
* @tparam direction Either left-to-right or right-to-left.
239+
* @param aString The string to modify. Must not be empty.
240+
* @param aWordDistances The array of word boundary distances. The distances
241+
* are always sorted and contain monotonically
242+
* increasing values. For LTR, the distances are based
243+
* off the beginning of the string. For RTL, the
244+
* distances are based off the end of the string. Must
245+
* contain at least two elements.
246+
* @return The length of the first word including whitespace and
247+
* punctuation up to the beginning of the second word.
248+
*/
249+
template <TextScanDirection direction>
250+
static uint32_t RemoveFirstWordFromStringAndDistanceArray(
251+
nsAString& aString, nsTArray<uint32_t>& aWordDistances);
228252
};
229253

230254
class TimeoutWatchdog final {
@@ -647,6 +671,50 @@ template <TextScanDirection direction>
647671
return commonLength;
648672
}
649673

674+
template <TextScanDirection direction>
675+
/*static*/ uint32_t
676+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray(
677+
nsAString& aString, nsTArray<uint32_t>& aWordDistances) {
678+
MOZ_DIAGNOSTIC_ASSERT(!aString.IsEmpty());
679+
MOZ_DIAGNOSTIC_ASSERT(aWordDistances.Length() > 1);
680+
auto lengthOfFirstWordPlusWhitespaceAndPunctuation = aWordDistances[0];
681+
auto chIsWhitespaceOrPunctuation = [&](uint32_t distance) {
682+
const char16_t ch = aString.CharAt(direction == TextScanDirection::Right
683+
? distance
684+
: aString.Length() - distance - 1);
685+
return nsContentUtils::IsHTMLWhitespace(ch) ||
686+
mozilla::IsPunctuationForWordSelect(ch);
687+
};
688+
while (lengthOfFirstWordPlusWhitespaceAndPunctuation < aString.Length() &&
689+
chIsWhitespaceOrPunctuation(
690+
lengthOfFirstWordPlusWhitespaceAndPunctuation)) {
691+
++lengthOfFirstWordPlusWhitespaceAndPunctuation;
692+
}
693+
if (lengthOfFirstWordPlusWhitespaceAndPunctuation == aString.Length()) {
694+
// In this case the string only contains whitespace or punctuation after the
695+
// first word.
696+
aWordDistances.Clear();
697+
return lengthOfFirstWordPlusWhitespaceAndPunctuation;
698+
}
699+
// Adjust all distances to be relative to the new start position.
700+
// In the case that the loop above jumps over punctuation which is actually
701+
// considered to be a word, the distance underflows (or becomes zero).
702+
// These obsolete distances are then removed.
703+
for (auto& wordDistance : aWordDistances) {
704+
wordDistance -= lengthOfFirstWordPlusWhitespaceAndPunctuation;
705+
}
706+
aWordDistances.RemoveElementsBy([&aString](uint32_t distance) {
707+
return distance == 0 || distance > aString.Length();
708+
});
709+
if constexpr (direction == TextScanDirection::Right) {
710+
aString = Substring(aString, lengthOfFirstWordPlusWhitespaceAndPunctuation);
711+
} else {
712+
aString = Substring(
713+
aString, 0,
714+
aString.Length() - lengthOfFirstWordPlusWhitespaceAndPunctuation);
715+
}
716+
return lengthOfFirstWordPlusWhitespaceAndPunctuation;
717+
}
650718
} // namespace mozilla::dom
651719

652720
#endif

dom/base/test/gtest/TestTextDirective.cpp

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,29 @@ TEST(TestTextDirective, ComputeWordBoundaryDistancesPunctuationOnly)
5050
EXPECT_EQ(wordEndDistances[0], 5u);
5151
}
5252

53+
TEST(TestTextDirective,
54+
ComputeWordBoundaryDistancesWithPunctuationSeparatedByWhitespace)
55+
{
56+
nsString text(u"foo ... bar");
57+
nsTArray<uint32_t> wordEndDistances =
58+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
59+
text);
60+
EXPECT_EQ(wordEndDistances.Length(), 2u);
61+
EXPECT_EQ(wordEndDistances[0], 3u); // "foo"
62+
EXPECT_EQ(wordEndDistances[1], 11u); // "foo ... bar"
63+
}
64+
65+
TEST(TestTextDirective, ComputeWordBoundaryDistancesEndingInPunctuation)
66+
{
67+
nsString text(u"foo ...");
68+
nsTArray<uint32_t> wordEndDistances =
69+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
70+
text);
71+
EXPECT_EQ(wordEndDistances.Length(), 2u);
72+
EXPECT_EQ(wordEndDistances[0], 3u); // "foo"
73+
EXPECT_EQ(wordEndDistances[1], 7u); // "foo ..."
74+
}
75+
5376
TEST(TestTextDirective, ComputeWordBoundaryDistancesWithEmptyString)
5477
{
5578
nsString text(u"");
@@ -59,3 +82,128 @@ TEST(TestTextDirective, ComputeWordBoundaryDistancesWithEmptyString)
5982
EXPECT_EQ(wordEndDistances.Length(), 1u);
6083
EXPECT_EQ(wordEndDistances[0], 0u);
6184
}
85+
86+
TEST(TestTextDirective,
87+
RemoveFirstWordFromStringAndDistanceArrayPunctuationAfterFirstWordLTR)
88+
{
89+
nsString text(u"Hello, world! This is a test.");
90+
nsTArray<uint32_t> wordEndDistances =
91+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
92+
text);
93+
EXPECT_EQ(wordEndDistances.Length(), 7u);
94+
uint32_t firstWordLength =
95+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
96+
TextScanDirection::Right>(text, wordEndDistances);
97+
EXPECT_EQ(firstWordLength, 7u); // "Hello, "
98+
EXPECT_EQ(text, u"world! This is a test.");
99+
EXPECT_EQ(wordEndDistances.Length(), 6u);
100+
EXPECT_EQ(wordEndDistances[0], 5u); // "world"
101+
EXPECT_EQ(wordEndDistances[1], 11u); // "world! This"
102+
EXPECT_EQ(wordEndDistances[2], 14u); // "world! This is"
103+
EXPECT_EQ(wordEndDistances[3], 16u); // "world! This is a"
104+
EXPECT_EQ(wordEndDistances[4], 21u); // "world! This is a test"
105+
EXPECT_EQ(wordEndDistances[5], 22u); // "world! This is a test."
106+
}
107+
108+
TEST(TestTextDirective, RemoveFirstWordFromStringInParenthesisLTR)
109+
{
110+
nsString text(u"(Hello) world");
111+
nsTArray<uint32_t> wordEndDistances =
112+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
113+
text);
114+
EXPECT_EQ(wordEndDistances.Length(), 2u);
115+
uint32_t firstWordLength =
116+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
117+
TextScanDirection::Right>(text, wordEndDistances);
118+
EXPECT_EQ(firstWordLength, 8u); // "(Hello) "
119+
EXPECT_EQ(text, u"world");
120+
EXPECT_EQ(wordEndDistances.Length(), 1u);
121+
EXPECT_EQ(wordEndDistances[0], 5u); // "world"
122+
}
123+
124+
TEST(TestTextDirective, RemoveFirstWordFromStringAndDistanceArrayRTL)
125+
{
126+
nsString text(u"Hello, world! This is a test.");
127+
nsTArray<uint32_t> wordEndDistances =
128+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
129+
text);
130+
EXPECT_EQ(wordEndDistances.Length(), 6u);
131+
uint32_t firstWordLength =
132+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
133+
TextScanDirection::Left>(text, wordEndDistances);
134+
EXPECT_EQ(firstWordLength, 6u); // " test."
135+
EXPECT_EQ(text, u"Hello, world! This is a");
136+
EXPECT_EQ(wordEndDistances.Length(), 5u);
137+
EXPECT_EQ(wordEndDistances[0], 1u); // "a"
138+
EXPECT_EQ(wordEndDistances[1], 4u); // "is a"
139+
EXPECT_EQ(wordEndDistances[2], 9u); // "This is a"
140+
EXPECT_EQ(wordEndDistances[3], 16u); // "world! This is a"
141+
EXPECT_EQ(wordEndDistances[4], 23u); // "Hello, world! This is a"
142+
}
143+
144+
TEST(TestTextDirective, RemoveFirstWordFromStringInParenthesisRTL)
145+
{
146+
nsString text(u"Hello (world)");
147+
nsTArray<uint32_t> wordEndDistances =
148+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
149+
text);
150+
EXPECT_EQ(wordEndDistances.Length(), 2u);
151+
uint32_t firstWordLength =
152+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
153+
TextScanDirection::Left>(text, wordEndDistances);
154+
EXPECT_EQ(firstWordLength, 8u); // " (world)"
155+
EXPECT_EQ(text, u"Hello");
156+
EXPECT_EQ(wordEndDistances.Length(), 1u);
157+
EXPECT_EQ(wordEndDistances[0], 5u); // "Hello"
158+
}
159+
160+
TEST(TestTextDirective,
161+
RemoveFirstWordFromStringAndDistanceArrayMultiplePunctuationLTR)
162+
{
163+
nsString text(u"...foo!!! bar?");
164+
nsTArray<uint32_t> wordEndDistances =
165+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
166+
text);
167+
EXPECT_EQ(wordEndDistances.Length(), 3u);
168+
uint32_t firstWordLength =
169+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
170+
TextScanDirection::Right>(text, wordEndDistances);
171+
EXPECT_EQ(firstWordLength, 10u); // "...foo!!! "
172+
EXPECT_EQ(text, u"bar?");
173+
EXPECT_EQ(wordEndDistances.Length(), 2u);
174+
EXPECT_EQ(wordEndDistances[0], 3u); // "bar"
175+
EXPECT_EQ(wordEndDistances[1], 4u); // "bar?"
176+
}
177+
178+
TEST(TestTextDirective,
179+
RemoveFirstWordFromStringAndDistanceArrayMultiplePunctuationRTL)
180+
{
181+
nsString text(u"foo!!! ...bar");
182+
nsTArray<uint32_t> wordEndDistances =
183+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Left>(
184+
text);
185+
EXPECT_EQ(wordEndDistances.Length(), 2u);
186+
uint32_t firstWordLength =
187+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
188+
TextScanDirection::Left>(text, wordEndDistances);
189+
EXPECT_EQ(firstWordLength, 10u); // "!!! ...bar"
190+
EXPECT_EQ(text, u"foo");
191+
EXPECT_EQ(wordEndDistances.Length(), 1u);
192+
EXPECT_EQ(wordEndDistances[0], 3u); // "foo"
193+
}
194+
195+
TEST(TestTextDirective,
196+
RemoveFirstWordFromStringAndDistanceArrayEndsInPunctuationLTR)
197+
{
198+
nsString text(u"foo ...");
199+
nsTArray<uint32_t> wordEndDistances =
200+
TextDirectiveUtil::ComputeWordBoundaryDistances<TextScanDirection::Right>(
201+
text);
202+
EXPECT_EQ(wordEndDistances.Length(), 2u);
203+
uint32_t firstWordLength =
204+
TextDirectiveUtil::RemoveFirstWordFromStringAndDistanceArray<
205+
TextScanDirection::Right>(text, wordEndDistances);
206+
EXPECT_EQ(firstWordLength, 7u); // "foo ..."
207+
EXPECT_EQ(text, u"foo ...");
208+
EXPECT_EQ(wordEndDistances.Length(), 0u);
209+
}

dom/base/test/test_text-fragments-create-text-directive.html

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,12 @@
7272
<td>foo</td>
7373
</tr>
7474
</table>
75-
<!-- (crash)Test case for Bug 1987845; example paragraph found on https://en.wikipedia.org/wiki/Nuclear_weapon-->
75+
<!-- (crash)Test cases for Bug 1987845; example paragraph found on https://en.wikipedia.org/wiki/Nuclear_weapon-->
7676
<p>Nuclear<!-- This is needed so that there is an additional match to consider --></p>
7777
<p id="rangeBasedWihSpaceInTheMiddle">Nuclear weapons have only twice been used in warfare, both times by the United States against Japan at the end of World War II. On August 6, 1945, the United States Army Air Forces (USAAF) detonated a uranium gun-type fission bomb nicknamed "Little Boy" over the Japanese city of Hiroshima; three days later, on August 9, the USAAF[4] detonated a plutonium implosion-type fission bomb nicknamed "Fat Man" over the Japanese city of Nagasaki. These bombings caused injuries that resulted in the deaths of approximately 200,000 civilians and military personnel.[5] The ethics of these bombings and their role in Japan's surrender are to this day, still subjects of debate</p>
78-
78+
<!-- note that "text" must already exist in the page to trigger this. -->
79+
<p id="rangeBasedWithPunctuationAfterFirstWordStart">Text: the colon</p>
80+
<p id="rangeBasedWithPunctuationAfterFirstWordEnd">is important here</p>
7981
<script>
8082
document.addEventListener("DOMContentLoaded", () => {
8183
const empty = document.createTextNode("");
@@ -452,6 +454,15 @@
452454
endOffset: rangeBasedWihSpaceInTheMiddle.firstChild.length,
453455
content: "Nuclear weapons have only twice been used in warfare, both times by the United States against Japan at the end of World War II. On August 6, 1945, the United States Army Air Forces (USAAF) detonated a uranium gun-type fission bomb nicknamed \"Little Boy\" over the Japanese city of Hiroshima; three days later, on August 9, the USAAF[4] detonated a plutonium implosion-type fission bomb nicknamed \"Fat Man\" over the Japanese city of Nagasaki. These bombings caused injuries that resulted in the deaths of approximately 200,000 civilians and military personnel.[5] The ethics of these bombings and their role in Japan's surrender are to this day, still subjects of debate",
454456
textDirective: "text=Nuclear%20weapons,debate",
457+
},
458+
{
459+
name: "Test for Bug 1987845: Punctuation after first word of range",
460+
startContainer: rangeBasedWithPunctuationAfterFirstWordStart.firstChild,
461+
startOffset: 0,
462+
endContainer: rangeBasedWithPunctuationAfterFirstWordEnd.firstChild,
463+
endOffset: rangeBasedWithPunctuationAfterFirstWordEnd.firstChild.length,
464+
content: "Text: the colon\n is important here",
465+
textDirective: "text=Text%3A%20the,here",
455466
}
456467
]) {
457468
const range = document.createRange();

0 commit comments

Comments
 (0)