Skip to content

Commit cb527a5

Browse files
committed
Bug 1973393 - Treat transition between Hangul and non-Hangul chars as a word boundary. r=layout-reviewers,emilio
This is not part of the default word-segmentation rules in UAX#29 that are implemented by the ICU4X segmenter, but is suggested as an extension in a note following the rules, and corresponds with behavior seen in both Chrome and Safari (as well as in some editors/word processors). As such, I think it makes sense to do this. The included WPT fails without the patch, and passes with it. (It should also pass in both Safari and Chrome, afaict.) Differential Revision: https://phabricator.services.mozilla.com/D254847
1 parent c7a65df commit cb527a5

File tree

3 files changed

+97
-16
lines changed

3 files changed

+97
-16
lines changed

layout/generic/nsIFrame.h

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
#include "mozilla/gfx/CompositorHitTestInfo.h"
7979
#include "mozilla/gfx/MatrixFwd.h"
8080
#include "mozilla/intl/BidiEmbeddingLevel.h"
81+
#include "mozilla/intl/UnicodeProperties.h"
8182
#include "nsDisplayItemTypes.h"
8283
#include "nsPresContext.h"
8384
#include "nsTHashSet.h"
@@ -5467,9 +5468,10 @@ class nsIFrame : public nsQueryFrame {
54675468
"aOptions should be changed to const reference");
54685469

54695470
struct PeekWordState {
5471+
using Script = mozilla::intl::Script;
54705472
// true when we're still at the start of the search, i.e., we can't return
54715473
// this point as a valid offset!
5472-
bool mAtStart;
5474+
bool mAtStart = true;
54735475
// true when we've encountered at least one character of the type before the
54745476
// boundary we're looking for:
54755477
// 1. If we're moving forward and eating whitepace, looking for a word
@@ -5478,37 +5480,38 @@ class nsIFrame : public nsQueryFrame {
54785480
// 2. Otherwise, looking for a word beginning (i.e. a boundary between
54795481
// non-whitespace and whitespace), then mSawBeforeType==true means "we
54805482
// already saw some non-whitespace".
5481-
bool mSawBeforeType;
5483+
bool mSawBeforeType = false;
54825484
// true when we've encountered at least one non-newline character
5483-
bool mSawInlineCharacter;
5485+
bool mSawInlineCharacter = false;
54845486
// true when the last character encountered was punctuation
5485-
bool mLastCharWasPunctuation;
5487+
bool mLastCharWasPunctuation = false;
54865488
// true when the last character encountered was whitespace
5487-
bool mLastCharWasWhitespace;
5489+
bool mLastCharWasWhitespace = false;
54885490
// true when we've seen non-punctuation since the last whitespace
5489-
bool mSeenNonPunctuationSinceWhitespace;
5491+
bool mSeenNonPunctuationSinceWhitespace = false;
5492+
// Script code of most recent character (other than INHERITED).
5493+
// (Currently only HANGUL vs any-other-script is significant.)
5494+
Script mLastScript = Script::INVALID;
54905495
// text that's *before* the current frame when aForward is true, *after*
54915496
// the current frame when aForward is false. Only includes the text
54925497
// on the current line.
54935498
nsAutoString mContext;
54945499

5495-
PeekWordState()
5496-
: mAtStart(true),
5497-
mSawBeforeType(false),
5498-
mSawInlineCharacter(false),
5499-
mLastCharWasPunctuation(false),
5500-
mLastCharWasWhitespace(false),
5501-
mSeenNonPunctuationSinceWhitespace(false) {}
5500+
PeekWordState() {}
55025501
void SetSawBeforeType() { mSawBeforeType = true; }
55035502
void SetSawInlineCharacter() { mSawInlineCharacter = true; }
5504-
void Update(bool aAfterPunctuation, bool aAfterWhitespace) {
5503+
void Update(bool aAfterPunctuation, bool aAfterWhitespace,
5504+
Script aScript = Script::INVALID) {
55055505
mLastCharWasPunctuation = aAfterPunctuation;
55065506
mLastCharWasWhitespace = aAfterWhitespace;
55075507
if (aAfterWhitespace) {
55085508
mSeenNonPunctuationSinceWhitespace = false;
55095509
} else if (!aAfterPunctuation) {
55105510
mSeenNonPunctuationSinceWhitespace = true;
55115511
}
5512+
if (aScript != Script::INHERITED) {
5513+
mLastScript = aScript;
5514+
}
55125515
mAtStart = false;
55135516
}
55145517
};

layout/generic/nsTextFrame.cpp

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7997,6 +7997,7 @@ class MOZ_STACK_CLASS ClusterIterator {
79977997
bool IsInlineWhitespace() const;
79987998
bool IsNewline() const;
79997999
bool IsPunctuation() const;
8000+
intl::Script ScriptCode() const;
80008001
bool HaveWordBreakBefore() const { return mHaveWordBreak; }
80018002

80028003
// Get the charIndex that corresponds to the "before" side of the current
@@ -8162,6 +8163,20 @@ bool ClusterIterator::IsPunctuation() const {
81628163
return mozilla::IsPunctuationForWordSelect(ch);
81638164
}
81648165

8166+
intl::Script ClusterIterator::ScriptCode() const {
8167+
NS_ASSERTION(mCharIndex >= 0, "No cluster selected");
8168+
const char16_t ch = mFrag->CharAt(AssertedCast<uint32_t>(mCharIndex));
8169+
return intl::UnicodeProperties::GetScriptCode(ch);
8170+
}
8171+
8172+
static inline bool IsKorean(intl::Script aScript) {
8173+
// We only need to check for HANGUL script code; there is a script code
8174+
// KOREAN but this is not assigned to any codepoints. (If that ever changes,
8175+
// we could check for both codes here.)
8176+
MOZ_ASSERT(aScript != intl::Script::KOREAN, "unexpected script code");
8177+
return aScript == intl::Script::HANGUL;
8178+
}
8179+
81658180
int32_t ClusterIterator::GetAfterInternal() const {
81668181
if (mFrag->IsHighSurrogateFollowedByLowSurrogateAt(
81678182
AssertedCast<uint32_t>(mCharIndex))) {
@@ -8343,17 +8358,22 @@ nsIFrame::FrameSearchResult nsTextFrame::PeekOffsetWord(
83438358
return CONTINUE_EMPTY;
83448359
}
83458360

8361+
// Do we need to check for Korean characters?
8362+
bool is2b = TextFragment()->Is2b();
83468363
do {
83478364
bool isPunctuation = cIter.IsPunctuation();
83488365
bool isInlineWhitespace = cIter.IsInlineWhitespace();
83498366
bool isWhitespace = isInlineWhitespace || cIter.IsNewline();
83508367
bool isWordBreakBefore = cIter.HaveWordBreakBefore();
8368+
// If the text is one-byte, we don't actually care about script code as
8369+
// there cannot be any Korean in the frame.
8370+
intl::Script scriptCode = is2b ? cIter.ScriptCode() : intl::Script::COMMON;
83518371
if (!isWhitespace || isInlineWhitespace) {
83528372
aState->SetSawInlineCharacter();
83538373
}
83548374
if (aWordSelectEatSpace == isWhitespace && !aState->mSawBeforeType) {
83558375
aState->SetSawBeforeType();
8356-
aState->Update(isPunctuation, isWhitespace);
8376+
aState->Update(isPunctuation, isWhitespace, scriptCode);
83578377
continue;
83588378
}
83598379
// See if we can break before the current cluster
@@ -8374,12 +8394,18 @@ nsIFrame::FrameSearchResult nsTextFrame::PeekOffsetWord(
83748394
canBreak = isWordBreakBefore && aState->mSawBeforeType &&
83758395
(aWordSelectEatSpace != isWhitespace);
83768396
}
8397+
// Special-case for Korean: treat a boundary between Hangul & non-Hangul
8398+
// characters as a word boundary (see bug 1973393 and UAX#29).
8399+
if (!canBreak && is2b && aState->mLastScript != intl::Script::INVALID &&
8400+
IsKorean(aState->mLastScript) != IsKorean(scriptCode)) {
8401+
canBreak = true;
8402+
}
83778403
if (canBreak) {
83788404
*aOffset = cIter.GetBeforeOffset() - mContentOffset;
83798405
return FOUND;
83808406
}
83818407
}
8382-
aState->Update(isPunctuation, isWhitespace);
8408+
aState->Update(isPunctuation, isWhitespace, scriptCode);
83838409
} while (cIter.NextCluster());
83848410

83858411
*aOffset = cIter.GetAfterOffset() - mContentOffset;
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<!DOCTYPE html>
2+
<meta charset="utf-8">
3+
<title>Korean/Latin transition is treated as a word boundary</title>
4+
5+
<link rel="help" href="https://unicode.org/reports/tr29/#Word_Boundary_Rules">
6+
<link rel="help" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1973393">
7+
8+
<script src=/resources/testharness.js></script>
9+
<script src=/resources/testharnessreport.js></script>
10+
<script src="/resources/testdriver.js"></script>
11+
<script src="/resources/testdriver-vendor.js"></script>
12+
<script src="/resources/testdriver-actions.js"></script>
13+
<script src="../editing/include/editor-test-utils.js"></script>
14+
15+
<div contenteditable id="target">희진DJ</div>
16+
<textarea id="textareaTarget">DJ희진</textarea>
17+
18+
<script>
19+
const selection = getSelection();
20+
const textNode = document.getElementById("target").childNodes[0];
21+
const textareaNode = document.getElementById("textareaTarget");
22+
23+
test(() => {
24+
selection.collapse(textNode, 0); // Start at beginning of text
25+
selection.modify("move", "forward", "word");
26+
assert_equals(selection.focusNode, textNode);
27+
assert_equals(selection.focusOffset, 2, "Caret should move after the Korean characters");
28+
}, "Korean/Latin transition should be considered a word boundary when moving forward");
29+
30+
test(() => {
31+
selection.collapse(textNode, 4); // Start at end of text
32+
selection.modify("move", "backward", "word");
33+
assert_equals(selection.focusNode, textNode);
34+
assert_equals(selection.focusOffset, 2, "Caret should move before the Latin characters");
35+
}, "Korean/Latin transition should be considered a word boundary when moving backward");
36+
37+
promise_test(async () => {
38+
textareaNode.focus();
39+
textareaNode.setSelectionRange(0, 0); // Start at beginning of text
40+
const utils = new EditorTestUtils(textareaNode);
41+
await utils.sendMoveWordRightKey();
42+
assert_equals(textareaNode.selectionStart, 2, "Caret should move after the Latin characters");
43+
}, "Latin/Korean transition should be considered a word boundary when moving forward");
44+
45+
promise_test(async () => {
46+
textareaNode.focus();
47+
textareaNode.setSelectionRange(4, 4); // Start at end of text
48+
const utils = new EditorTestUtils(textareaNode);
49+
await utils.sendMoveWordLeftKey();
50+
assert_equals(textareaNode.selectionStart, 2, "Caret should move before the Korean characters");
51+
}, "Latin/Korean transition should be considered a word boundary when moving backward");
52+
</script>

0 commit comments

Comments
 (0)