Bug 1973393 - Treat transition between Hangul and non-Hangul chars as a word boundary. r=layout-reviewers,emilio

jfkthame · jfkthame · commit cb527a5f295c · 2025-06-24T21:01:17.000Z
This is not part of the default word-segmentation rules in UAX#29 that are implemented by the ICU4X segmenter, but is suggested as an extension in a note following the rules, and corresponds with behavior seen in both Chrome and Safari (as well as in some editors/word processors). As such, I think it makes sense to do this. The included WPT fails without the patch, and passes with it. (It should also pass in both Safari and Chrome, afaict.) Differential Revision: https://phabricator.services.mozilla.com/D254847
diff --git a/layout/generic/nsIFrame.h b/layout/generic/nsIFrame.h
@@ -78,6 +78,7 @@
 #include "mozilla/gfx/CompositorHitTestInfo.h"
 #include "mozilla/gfx/MatrixFwd.h"
 #include "mozilla/intl/BidiEmbeddingLevel.h"
+#include "mozilla/intl/UnicodeProperties.h"
 #include "nsDisplayItemTypes.h"
 #include "nsPresContext.h"
 #include "nsTHashSet.h"
@@ -5467,9 +5468,10 @@ class nsIFrame : public nsQueryFrame {
                 "aOptions should be changed to const reference");
 
   struct PeekWordState {
+    using Script = mozilla::intl::Script;
     // true when we're still at the start of the search, i.e., we can't return
     // this point as a valid offset!
-    bool mAtStart;
+    bool mAtStart = true;
     // true when we've encountered at least one character of the type before the
     // boundary we're looking for:
     // 1. If we're moving forward and eating whitepace, looking for a word
@@ -5478,37 +5480,38 @@ class nsIFrame : public nsQueryFrame {
     // 2. Otherwise, looking for a word beginning (i.e. a boundary between
     //    non-whitespace and whitespace), then mSawBeforeType==true means "we
     //    already saw some non-whitespace".
-    bool mSawBeforeType;
+    bool mSawBeforeType = false;
     // true when we've encountered at least one non-newline character
-    bool mSawInlineCharacter;
+    bool mSawInlineCharacter = false;
     // true when the last character encountered was punctuation
-    bool mLastCharWasPunctuation;
+    bool mLastCharWasPunctuation = false;
     // true when the last character encountered was whitespace
-    bool mLastCharWasWhitespace;
+    bool mLastCharWasWhitespace = false;
     // true when we've seen non-punctuation since the last whitespace
-    bool mSeenNonPunctuationSinceWhitespace;
+    bool mSeenNonPunctuationSinceWhitespace = false;
+    // Script code of most recent character (other than INHERITED).
+    // (Currently only HANGUL vs any-other-script is significant.)
+    Script mLastScript = Script::INVALID;
     // text that's *before* the current frame when aForward is true, *after*
     // the current frame when aForward is false. Only includes the text
     // on the current line.
     nsAutoString mContext;
 
-    PeekWordState()
-        : mAtStart(true),
-          mSawBeforeType(false),
-          mSawInlineCharacter(false),
-          mLastCharWasPunctuation(false),
-          mLastCharWasWhitespace(false),
-          mSeenNonPunctuationSinceWhitespace(false) {}
+    PeekWordState() {}
     void SetSawBeforeType() { mSawBeforeType = true; }
     void SetSawInlineCharacter() { mSawInlineCharacter = true; }
-    void Update(bool aAfterPunctuation, bool aAfterWhitespace) {
+    void Update(bool aAfterPunctuation, bool aAfterWhitespace,
+                Script aScript = Script::INVALID) {
       mLastCharWasPunctuation = aAfterPunctuation;
       mLastCharWasWhitespace = aAfterWhitespace;
       if (aAfterWhitespace) {
         mSeenNonPunctuationSinceWhitespace = false;
       } else if (!aAfterPunctuation) {
         mSeenNonPunctuationSinceWhitespace = true;
       }
+      if (aScript != Script::INHERITED) {
+        mLastScript = aScript;
+      }
       mAtStart = false;
     }
   };
diff --git a/layout/generic/nsTextFrame.cpp b/layout/generic/nsTextFrame.cpp
@@ -7997,6 +7997,7 @@ class MOZ_STACK_CLASS ClusterIterator {
   bool IsInlineWhitespace() const;
   bool IsNewline() const;
   bool IsPunctuation() const;
+  intl::Script ScriptCode() const;
   bool HaveWordBreakBefore() const { return mHaveWordBreak; }
 
   // Get the charIndex that corresponds to the "before" side of the current
@@ -8162,6 +8163,20 @@ bool ClusterIterator::IsPunctuation() const {
   return mozilla::IsPunctuationForWordSelect(ch);
 }
 
+intl::Script ClusterIterator::ScriptCode() const {
+  NS_ASSERTION(mCharIndex >= 0, "No cluster selected");
+  const auto codeUnitIndex = AssertedCast<uint32_t>(mCharIndex);
+  return intl::UnicodeProperties::GetScriptCode(mFrag->CharAt(codeUnitIndex));
+}
+
+static inline bool IsKorean(intl::Script aScript) {
+  // HANGUL is the only code to test: the KOREAN script code exists but is not
+  // assigned to any codepoints (if that changes, accept both codes here).
+  MOZ_ASSERT(aScript != intl::Script::KOREAN, "unexpected script code");
+  const bool isHangul = (aScript == intl::Script::HANGUL);
+  return isHangul;
+}
+
 int32_t ClusterIterator::GetAfterInternal() const {
   if (mFrag->IsHighSurrogateFollowedByLowSurrogateAt(
           AssertedCast<uint32_t>(mCharIndex))) {
@@ -8343,17 +8358,22 @@ nsIFrame::FrameSearchResult nsTextFrame::PeekOffsetWord(
     return CONTINUE_EMPTY;
   }
 
+  // Do we need to check for Korean characters?
+  bool is2b = TextFragment()->Is2b();
   do {
     bool isPunctuation = cIter.IsPunctuation();
     bool isInlineWhitespace = cIter.IsInlineWhitespace();
     bool isWhitespace = isInlineWhitespace || cIter.IsNewline();
     bool isWordBreakBefore = cIter.HaveWordBreakBefore();
+    // If the text is one-byte, we don't actually care about script code as
+    // there cannot be any Korean in the frame.
+    intl::Script scriptCode = is2b ? cIter.ScriptCode() : intl::Script::COMMON;
     if (!isWhitespace || isInlineWhitespace) {
       aState->SetSawInlineCharacter();
     }
     if (aWordSelectEatSpace == isWhitespace && !aState->mSawBeforeType) {
       aState->SetSawBeforeType();
-      aState->Update(isPunctuation, isWhitespace);
+      aState->Update(isPunctuation, isWhitespace, scriptCode);
       continue;
     }
     // See if we can break before the current cluster
@@ -8374,12 +8394,18 @@ nsIFrame::FrameSearchResult nsTextFrame::PeekOffsetWord(
         canBreak = isWordBreakBefore && aState->mSawBeforeType &&
                    (aWordSelectEatSpace != isWhitespace);
       }
+      // Special-case for Korean: treat a boundary between Hangul & non-Hangul
+      // characters as a word boundary (see bug 1973393 and UAX#29).
+      if (!canBreak && is2b && aState->mLastScript != intl::Script::INVALID &&
+          IsKorean(aState->mLastScript) != IsKorean(scriptCode)) {
+        canBreak = true;
+      }
       if (canBreak) {
         *aOffset = cIter.GetBeforeOffset() - mContentOffset;
         return FOUND;
       }
     }
-    aState->Update(isPunctuation, isWhitespace);
+    aState->Update(isPunctuation, isWhitespace, scriptCode);
   } while (cIter.NextCluster());
 
   *aOffset = cIter.GetAfterOffset() - mContentOffset;
diff --git a/testing/web-platform/tests/selection/move-by-word-korean.html b/testing/web-platform/tests/selection/move-by-word-korean.html
@@ -0,0 +1,52 @@
+<!DOCTYPE html>
+<meta charset="utf-8">
+<title>Korean/Latin transition is treated as a word boundary</title>
+
+<link rel="help" href="https://unicode.org/reports/tr29/#Word_Boundary_Rules">
+<link rel="help" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1973393">
+
+<script src=/resources/testharness.js></script>
+<script src=/resources/testharnessreport.js></script>
+<script src="/resources/testdriver.js"></script>
+<script src="/resources/testdriver-vendor.js"></script>
+<script src="/resources/testdriver-actions.js"></script>
+<script src="../editing/include/editor-test-utils.js"></script>
+
+<div contenteditable id="target">희진DJ</div>
+<textarea id="textareaTarget">DJ희진</textarea>
+
+<script>
+  const sel = window.getSelection();
+  const editableText = document.getElementById("target").firstChild;
+  const textarea = document.getElementById("textareaTarget");
+
+  test(() => {
+    sel.collapse(editableText, 0); // Caret before the first character
+    sel.modify("move", "forward", "word");
+    assert_equals(sel.focusNode, editableText);
+    assert_equals(sel.focusOffset, 2, "Caret should move after the Korean characters");
+  }, "Korean/Latin transition should be considered a word boundary when moving forward");
+
+  test(() => {
+    sel.collapse(editableText, 4); // Caret after the last character
+    sel.modify("move", "backward", "word");
+    assert_equals(sel.focusNode, editableText);
+    assert_equals(sel.focusOffset, 2, "Caret should move before the Latin characters");
+  }, "Korean/Latin transition should be considered a word boundary when moving backward");
+
+  promise_test(async () => {
+    textarea.focus();
+    textarea.setSelectionRange(0, 0); // Caret before the first character
+    const editor = new EditorTestUtils(textarea);
+    await editor.sendMoveWordRightKey();
+    assert_equals(textarea.selectionStart, 2, "Caret should move after the Latin characters");
+  }, "Latin/Korean transition should be considered a word boundary when moving forward");
+
+  promise_test(async () => {
+    textarea.focus();
+    textarea.setSelectionRange(4, 4); // Caret after the last character
+    const editor = new EditorTestUtils(textarea);
+    await editor.sendMoveWordLeftKey();
+    assert_equals(textarea.selectionStart, 2, "Caret should move before the Korean characters");
+  }, "Latin/Korean transition should be considered a word boundary when moving backward");
+</script>