From f94efb7091d7a8dbd57590036ae28ad767888ffe Mon Sep 17 00:00:00 2001
From: Bryan Jennings
Date: Wed, 7 Jun 2023 14:13:30 -0700
Subject: [PATCH] fix(search): Handle word searches with char codes below
 0x3000 (#1607)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Words like DVD-ROM were not matching because of the hyphen (a char code
below 0x3000) in the middle of the word. In order to include entries like
+α, logic was removed from rikaicontent.ts that skipped lookups when the
character the mouse hovered over did not have a normal Japanese char code.
(The removed check covered various other ranges as well, none of which
showed additional impact as of this PR.) These checks are thought to have
been added for performance reasons that are no longer relevant; if some
problem does crop up, we can solve it more narrowly and with better
documentation at that time.
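To make the failure mode concrete, here is a small sketch (illustrative only, not part of the patch) using "α線" from the affected list below: its first character sits below 0x3000 while its second sits above, so the old guard aborted the lookup partway through the word.

```typescript
// "α線" is one of the affected entries: α (U+03B1) is below 0x3000
// while 線 (U+7DDA) is above it, so the removed `u <= 0x3000` guard
// in data.ts broke out of the lookup before the full word matched.
for (const ch of "α線") {
  const u = ch.charCodeAt(0);
  console.log(ch, "0x" + u.toString(16), u <= 0x3000 ? "blocked" : "ok");
}
```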
See word entries affected by this change (122 unique words):

```ts
[
  "〇×", "〇×テスト", "+α", "○", "○×", "○×テスト", "○○", "◎", "△", "※印",
  "2×4工法", "2進-5進", "4°", "CD-DA", "CD-R", "CD-ROM", "COVID-19",
  "DVD-R", "DVD-RAM", "DVD-ROM", "HANA-BI", "Hα線", "J-POP", "K-1",
  "K-POP", "K-T境界", "RF-ID", "UTF-8", "Wi-Fi", "ウラン-238", "プラスα",
  "マイクロRNA", "マッチョ♀ウーマン", "Α", "Β", "Γ", "Γ分布", "Δ", "Ε",
  "Ζ", "Η", "Θ", "Θ関数", "Ι", "Κ", "Λ", "Μ", "Ν", "Ξ", "Ο", "Π", "Ρ",
  "Σ", "Τ", "Υ", "Φ", "Χ", "Ψ", "Ω", "α", "α-helix", "α-ヘリックス",
  "α-メチルトリプタミン", "α-リノレン酸", "αらせん", "αメチルトリプタミン",
  "αリノレン酸", "α化", "α化米", "α線", "α波", "α分類学", "α米", "α崩壊",
  "α螺旋", "α粒子", "β", "β-カロチン", "β-カロテン", "βカロチン",
  "βカロテン", "βテスト", "βバージョン", "β線", "β波", "β版", "β崩壊",
  "β粒子", "γ", "γ-オリザノール", "γアミノ酪酸", "γリノレン酸", "γ線",
  "γ分布", "γ崩壊", "δ", "ε", "ζ", "η", "θ", "θ理論", "ι", "κ", "λ", "μ",
  "μ粒子", "ν", "ξ", "ο", "π", "π中間子", "π電子", "ρ", "σ", "σ電子", "τ",
  "τ粒子", "υ", "φ", "χ", "ψ", "ω"
]
```

This list of words was collected by running this code:

```ts
const uniqueWords = [
  ...new Set(
    rcxDict.wordDict
      .trim()
      .split("\n")
      .map((line) => line.split(" ")[0])
  ),
];
const wordsWithCharCodesBelow0x3000 = uniqueWords.filter((word) =>
  [...word].some((char) => char.charCodeAt(0) < 0x3000)
);
console.log(
  "wordsWithCharCodesBelow0x3000",
  JSON.stringify(wordsWithCharCodesBelow0x3000, null, 2)
);
```
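For context on where the removed `u <= 0x3000` guard sat: it ran just before data.ts's full-width-katakana-to-hiragana normalization, so any sub-0x3000 character aborted the loop before normalization could run. A simplified, self-contained sketch of that normalization (the standalone function name here is mine, not rikaikun's):

```typescript
// Shift full-width katakana (U+30A1–U+30F3) down by 0x60 to the
// corresponding hiragana, mirroring the normalization step in data.ts.
function katakanaToHiragana(text: string): string {
  return [...text]
    .map((ch) => {
      const u = ch.charCodeAt(0);
      return u >= 0x30a1 && u <= 0x30f3 ? String.fromCharCode(u - 0x60) : ch;
    })
    .join("");
}

// "プラスα" is in the affected list; the α passes through unchanged.
console.log(katakanaToHiragana("プラスα")); // "ぷらすα"
```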
I wrote snapshot tests to make sure none of the words without character codes below 0x3000 were affected by this change. I didn't include those tests in this pull request because they take a while to run (there are over 200,000 entries in the dictionary) and they would make this pull request bigger. I can add them in a separate pull request if we think that's a good idea, so we can verify diffs of word searches for every word in the dictionary. That would give us more confidence whenever we change the wordSearch function, since it shows the diff for each search result, and it would let us drastically simplify wordSearch in the future with confidence that we aren't breaking any previous behavior.

I also wrote code to check that every word in the dictionary is searchable, and found that only one unsearchable word remains:

```ts
const uniqueWords = [
  ...new Set(
    rcxDict.wordDict
      .trim()
      .split("\n")
      .map((line) => line.split(" ")[0])
  ),
];
const unsearchableWords = uniqueWords.filter(
  (word) =>
    !rcxDict
      .wordSearch(word, false)
      ?.data.find((data) => data.entry.startsWith(word + " "))
);
console.log("unsearchableWords", unsearchableWords); // Logs ['~']
```

So after this pull request is merged, the only unsearchable word entry in the dictionary is "~ [にょろ] /(n) tilde/wave dash/". This word is a single character with char code 65374 (0xff5e in hexadecimal), which is above 0x3000 and explains why this pull request doesn't handle it. Searching the code for 0xff5e turns up some logic that handles this char code and needs a slight modification to account for this entry. That can be handled in a subsequent pull request to keep this pull request short and simple.
Fixes: https://github.com/melink14/rikaikun/issues/1453
---
 extension/data.ts                   |  4 ----
 extension/rikaicontent.ts           | 14 --------------
 extension/test/data_test.ts         |  6 ++++++
 extension/test/rikaicontent_test.ts | 16 ++++++++++++++++
 4 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/extension/data.ts b/extension/data.ts
index 07340153b..459a70783 100644
--- a/extension/data.ts
+++ b/extension/data.ts
@@ -348,10 +348,6 @@ class RcxDict {
         continue;
       }
 
-      if (u <= 0x3000) {
-        break;
-      }
-
       // full-width katakana to hiragana
       if (u >= 0x30a1 && u <= 0x30f3) {
         u -= 0x60;
diff --git a/extension/rikaicontent.ts b/extension/rikaicontent.ts
index cfaf7528c..5d45f02e6 100644
--- a/extension/rikaicontent.ts
+++ b/extension/rikaicontent.ts
@@ -848,20 +848,6 @@ class RcxContent {
       }
     }
 
-    //
-    if (
-      isNaN(u) ||
-      (u !== 0x25cb &&
-        (u < 0x3001 || u > 0x30ff) &&
-        (u < 0x3400 || u > 0x9fff) &&
-        (u < 0xf900 || u > 0xfaff) &&
-        (u < 0xff10 || u > 0xff9d))
-    ) {
-      this.clearHi();
-      this.hidePopup();
-      return -2;
-    }
-
     // selection end data
     const selEndList: { node: CharacterData; offset: number }[] = [];
     const text = this.getTextFromRange(rp, ro, selEndList, 13 /* maxlength*/);
diff --git a/extension/test/data_test.ts b/extension/test/data_test.ts
index 4d512b786..9f497c2d9 100644
--- a/extension/test/data_test.ts
+++ b/extension/test/data_test.ts
@@ -62,5 +62,11 @@ describe('data.ts', function () {
         rcxDict.wordSearch('ぼんです', /* doNames= */ false)?.data
       ).to.not.include.something.like({ entry: /^凡打 .*/ });
     });
+
+    it('should return results for words with punctuation', function () {
+      expect(
+        rcxDict.wordSearch('DVD-ROM', /* doNames= */ false)?.data
+      ).to.include.something.like({ entry: /^DVD-ROM .*/ });
+    });
   });
 });
diff --git a/extension/test/rikaicontent_test.ts b/extension/test/rikaicontent_test.ts
index 6f6631c6b..5e2261373 100644
--- a/extension/test/rikaicontent_test.ts
+++ b/extension/test/rikaicontent_test.ts
@@ -133,6 +133,22 @@ describe('RcxContent', function () {
     });
   });
 
+  it('triggers xsearch message when above non-Japanese text', function () {
+    const clock = sinon.useFakeTimers();
+    const span = insertHtmlIntoDomAndReturnFirstTextNode(
+      '<span>+αtest</span>'
+    ) as HTMLSpanElement;
+
+    triggerMousemoveAtElementStart(span);
+    // Tick the clock forward to account for the popup delay.
+    clock.tick(1);
+
+    expect(chrome.runtime.sendMessage).to.have.been.calledWithMatch({
+      type: 'xsearch',
+      text: '+αtest',
+    });
+  });
+
   describe('inside text area', function () {
     it('triggers xsearch message when above Japanese text', function () {
       const clock = sinon.useFakeTimers();