[Fork tracking] Chinese Segmenter enhancements #253

Draft · wants to merge 1 commit into main
1 change: 1 addition & 0 deletions .gitignore

@@ -9,3 +9,4 @@ charabia/target
 /data.ms
 Cargo.lock
 .idea
+.hypothesis
20 changes: 9 additions & 11 deletions charabia/Cargo.toml

@@ -24,23 +24,21 @@ once_cell = "1.17.1"
 serde = "1.0"
 slice-group-by = "0.3.0"
 whatlang = "0.16.2"
-lindera-core = "=0.27.1"
-lindera-dictionary = "=0.27.1"
-lindera-tokenizer = { version = "=0.27.1", default-features = false, optional = true }
-pinyin = { version = "0.9", default-features = false, features = [
-    "with_tone",
-], optional = true }
+lindera-core = "=0.27.2"
+lindera-dictionary = "=0.27.2"
+lindera-tokenizer = { version = "=0.27.2", default-features = false, optional = true }
+character_converter = { version = "2.1.0", optional = true }
 wana_kana = { version = "3.0.0", optional = true }
 unicode-normalization = "0.1.22"
 irg-kvariants = "0.1.0"
-litemap = "0.6.1"
-zerovec = "0.9.3"
+litemap = "0.7.2"
+zerovec = "0.10.1"

 [features]
 default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]

 # allow chinese specialized tokenization
-chinese = ["dep:pinyin", "dep:jieba-rs"]
+chinese = ["dep:character_converter", "dep:jieba-rs"]

 # allow hebrew specialized tokenization
 hebrew = []

@@ -69,8 +67,8 @@ khmer = []
 latin-snakecase = ["dep:finl_unicode"]

 [dev-dependencies]
-criterion = "0.5.1"
-jemallocator = "0.3.0"
+criterion = "0.5.1"
+jemallocator = "0.5.4"
 quickcheck = "1"
 quickcheck_macros = "1"
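The dependency swap is the core of this PR: the `pinyin` crate (which romanized Chinese lemmas, tone marks included) is dropped in favor of `character_converter`, which maps traditional characters to their simplified forms, so lemmas stay in Chinese script. A minimal sketch of the observable difference through charabia's public `Tokenize` API (illustrative only; exact token boundaries depend on the segmenter):

```rust
use charabia::Tokenize;

fn main() {
    // Before this PR, lemmas came out romanized ("rénrén", "shēngérzìyóu", ...).
    // With character_converter, they stay in simplified Chinese script.
    for token in "人人生而自由".tokenize() {
        println!("{:?}", token.lemma());
    }
}
```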
65 changes: 19 additions & 46 deletions charabia/src/normalizer/chinese.rs

@@ -1,4 +1,4 @@
-use pinyin::ToPinyin;
+use character_converter::traditional_to_simplified;

 use super::CharNormalizer;
 use crate::detection::{Language, Script};

@@ -23,14 +23,15 @@ impl CharNormalizer for ChineseNormalizer {
         // Normalize to Pinyin
         // If we don't manage to convert the kvariant, we try to convert the original character.
         // If none of them are converted, we return the kvariant.
-        match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
-            Some(converted) => {
-                let with_tone = converted.with_tone();
+        Some(traditional_to_simplified(kvariant.to_string().as_str()).to_string().into())
+        // match kvariant.to_pinyin().or_else(|| c.to_pinyin()) {
+        //     Some(converted) => {
+        //         let with_tone = converted.with_tone();

-                Some(with_tone.to_string().into())
-            }
-            None => Some(kvariant.into()), // e.g. 杤
-        }
+        //         Some(with_tone.to_string().into())
+        //     }
+        //     None => Some(kvariant.into()), // e.g. 杤
+        // }
     }

     fn should_normalize(&self, token: &Token) -> bool {
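At the character level, the new path first maps kvariants and then converts traditional to simplified. A standalone sketch of the conversion call the new code makes (a hypothetical helper mirroring the call shape above, not charabia's internal API):

```rust
use character_converter::traditional_to_simplified;

// Hypothetical helper: convert a single char the way normalize_char now does.
fn to_simplified(c: char) -> String {
    traditional_to_simplified(c.to_string().as_str()).to_string()
}

fn main() {
    assert_eq!(to_simplified('嚴'), "严"); // traditional -> simplified
    assert_eq!(to_simplified('严'), "严"); // already simplified: unchanged
}
```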
@@ -65,14 +66,6 @@ mod test {
                 language: Some(Language::Cmn),
                 ..Default::default()
             },
-            Token {
-                lemma: Owned("澚䀾亚㮺刄杤".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                ..Default::default()
-            },
         ]
     }

@@ -81,70 +74,50 @@
         vec![
             Token {
                 // lowercased
-                lemma: Owned("zūnyán".to_string()),
+                lemma: Owned("尊严".to_string()),
                 char_end: 2,
                 byte_end: 6,
-                char_map: Some(vec![(3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 ..Default::default()
             },
             Token {
                 // lowercased
-                lemma: Owned("shēngérzìyóu".to_string()),
+                lemma: Owned("生而自由".to_string()),
                 char_end: 4,
                 byte_end: 12,
-                char_map: Some(vec![(3, 6), (3, 3), (3, 3), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 ..Default::default()
             },
-            Token {
-                // It would be "yudǔyàběnrèn" without the kvariant normalization.
-                lemma: Owned("àoqìyàběnrènwàn".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                ..Default::default()
-            },
         ]
     }

     // expected result of the complete Normalizer pipeline.
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
             Token {
-                lemma: Owned("zūnyán".to_string()),
+                lemma: Owned("尊严".to_string()),
                 char_end: 2,
                 byte_end: 6,
-                char_map: Some(vec![(3, 4), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 kind: TokenKind::Word,
                 ..Default::default()
             },
             Token {
-                lemma: Owned("shēngérzìyóu".to_string()),
+                lemma: Owned("生而自由".to_string()),
                 char_end: 4,
                 byte_end: 12,
-                char_map: Some(vec![(3, 6), (3, 3), (3, 3), (3, 4)]),
+                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 3)]),
                 script: Script::Cj,
                 language: Some(Language::Cmn),
                 kind: TokenKind::Word,
                 ..Default::default()
             },
-            Token {
-                lemma: Owned("àoqìyàběnrènwàn".to_string()),
-                char_end: 5,
-                byte_end: 15,
-                char_map: Some(vec![(3, 3), (3, 3), (3, 3), (3, 4), (3, 4), (3, 4)]),
-                script: Script::Cj,
-                language: Some(Language::Cmn),
-                kind: TokenKind::Word,
-                ..Default::default()
-            },
         ]
     }
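Worth noting for reviewers: each `char_map` entry records (bytes of the original character, bytes of its normalized form), which is why pinyin-era entries like `(3, 6)` and `(3, 4)` all collapse to `(3, 3)` now that a 3-byte hanzi normalizes to another 3-byte hanzi. The byte arithmetic as a standalone check (my own sketch, not a test from this PR):

```rust
fn main() {
    assert_eq!("生".len(), 3);     // CJK chars are 3 bytes in UTF-8,
    assert_eq!("严".len(), 3);     // so simplified output gives (3, 3)
    assert_eq!("shēng".len(), 6);  // "ē" is 2 bytes: the old (3, 6) entry
    assert_eq!("yán".len(), 4);    // "á" is 2 bytes: the old (3, 4) entry
}
```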

12 changes: 6 additions & 6 deletions charabia/src/normalizer/control_char.rs

@@ -106,16 +106,16 @@ mod test {
     fn normalized_tokens() -> Vec<Token<'static>> {
         vec![
             Token {
-                lemma: Owned("shēngérzìyóuoo".to_string()),
+                lemma: Owned("生而自由oo".to_string()),
                 char_end: 9,
                 byte_end: 17,
                 script: Script::Cj,
                 char_map: Some(vec![
                     (1, 0),
-                    (3, 6),
+                    (3, 3),
                     (3, 3),
                     (3, 3),
-                    (3, 4),
+                    (3, 3),
                     (1, 0),
                     (1, 1),
                     (1, 1),

@@ -125,16 +125,16 @@
                 ..Default::default()
             },
             Token {
-                lemma: Owned("shēngérzìyóuoo".to_string()),
+                lemma: Owned("生而自由oo".to_string()),
                 char_end: 9,
                 byte_end: 17,
                 script: Script::Cj,
                 char_map: Some(vec![
                     (1, 0),
-                    (3, 6),
+                    (3, 3),
                     (3, 3),
                     (3, 3),
-                    (3, 4),
+                    (3, 3),
                     (1, 0),
                     (1, 1),
                     (1, 1),
93 changes: 64 additions & 29 deletions charabia/src/segmenter/chinese.rs

@@ -1,5 +1,8 @@
 use jieba_rs::Jieba;
 use once_cell::sync::Lazy;
+use std::fs::File;
+use std::io::{self, BufRead};
+use std::path::Path;

 use crate::segmenter::Segmenter;

@@ -17,7 +20,39 @@ impl Segmenter for ChineseSegmenter {
     }
 }

-static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
+// Read newline-separated entries from an optional custom-dictionary file.
+// Missing or unreadable files yield an empty list, so segmentation falls
+// back to Jieba's default dictionary.
+fn read_lines<P>(filename: P) -> Vec<String>
+where
+    P: AsRef<Path>,
+{
+    let path = filename.as_ref();
+    if !path.exists() {
+        return vec![];
+    }
+
+    if let Ok(file) = File::open(path) {
+        let reader = io::BufReader::new(file);
+        let mut lines = Vec::new();
+
+        // Collect lines until EOF or the first read error.
+        for line in reader.lines().map_while(Result::ok) {
+            lines.push(line);
+        }
+
+        return lines;
+    }
+
+    vec![]
+}
+
+static JIEBA: Lazy<Jieba> = Lazy::new(|| {
+    let mut jieba = Jieba::new();
+    // Register every entry from ./words.txt with a fixed frequency (99) so
+    // domain-specific terms can compete with Jieba's default segmentation.
+    for line in read_lines("./words.txt") {
+        jieba.add_word(line.as_str(), Some(99), None);
+    }
+    jieba
+});
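To see what the custom dictionary buys, here is a standalone jieba-rs sketch (the phrase 生而自由 stands in for a words.txt entry; whether a frequency of 99 is enough to win depends on the frequencies of the competing default-dictionary words):

```rust
use jieba_rs::Jieba;

fn main() {
    let mut jieba = Jieba::new();

    // With only the default dictionary, the phrase may be split into smaller words.
    println!("{:?}", jieba.cut("人人生而自由", false));

    // Same registration the JIEBA initializer above performs for each line.
    jieba.add_word("生而自由", Some(99), None);
    println!("{:?}", jieba.cut("人人生而自由", false));
}
```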

#[cfg(test)]
mod test {
@@ -65,37 +100,37 @@ mod test {

     // Segmented and normalized version of the text.
     const TOKENIZED: &[&str] = &[
-        "rénrén",
-        "shēngérzìyóu",
+        "人人",
+        "生而自由",
         ",",
-        "zài",
-        "zūn",
-        "yán",
-        "hé",
-        "quán",
-        "lì",
-        "shàng",
-        "yīlǜpíngděng",
+        "在",
+        "尊",
+        "严",
+        "和",
+        "权",
+        "利",
+        "上",
+        "一律平等",
         "。",
-        "tā",
-        "men",
-        "fù",
-        "yǒu",
-        "lǐxìng",
-        "hé",
-        "liángxīn",
+        "他",
+        "们",
+        "赋",
+        "有",
+        "理性",
+        "和",
+        "良心",
         ",",
-        "bìng",
-        "yīng",
-        "yǐ",
-        "xiōngdì",
-        "guān",
-        "xì",
-        "de",
-        "jīngshén",
-        "hùxiāng",
-        "duì",
-        "dài",
+        "并",
+        "应",
+        "以",
+        "兄弟",
+        "关",
+        "系",
+        "的",
+        "精神",
+        "互相",
+        "对",
+        "待",
         "。",
     ];
Loading