Merge #540
540: Integrate charabia r=Kerollmops a=ManyTheFish

related to meilisearch/meilisearch#2375
related to meilisearch/meilisearch#2144
related to meilisearch/meilisearch#2417

Co-authored-by: ManyTheFish <many@meilisearch.com>
bors[bot] and ManyTheFish committed Jun 2, 2022
2 parents ac6df0d + 4dd3675 commit 06f6e1e
Showing 9 changed files with 140 additions and 217 deletions.
22 changes: 10 additions & 12 deletions http-ui/src/main.rs
@@ -19,7 +19,7 @@ use flate2::read::GzDecoder;
use futures::{stream, FutureExt, StreamExt};
use heed::EnvOpenOptions;
use milli::documents::DocumentBatchReader;
use milli::tokenizer::{Analyzer, AnalyzerConfig};
use milli::tokenizer::{Tokenizer, TokenizerBuilder};
use milli::update::UpdateIndexingStep::*;
use milli::update::{
ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
@@ -139,17 +139,16 @@ pub struct IndexerOpt {
pub max_positions_per_attributes: Option<u32>,
}

struct Highlighter<'a, A> {
analyzer: Analyzer<'a, A>,
struct Highlighter<'s, A> {
tokenizer: Tokenizer<'s, A>,
}

impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
fn new(stop_words: &'a fst::Set<A>) -> Self {
let mut config = AnalyzerConfig::default();
config.stop_words(stop_words);
let analyzer = Analyzer::new(config);
impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
fn new(stop_words: &'s fst::Set<A>) -> Self {
let mut builder = TokenizerBuilder::new();
builder.stop_words(stop_words);

Self { analyzer }
Self { tokenizer: builder.build() }
}

fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value {
@@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
Value::Bool(boolean) => Value::Bool(boolean),
Value::Number(number) => Value::Number(number),
Value::String(old_string) => {
let analyzed = self.analyzer.analyze(&old_string);
let analyzed: Vec<_> = analyzed.tokens().collect();
let mut matcher = matcher_builder.build(&analyzed[..], &old_string);
let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect();
let mut matcher = matcher_builder.build(&tokens[..], &old_string);

let format_options = FormatOptions { highlight: true, crop: Some(10) };

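For context, a minimal sketch (not part of the diff) of the new wiring in this file: charabia's TokenizerBuilder replaces AnalyzerConfig/Analyzer, and tokenize yields tokens directly instead of the former analyze(..).tokens() two-step call. The stop-word set and the demo function are illustrative only.

use fst::Set;
use milli::tokenizer::TokenizerBuilder;

fn demo() {
    // Stop words arrive as an fst set, exactly as Highlighter::new receives them.
    let stop_words = Set::from_iter(["a", "the"]).expect("entries must be in lexicographic order");

    // Build the tokenizer the same way the new Highlighter::new does.
    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    // `tokenize` returns an iterator of tokens directly.
    for token in tokenizer.tokenize("The quick brown fox") {
        println!("{:?}", token.lemma());
    }
}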
6 changes: 3 additions & 3 deletions milli/Cargo.toml
@@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] }
bincode = "1.3.3"
bstr = "0.2.17"
byteorder = "1.4.3"
charabia = "0.5.0"
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.2"
either = "1.6.1"
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
memmap2 = "0.5.3"
obkv = "0.2.0"
once_cell = "1.10.0"
2 changes: 1 addition & 1 deletion milli/src/lib.rs
@@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
use serde_json::{Map, Value};
pub use {heed, meilisearch_tokenizer as tokenizer};
pub use {charabia as tokenizer, heed};

pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};
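The re-export keeps the public milli::tokenizer path stable, so downstream code only changes which crate backs it. A hedged sketch of a caller's import after the swap (the exact item list is illustrative):

// Before this commit the path resolved to meilisearch-tokenizer; now it resolves to charabia.
use milli::tokenizer::{Token, TokenKind, TokenizerBuilder};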
61 changes: 27 additions & 34 deletions milli/src/search/matches/matching_words.rs
@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};

use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;

use crate::search::build_dfa;

Expand Down Expand Up @@ -99,13 +99,13 @@ impl MatchingWord {

/// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.text()) {
match self.dfa.eval(token.lemma()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
let len = bytes_to_highlight(token.text(), &self.word);
Some(token.num_chars_from_bytes(len))
let len = bytes_to_highlight(token.lemma(), &self.word);
Some(token.original_lengths(len).0)
} else {
Some(token.num_chars_from_bytes(token.text().len()))
Some(token.original_lengths(token.lemma().len()).0)
}
}
_otherwise => None,
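A hedged sketch of how the two new Token accessors are used above: lemma() exposes the normalized text the DFA is evaluated against, and original_lengths(n) maps a byte length within that lemma back to lengths in the original, pre-normalization text. The helper names below are hypothetical.

use charabia::Token;

// Characters of the original text covered by a match of `lemma_byte_len`
// bytes of the normalized lemma (the prefix-match case in match_token above).
fn matched_char_len(token: &Token, lemma_byte_len: usize) -> usize {
    // The first element of `original_lengths` is used as a char count above.
    token.original_lengths(lemma_byte_len).0
}

// Full-token match: cover the whole lemma.
fn full_match_char_len(token: &Token) -> usize {
    matched_char_len(token, token.lemma().len())
}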
@@ -262,7 +262,7 @@ mod tests {
use std::borrow::Cow;
use std::str::from_utf8;

use meilisearch_tokenizer::TokenKind;
use charabia::TokenKind;

use super::*;
use crate::MatchingWords;
@@ -344,11 +344,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("word"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("word"),
char_end: "word".chars().count(),
byte_end: "word".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
@@ -357,11 +356,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("nyc"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
char_map: None,
..Default::default()
})
.next(),
None
@@ -370,11 +368,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("world"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -383,11 +380,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("splitted"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("splitted"),
char_end: "splitted".chars().count(),
byte_end: "splitted".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
@@ -396,11 +392,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("thisnew"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
char_map: None,
..Default::default()
})
.next(),
None
@@ -409,11 +404,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("borld"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("borld"),
char_end: "borld".chars().count(),
byte_end: "borld".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -422,11 +416,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("wordsplit"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("wordsplit"),
char_end: "wordsplit".chars().count(),
byte_end: "wordsplit".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
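The updated tests build charabia tokens by hand; since the `..Default::default()` spread implies the new Token implements Default, only the relevant fields are spelled out. A minimal sketch of that pattern (the helper and its values are illustrative):

use std::borrow::Cow;
use charabia::{Token, TokenKind};

fn word_token(text: &str) -> Token<'_> {
    Token {
        kind: TokenKind::Word,
        lemma: Cow::Borrowed(text),
        char_end: text.chars().count(),
        byte_end: text.len(),
        // Remaining fields keep their defaults.
        ..Default::default()
    }
}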
