Merge #540
540: Integrate charabia r=Kerollmops a=ManyTheFish

related to meilisearch/meilisearch#2375
related to meilisearch/meilisearch#2144
related to meilisearch/meilisearch#2417

Co-authored-by: ManyTheFish <many@meilisearch.com>
bors[bot] and ManyTheFish committed Jun 2, 2022
2 parents ac6df0d + 4dd3675 commit 06f6e1e
Showing 9 changed files with 140 additions and 217 deletions.
22 changes: 10 additions & 12 deletions http-ui/src/main.rs
@@ -19,7 +19,7 @@ use flate2::read::GzDecoder;
use futures::{stream, FutureExt, StreamExt};
use heed::EnvOpenOptions;
use milli::documents::DocumentBatchReader;
use milli::tokenizer::{Analyzer, AnalyzerConfig};
use milli::tokenizer::{Tokenizer, TokenizerBuilder};
use milli::update::UpdateIndexingStep::*;
use milli::update::{
ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
@@ -139,17 +139,16 @@ pub struct IndexerOpt {
pub max_positions_per_attributes: Option<u32>,
}

struct Highlighter<'a, A> {
analyzer: Analyzer<'a, A>,
struct Highlighter<'s, A> {
tokenizer: Tokenizer<'s, A>,
}

impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
fn new(stop_words: &'a fst::Set<A>) -> Self {
let mut config = AnalyzerConfig::default();
config.stop_words(stop_words);
let analyzer = Analyzer::new(config);
impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
fn new(stop_words: &'s fst::Set<A>) -> Self {
let mut builder = TokenizerBuilder::new();
builder.stop_words(stop_words);

Self { analyzer }
Self { tokenizer: builder.build() }
}

fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value {
@@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
Value::Bool(boolean) => Value::Bool(boolean),
Value::Number(number) => Value::Number(number),
Value::String(old_string) => {
let analyzed = self.analyzer.analyze(&old_string);
let analyzed: Vec<_> = analyzed.tokens().collect();
let mut matcher = matcher_builder.build(&analyzed[..], &old_string);
let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect();
let mut matcher = matcher_builder.build(&tokens[..], &old_string);

let format_options = FormatOptions { highlight: true, crop: Some(10) };

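For context, a minimal sketch (not part of the diff) of the new wiring in this file: charabia's TokenizerBuilder replaces AnalyzerConfig/Analyzer, and tokenize yields tokens directly instead of the former analyze(..).tokens() two-step call. The stop-word set and the demo function are illustrative only.

use fst::Set;
use milli::tokenizer::TokenizerBuilder;

fn demo() {
    // Stop words arrive as an fst set, exactly as Highlighter::new receives them.
    let stop_words = Set::from_iter(["a", "the"]).expect("entries must be in lexicographic order");

    // Build the tokenizer the same way the new Highlighter::new does.
    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    // `tokenize` returns an iterator of tokens directly.
    for token in tokenizer.tokenize("The quick brown fox") {
        println!("{:?}", token.lemma());
    }
}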
6 changes: 3 additions & 3 deletions milli/Cargo.toml
@@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] }
bincode = "1.3.3"
bstr = "0.2.17"
byteorder = "1.4.3"
charabia = "0.5.0"
concat-arrays = "0.1.2"
crossbeam-channel = "0.5.2"
either = "1.6.1"
flatten-serde-json = { path = "../flatten-serde-json" }
fst = "0.4.7"
fxhash = "0.2.1"
flatten-serde-json = { path = "../flatten-serde-json" }
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
geoutils = "0.4.1"
grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
memmap2 = "0.5.3"
obkv = "0.2.0"
once_cell = "1.10.0"
2 changes: 1 addition & 1 deletion milli/src/lib.rs
@@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition};
use fxhash::{FxHasher32, FxHasher64};
pub use grenad::CompressionType;
use serde_json::{Map, Value};
pub use {heed, meilisearch_tokenizer as tokenizer};
pub use {charabia as tokenizer, heed};

pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
pub use self::criterion::{default_criteria, Criterion, CriterionError};
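The re-export keeps the public milli::tokenizer path stable, so downstream code only changes which crate backs it. A hedged sketch of a caller's import after the swap (the exact item list is illustrative):

// Before this commit the path resolved to meilisearch-tokenizer; now it resolves to charabia.
use milli::tokenizer::{Token, TokenKind, TokenizerBuilder};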
61 changes: 27 additions & 34 deletions milli/src/search/matches/matching_words.rs
@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};

use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;

use crate::search::build_dfa;

Expand Down Expand Up @@ -99,13 +99,13 @@ impl MatchingWord {

/// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.text()) {
match self.dfa.eval(token.lemma()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
let len = bytes_to_highlight(token.text(), &self.word);
Some(token.num_chars_from_bytes(len))
let len = bytes_to_highlight(token.lemma(), &self.word);
Some(token.original_lengths(len).0)
} else {
Some(token.num_chars_from_bytes(token.text().len()))
Some(token.original_lengths(token.lemma().len()).0)
}
}
_otherwise => None,
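A hedged sketch of how the two new Token accessors are used above: lemma() exposes the normalized text the DFA is evaluated against, and original_lengths(n) maps a byte length within that lemma back to lengths in the original, pre-normalization text. The helper names below are hypothetical.

use charabia::Token;

// Characters of the original text covered by a match of `lemma_byte_len`
// bytes of the normalized lemma (the prefix-match case in match_token above).
fn matched_char_len(token: &Token, lemma_byte_len: usize) -> usize {
    // The first element of `original_lengths` is used as a char count above.
    token.original_lengths(lemma_byte_len).0
}

// Full-token match: cover the whole lemma.
fn full_match_char_len(token: &Token) -> usize {
    matched_char_len(token, token.lemma().len())
}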
@@ -262,7 +262,7 @@ mod tests {
use std::borrow::Cow;
use std::str::from_utf8;

use meilisearch_tokenizer::TokenKind;
use charabia::TokenKind;

use super::*;
use crate::MatchingWords;
@@ -344,11 +344,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("word"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("word"),
char_end: "word".chars().count(),
byte_end: "word".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
@@ -357,11 +356,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("nyc"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
char_map: None,
..Default::default()
})
.next(),
None
@@ -370,11 +368,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("world"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -383,11 +380,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("splitted"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("splitted"),
char_end: "splitted".chars().count(),
byte_end: "splitted".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
@@ -396,11 +392,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("thisnew"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
char_map: None,
..Default::default()
})
.next(),
None
@@ -409,11 +404,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("borld"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("borld"),
char_end: "borld".chars().count(),
byte_end: "borld".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -422,11 +416,10 @@ matching_words
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("wordsplit"),
byte_start: 0,
char_index: 0,
lemma: Cow::Borrowed("wordsplit"),
char_end: "wordsplit".chars().count(),
byte_end: "wordsplit".len(),
char_map: None,
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
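The updated tests build charabia tokens by hand; since the `..Default::default()` spread implies the new Token implements Default, only the relevant fields are spelled out. A minimal sketch of that pattern (the helper and its values are illustrative):

use std::borrow::Cow;
use charabia::{Token, TokenKind};

fn word_token(text: &str) -> Token<'_> {
    Token {
        kind: TokenKind::Word,
        lemma: Cow::Borrowed(text),
        char_end: text.chars().count(),
        byte_end: text.len(),
        // Remaining fields keep their defaults.
        ..Default::default()
    }
}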
