Merge #426
426: Fix search highlight for non-unicode chars r=ManyTheFish a=Samyak2

# Pull Request

## What does this PR do?
Fixes meilisearch/meilisearch#1480

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

## Changes

The `matching_bytes` function now takes a `&Token` and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function now returns the number of matching grapheme clusters instead of bytes.
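
For intuition, here is a minimal standalone sketch (plain Rust, hypothetical values, not the crate's `Token` type) of the byte-to-character conversion this relies on: a byte length within a multi-byte word is not a valid count of characters to highlight.

```rust
fn main() {
    let word = "ともだち"; // 4 characters, 12 bytes in UTF-8
    let matched_bytes = 6; // hypothetical: the query matched the prefix "とも"

    // Treating the byte length as a character count would highlight too much;
    // converting it to a character count gives the right span.
    let matched_chars = word[..matched_bytes].chars().count();
    assert_eq!(matched_chars, 2);

    let highlighted: String = word.chars().take(matched_chars).collect();
    assert_eq!(highlighted, "とも");
}
```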

Added proper highlighting in the HTTP UI:
- requires a dependency on `unicode-segmentation` to extract grapheme clusters from tokens
- the `<mark>` tag is put around only the matched part (see the sketch after this list)
    - before this change, the entire word was highlighted even if only a part of it matched
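
As a rough illustration of the partial-highlight idea, a minimal sketch using `unicode-segmentation` (the `highlight_prefix` helper is hypothetical; the actual change lives in the `Highlighter` shown in the diff below):

```rust
use unicode_segmentation::UnicodeSegmentation;

// Hypothetical helper: wrap only the first `n` grapheme clusters of `word`
// in a <mark> tag, leaving the unmatched suffix outside the tag.
fn highlight_prefix(word: &str, n: usize) -> String {
    let mut graphemes = word.graphemes(true);
    let matched: String = graphemes.by_ref().take(n).collect();
    // the iterator now sits after the matched part, so the rest is the suffix
    let suffix: String = graphemes.collect();
    format!("<mark>{matched}</mark>{suffix}")
}

fn main() {
    // only the matched prefix is wrapped, not the entire word
    assert_eq!(highlight_prefix("wordsplit", 4), "<mark>word</mark>split");
}
```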

## Questions

Since `matching_bytes` no longer returns a number of bytes but a number of grapheme clusters, should it be renamed to something like `matching_chars` or `matching_graphemes`? Will this break the API?

Thank you very much `@ManyTheFish` for helping 😄 

Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
bors[bot] and Samyak2 committed Jan 17, 2022
2 parents 15bbde1 + c0313f3 commit 4c516c0
Showing 4 changed files with 101 additions and 21 deletions.
2 changes: 1 addition & 1 deletion http-ui/Cargo.toml
```diff
@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"
```
20 changes: 13 additions & 7 deletions http-ui/src/main.rs
```diff
@@ -160,13 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
         let analyzed = self.analyzer.analyze(&old_string);
         for (word, token) in analyzed.reconstruct() {
             if token.is_word() {
-                let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                if to_highlight {
-                    string.push_str("<mark>")
-                }
-                string.push_str(word);
-                if to_highlight {
-                    string.push_str("</mark>")
+                match matching_words.matching_bytes(&token) {
+                    Some(chars_to_highlight) => {
+                        let mut chars = word.chars();
+
+                        string.push_str("<mark>");
+                        // push the part to highlight
+                        string.extend(chars.by_ref().take(chars_to_highlight));
+                        string.push_str("</mark>");
+                        // push the suffix after highlight
+                        string.extend(chars);
+                    }
+                    // no highlight
+                    None => string.push_str(word),
                 }
             } else {
                 string.push_str(word);
```
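Note the `chars.by_ref().take(chars_to_highlight)` pattern above: `by_ref` lets `take` consume only the matched prefix without giving up ownership of the iterator, so the later `string.extend(chars)` emits exactly the remaining, unhighlighted suffix.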
2 changes: 1 addition & 1 deletion milli/Cargo.toml
```diff
@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"
```
98 changes: 86 additions & 12 deletions milli/src/search/matching_words.rs
```diff
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,15 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
@@ -178,8 +179,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 
 #[cfg(test)]
 mod tests {
+    use std::borrow::Cow;
     use std::str::from_utf8;
 
+    use meilisearch_tokenizer::TokenKind;
+
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
@@ -269,12 +273,82 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }
```
