Skip to content

Commit

Permalink
Merge #287
Browse files Browse the repository at this point in the history
287: Add swedish recomposition normalizer and link it to a feature r=Kerollmops a=ManyTheFish

# Pull Request

- Add a Swedish normalizer that recomposes decomposed characters, avoiding diacritic removal from the letters and preserving the expected Swedish character ordering.
- Add a feature flag for it (not enabled by default)
- Trigger test in the CI


Co-authored-by: ManyTheFish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish committed Apr 30, 2024
2 parents 181f55c + b726e84 commit d970d91
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ jobs:
run: cargo test --verbose --features japanese-transliteration
- name: Run tests with chinese-normalization-pinyin on
run: cargo test --verbose --features chinese chinese-normalization-pinyin
- name: Run tests with swedish-recomposition on
run: cargo test --verbose --features swedish-recomposition
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

Expand Down
3 changes: 3 additions & 0 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ vietnamese = []
# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

# force Charabia to recompose Swedish characters
swedish-recomposition = []

[dev-dependencies]
criterion = "0.5"
jemallocator = "0.5.4"
Expand Down
6 changes: 6 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
Expand All @@ -33,13 +35,17 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "vietnamese")]
mod vietnamese;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(CompatibilityDecompositionNormalizer),
#[cfg(feature = "swedish-recomposition")]
Box::new(SwedishRecompositionNormalizer),
Box::new(ControlCharNormalizer),
Box::new(Classifier),
]
Expand Down
161 changes: 161 additions & 0 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use std::borrow::Cow;

use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};

/// Aho-Corasick automaton matching the decomposed (NFD) forms of the six
/// Swedish letters with diacritics: a base letter followed by U+030A
/// (combining ring above) or U+0308 (combining diaeresis).
static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
    let decomposed_forms = [
        "A\u{30a}", // Å
        "a\u{30a}", // å
        "A\u{308}", // Ä
        "a\u{308}", // ä
        "O\u{308}", // Ö
        "o\u{308}", // ö
    ];
    AhoCorasick::new(&decomposed_forms).unwrap()
});

/// Swedish specialized [`Normalizer`].
///
/// This Normalizer recomposes Swedish characters containing diacritics
/// (å, ä, ö and their uppercase forms) when they appear as a base letter
/// followed by a combining mark (see `MATCHING_STR`).
///
/// This avoids the diacritic removal from the letter and preserves the
/// expected Swedish character ordering, where å, ä, ö sort after z.
pub struct SwedishRecompositionNormalizer;

impl Normalizer for SwedishRecompositionNormalizer {
    /// Recomposes decomposed Swedish characters (e.g. `a` + U+030A -> `å`)
    /// in the token's lemma.
    ///
    /// `normalize` is overridden directly (rather than a per-char hook)
    /// because recomposition consumes TWO input chars (base letter +
    /// combining mark) to produce ONE output char, so the char_map
    /// bookkeeping must be done by hand.
    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
        match token.char_map.take() {
            Some(mut char_map) => {
                // A char_map already exists: iterate over it to reconstruct sub-strings.
                let mut lemma = String::new();
                let mut tail = token.lemma.as_ref();
                let mut normalized = String::new();
                for (_, normalized_len) in char_map.iter_mut() {
                    let (head, t) = tail.split_at(*normalized_len as usize);
                    tail = t;
                    normalized.clear();
                    // Normalize each sub-string, recomputing its size in the char_map.
                    let mut peekable = head.chars().peekable();
                    while let Some(c) = peekable.next() {
                        let (c, peek_consumed) = recompose_swedish(c, peekable.peek());
                        if peek_consumed {
                            // The combining mark was merged into `c`; skip it.
                            peekable.next();
                        }

                        normalized.push(c);
                    }

                    *normalized_len = normalized.len() as u8;
                    lemma.push_str(normalized.as_ref());
                }

                token.lemma = Cow::Owned(lemma);
                token.char_map = Some(char_map);
            }
            None => {
                // No char_map exists: iterate over the lemma recomposing characters.
                let mut char_map = Vec::new();
                let mut lemma = String::new();
                let mut peekable = token.lemma.chars().peekable();
                while let Some(c) = peekable.next() {
                    let (normalized, peek_consumed) = recompose_swedish(c, peekable.peek());
                    // Number of original bytes covered by this char_map entry.
                    let mut original_len = c.len_utf8();
                    if peek_consumed {
                        // FIX: the consumed combining mark's bytes belong to this
                        // entry too. Previously they were dropped, so the sum of
                        // the char_map's original lengths came up short of the
                        // original lemma's byte length whenever a recomposition
                        // happened (e.g. "a\u{30a}" is 3 bytes but was recorded
                        // as 1), corrupting downstream offset mapping.
                        if let Some(consumed) = peekable.next() {
                            original_len += consumed.len_utf8();
                        }
                    }

                    if options.create_char_map {
                        char_map.push((original_len as u8, normalized.len_utf8() as u8));
                    }
                    lemma.push(normalized);
                }
                token.lemma = Cow::Owned(lemma);
                if options.create_char_map {
                    token.char_map = Some(char_map);
                }
            }
        }

        token
    }

    // Returns `true` if the Normalizer should be used: only Latin-script
    // tokens containing at least one decomposed Swedish sequence.
    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
    }
}

/// Tries to recompose `current` with the (peeked) following character into a
/// single Swedish letter with a diacritic.
///
/// Returns the resulting character and a flag telling the caller whether the
/// peeked character was consumed by the recomposition.
fn recompose_swedish(current: char, next: Option<&char>) -> (char, bool) {
    let recomposed = match (current, next.copied()) {
        ('A', Some('\u{30a}')) => Some('Å'),
        ('a', Some('\u{30a}')) => Some('å'),
        ('A', Some('\u{308}')) => Some('Ä'),
        ('a', Some('\u{308}')) => Some('ä'),
        ('O', Some('\u{308}')) => Some('Ö'),
        ('o', Some('\u{308}')) => Some('ö'),
        _ => None,
    };

    match recomposed {
        Some(c) => (c, true),
        None => (current, false),
    }
}

// Test the normalizer:
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::Normalizer;
    use crate::token::TokenKind;
    // NOTE(review): `Token` and `Script` are referenced below but not
    // imported in this module — presumably they reach scope through the
    // `test_normalizer!` macro expansion or a prelude; confirm this module
    // compiles with the `swedish-recomposition` feature enabled.

    // Base tokens to normalize: composed Swedish characters (å/ä/ö), which
    // do NOT contain the decomposed sequences `MATCHING_STR` looks for.
    fn tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
            char_end: 13,
            byte_end: 19,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // Expected result of the current Normalizer alone: unchanged, since the
    // input is already composed so `should_normalize` rejects it.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![Token {
            // lowercased
            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
            char_end: 13,
            byte_end: 19,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // Expected result of the complete Normalizer pipeline: lowercased, with
    // the diacritics preserved (each 2-byte composed char maps to a 2-byte
    // composed char in the char_map).
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("öpåscålcäsäös".to_string()),
            char_end: 13,
            byte_end: 19,
            char_map: Some(vec![
                (2, 2),
                (1, 1),
                (2, 2),
                (1, 1),
                (1, 1),
                (2, 2),
                (1, 1),
                (1, 1),
                (2, 2),
                (1, 1),
                (2, 2),
                (2, 2),
                (1, 1),
            ]),
            script: Script::Latin,
            kind: TokenKind::Word,
            ..Default::default()
        }]
    }

    test_normalizer!(
        SwedishRecompositionNormalizer,
        tokens(),
        normalizer_result(),
        normalized_tokens()
    );
}

0 comments on commit d970d91

Please sign in to comment.