Skip to content

Commit

Permalink
Merge #287
Browse files Browse the repository at this point in the history
287: Add swedish recomposition normalizer and link it to a feature r=Kerollmops a=ManyTheFish

# Pull Request

- Add a Swedish normalizer that recomposes decomposed characters, avoiding diacritic removal from the letters and preserving the expected Swedish character ordering.
- Add a feature flag for it (not enabled by default)
- Trigger test in the CI


Co-authored-by: ManyTheFish <many@meilisearch.com>
  • Loading branch information
meili-bors[bot] and ManyTheFish committed Apr 30, 2024
2 parents 181f55c + b726e84 commit d970d91
Show file tree
Hide file tree
Showing 4 changed files with 172 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/rust.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ jobs:
run: cargo test --verbose --features japanese-transliteration
- name: Run tests with chinese-normalization-pinyin on
run: cargo test --verbose --features chinese chinese-normalization-pinyin
- name: Run tests with swedish-recomposition on
run: cargo test --verbose --features swedish-recomposition
- name: Run irg-kvariants tests
run: cargo test -p irg-kvariants --verbose

Expand Down
3 changes: 3 additions & 0 deletions charabia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@ vietnamese = []
# allow splitting snake_case latin words
latin-snakecase = ["dep:finl_unicode"]

# force Charabia to recompose Swedish characters
swedish-recomposition = []

[dev-dependencies]
criterion = "0.5"
jemallocator = "0.5.4"
Expand Down
6 changes: 6 additions & 0 deletions charabia/src/normalizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
pub use self::lowercase::LowercaseNormalizer;
use self::nonspacing_mark::NonspacingMarkNormalizer;
use self::quote::QuoteNormalizer;
#[cfg(feature = "swedish-recomposition")]
use self::swedish_recomposition::SwedishRecompositionNormalizer;
#[cfg(feature = "vietnamese")]
pub use self::vietnamese::VietnameseNormalizer;
use crate::segmenter::SegmentedTokenIter;
Expand All @@ -33,13 +35,17 @@ mod japanese;
mod lowercase;
mod nonspacing_mark;
mod quote;
#[cfg(feature = "swedish-recomposition")]
mod swedish_recomposition;
#[cfg(feature = "vietnamese")]
mod vietnamese;

/// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
vec![
Box::new(CompatibilityDecompositionNormalizer),
#[cfg(feature = "swedish-recomposition")]
Box::new(SwedishRecompositionNormalizer),
Box::new(ControlCharNormalizer),
Box::new(Classifier),
]
Expand Down
161 changes: 161 additions & 0 deletions charabia/src/normalizer/swedish_recomposition.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use std::borrow::Cow;

use aho_corasick::AhoCorasick;
use once_cell::sync::Lazy;

use super::Normalizer;
use crate::normalizer::NormalizerOption;
use crate::{Script, Token};

/// Aho-Corasick automaton matching the decomposed (NFD) forms of the six
/// Swedish letters with diacritics: a base letter followed by U+030A
/// (combining ring above) or U+0308 (combining diaeresis).
static MATCHING_STR: Lazy<AhoCorasick> = Lazy::new(|| {
    let decomposed_forms = [
        "A\u{30a}", // Å
        "a\u{30a}", // å
        "A\u{308}", // Ä
        "a\u{308}", // ä
        "O\u{308}", // Ö
        "o\u{308}", // ö
    ];
    AhoCorasick::new(&decomposed_forms).unwrap()
});

/// Swedish specialized [`Normalizer`].
///
/// This Normalizer recomposes Swedish characters containing diacritics
/// (å, ä, ö and their uppercase forms) when they appear as a base letter
/// followed by a combining mark (see `MATCHING_STR`).
///
/// This avoids the diacritic removal from the letter and preserves the
/// expected Swedish character ordering, where å, ä, ö sort after z.
pub struct SwedishRecompositionNormalizer;

impl Normalizer for SwedishRecompositionNormalizer {
    /// Recomposes decomposed Swedish characters (e.g. `a` + U+030A -> `å`)
    /// in the token's lemma.
    ///
    /// `normalize` is overridden directly (rather than a per-char hook)
    /// because recomposition consumes TWO input chars (base letter +
    /// combining mark) to produce ONE output char, so the char_map
    /// bookkeeping must be done by hand.
    fn normalize<'o>(&self, mut token: Token<'o>, options: &NormalizerOption) -> Token<'o> {
        match token.char_map.take() {
            Some(mut char_map) => {
                // A char_map already exists: iterate over it to reconstruct sub-strings.
                let mut lemma = String::new();
                let mut tail = token.lemma.as_ref();
                let mut normalized = String::new();
                for (_, normalized_len) in char_map.iter_mut() {
                    let (head, t) = tail.split_at(*normalized_len as usize);
                    tail = t;
                    normalized.clear();
                    // Normalize each sub-string, recomputing its size in the char_map.
                    let mut peekable = head.chars().peekable();
                    while let Some(c) = peekable.next() {
                        let (c, peek_consumed) = recompose_swedish(c, peekable.peek());
                        if peek_consumed {
                            // The combining mark was merged into `c`; skip it.
                            peekable.next();
                        }

                        normalized.push(c);
                    }

                    *normalized_len = normalized.len() as u8;
                    lemma.push_str(normalized.as_ref());
                }

                token.lemma = Cow::Owned(lemma);
                token.char_map = Some(char_map);
            }
            None => {
                // No char_map exists: iterate over the lemma recomposing characters.
                let mut char_map = Vec::new();
                let mut lemma = String::new();
                let mut peekable = token.lemma.chars().peekable();
                while let Some(c) = peekable.next() {
                    let (normalized, peek_consumed) = recompose_swedish(c, peekable.peek());
                    // Number of original bytes covered by this char_map entry.
                    let mut original_len = c.len_utf8();
                    if peek_consumed {
                        // FIX: the consumed combining mark's bytes belong to this
                        // entry too. Previously they were dropped, so the sum of
                        // the char_map's original lengths came up short of the
                        // original lemma's byte length whenever a recomposition
                        // happened (e.g. "a\u{30a}" is 3 bytes but was recorded
                        // as 1), corrupting downstream offset mapping.
                        if let Some(consumed) = peekable.next() {
                            original_len += consumed.len_utf8();
                        }
                    }

                    if options.create_char_map {
                        char_map.push((original_len as u8, normalized.len_utf8() as u8));
                    }
                    lemma.push(normalized);
                }
                token.lemma = Cow::Owned(lemma);
                if options.create_char_map {
                    token.char_map = Some(char_map);
                }
            }
        }

        token
    }

    // Returns `true` if the Normalizer should be used: only Latin-script
    // tokens containing at least one decomposed Swedish sequence.
    fn should_normalize(&self, token: &Token) -> bool {
        token.script == Script::Latin && MATCHING_STR.is_match(token.lemma())
    }
}

/// Tries to recompose `current` with the (peeked) following character into a
/// single Swedish letter with a diacritic.
///
/// Returns the resulting character and a flag telling the caller whether the
/// peeked character was consumed by the recomposition.
fn recompose_swedish(current: char, next: Option<&char>) -> (char, bool) {
    let recomposed = match (current, next.copied()) {
        ('A', Some('\u{30a}')) => Some('Å'),
        ('a', Some('\u{30a}')) => Some('å'),
        ('A', Some('\u{308}')) => Some('Ä'),
        ('a', Some('\u{308}')) => Some('ä'),
        ('O', Some('\u{308}')) => Some('Ö'),
        ('o', Some('\u{308}')) => Some('ö'),
        _ => None,
    };

    match recomposed {
        Some(c) => (c, true),
        None => (current, false),
    }
}

// Test the normalizer:
#[cfg(test)]
mod test {
    use std::borrow::Cow::Owned;

    use crate::normalizer::test::test_normalizer;
    use crate::normalizer::Normalizer;
    use crate::token::TokenKind;
    // NOTE(review): `Token` and `Script` are referenced below but not
    // imported in this module — presumably they reach scope through the
    // `test_normalizer!` macro expansion or a prelude; confirm this module
    // compiles with the `swedish-recomposition` feature enabled.

    // Base tokens to normalize: composed Swedish characters (å/ä/ö), which
    // do NOT contain the decomposed sequences `MATCHING_STR` looks for.
    fn tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
            char_end: 13,
            byte_end: 19,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // Expected result of the current Normalizer alone: unchanged, since the
    // input is already composed so `should_normalize` rejects it.
    fn normalizer_result() -> Vec<Token<'static>> {
        vec![Token {
            // lowercased
            lemma: Owned("öpÅscålcäsÄÖs".to_string()),
            char_end: 13,
            byte_end: 19,
            script: Script::Latin,
            ..Default::default()
        }]
    }

    // Expected result of the complete Normalizer pipeline: lowercased, with
    // the diacritics preserved (each 2-byte composed char maps to a 2-byte
    // composed char in the char_map).
    fn normalized_tokens() -> Vec<Token<'static>> {
        vec![Token {
            lemma: Owned("öpåscålcäsäös".to_string()),
            char_end: 13,
            byte_end: 19,
            char_map: Some(vec![
                (2, 2),
                (1, 1),
                (2, 2),
                (1, 1),
                (1, 1),
                (2, 2),
                (1, 1),
                (1, 1),
                (2, 2),
                (1, 1),
                (2, 2),
                (2, 2),
                (1, 1),
            ]),
            script: Script::Latin,
            kind: TokenKind::Word,
            ..Default::default()
        }]
    }

    test_normalizer!(
        SwedishRecompositionNormalizer,
        tokens(),
        normalizer_result(),
        normalized_tokens()
    );
}

0 comments on commit d970d91

Please sign in to comment.