meilisearch · meili-bors · Jun 7, 2023 · Jun 7, 2023
diff --git a/milli/src/index.rs b/milli/src/index.rs
@@ -21,10 +21,9 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::{
-    default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
-    DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
-    FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, U8StrStrCodec, BEU16, BEU32,
+    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
+    FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
+    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -111,9 +110,6 @@ pub struct Index {
     /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
     pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
 
-    /// Maps a word and a document id (u32) to all the positions where the given word appears.
-    pub docid_word_positions: Database<BEU32StrCodec, BoRoaringBitmapCodec>,
-
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
     /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
@@ -177,7 +173,6 @@ impl Index {
         let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
         let exact_word_prefix_docids =
             env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
-        let docid_word_positions = env.create_database(&mut wtxn, Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids =
             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let script_language_docids =
@@ -220,7 +215,6 @@ impl Index {
             exact_word_docids,
             word_prefix_docids,
             exact_word_prefix_docids,
-            docid_word_positions,
             word_pair_proximity_docids,
             script_language_docids,
             word_prefix_pair_proximity_docids,

diff --git a/milli/src/lib.rs b/milli/src/lib.rs
@@ -5,52 +5,6 @@
 #[global_allocator]
 pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
 
-// #[cfg(test)]
-// pub mod allocator {
-//     use std::alloc::{GlobalAlloc, System};
-//     use std::sync::atomic::{self, AtomicI64};
-
-//     #[global_allocator]
-//     pub static ALLOC: CountingAlloc = CountingAlloc {
-//         max_resident: AtomicI64::new(0),
-//         resident: AtomicI64::new(0),
-//         allocated: AtomicI64::new(0),
-//     };
-
-//     pub struct CountingAlloc {
-//         pub max_resident: AtomicI64,
-//         pub resident: AtomicI64,
-//         pub allocated: AtomicI64,
-//     }
-//     unsafe impl GlobalAlloc for CountingAlloc {
-//         unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 {
-//             self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
-//             let old_resident =
-//                 self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst);
-
-//             let resident = old_resident + layout.size() as i64;
-//             self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst);
-
-//             // if layout.size() > 1_000_000 {
-//             //     eprintln!(
-//             //         "allocating {} with new resident size: {resident}",
-//             //         layout.size() / 1_000_000
-//             //     );
-//             //     // let trace = std::backtrace::Backtrace::capture();
-//             //     // let t = trace.to_string();
-//             //     // eprintln!("{t}");
-//             // }
-
-//             System.alloc(layout)
-//         }
-
-//         unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) {
-//             self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed);
-//             System.dealloc(ptr, layout)
-//         }
-//     }
-// }
-
 #[macro_use]
 pub mod documents;
 

diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
@@ -89,7 +89,6 @@ Create a snapshot test of the given database.
     - `exact_word_docids`
     - `word_prefix_docids`
     - `exact_word_prefix_docids`
-    - `docid_word_positions`
     - `word_pair_proximity_docids`
     - `word_prefix_pair_proximity_docids`
     - `word_position_docids`
@@ -217,11 +216,6 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String {
         &format!("{s:<16} {}", display_bitmap(&b))
     })
 }
-pub fn snap_docid_word_positions(index: &Index) -> String {
-    make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| {
-        &format!("{idx:<6} {s:<16} {}", display_bitmap(&b))
-    })
-}
 pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| {
         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
@@ -477,9 +471,6 @@ macro_rules! full_snap_of_db {
     ($index:ident, exact_word_prefix_docids) => {{
         $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index)
     }};
-    ($index:ident, docid_word_positions) => {{
-        $crate::snapshot_tests::snap_docid_word_positions(&$index)
-    }};
     ($index:ident, word_pair_proximity_docids) => {{
         $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index)
     }};

diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
@@ -23,7 +23,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             exact_word_docids,
             word_prefix_docids,
             exact_word_prefix_docids,
-            docid_word_positions,
             word_pair_proximity_docids,
             word_prefix_pair_proximity_docids,
             prefix_word_pair_proximity_docids,
@@ -80,7 +79,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         exact_word_docids.clear(self.wtxn)?;
         word_prefix_docids.clear(self.wtxn)?;
         exact_word_prefix_docids.clear(self.wtxn)?;
-        docid_word_positions.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
         word_prefix_pair_proximity_docids.clear(self.wtxn)?;
         prefix_word_pair_proximity_docids.clear(self.wtxn)?;
@@ -141,7 +139,6 @@ mod tests {
 
         assert!(index.word_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
-        assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
@@ -1,5 +1,5 @@
 use std::collections::btree_map::Entry;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 
 use fst::IntoStreamer;
 use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
@@ -15,8 +15,7 @@ use crate::facet::FacetType;
 use crate::heed_codec::facet::FieldDocIdFacetCodec;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::{
-    ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec,
-    SmallString32, BEU32,
+    ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
 };
 
 pub struct DeleteDocuments<'t, 'u, 'i> {
@@ -232,7 +231,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             exact_word_docids,
             word_prefix_docids,
             exact_word_prefix_docids,
-            docid_word_positions,
             word_pair_proximity_docids,
             field_id_word_count_docids,
             word_prefix_pair_proximity_docids,
@@ -251,23 +249,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             facet_id_is_empty_docids,
             documents,
         } = self.index;
-
-        // Retrieve the words contained in the documents.
-        let mut words = Vec::new();
+        // Remove from the documents database
         for docid in &self.to_delete_docids {
             documents.delete(self.wtxn, &BEU32::new(docid))?;
-
-            // We iterate through the words positions of the document id, retrieve the word and delete the positions.
-            // We create an iterator to be able to get the content and delete the key-value itself.
-            // It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once.
-            let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
-            while let Some(result) = iter.next() {
-                let ((_docid, word), _positions) = result?;
-                // This boolean will indicate if we must remove this word from the words FST.
-                words.push((SmallString32::from(word), false));
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            }
         }
         // We acquire the current external documents ids map...
         // Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
@@ -278,42 +262,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         let new_external_documents_ids = new_external_documents_ids.into_static();
         self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
 
-        // Maybe we can improve the get performance of the words
-        // if we sort the words first, keeping the LMDB pages in cache.
-        words.sort_unstable();
-
+        let mut words_to_keep = BTreeSet::default();
+        let mut words_to_delete = BTreeSet::default();
         // We iterate over the words and delete the documents ids
         // from the word docids database.
-        for (word, must_remove) in &mut words {
-            remove_from_word_docids(
-                self.wtxn,
-                word_docids,
-                word.as_str(),
-                must_remove,
-                &self.to_delete_docids,
-            )?;
-
-            remove_from_word_docids(
-                self.wtxn,
-                exact_word_docids,
-                word.as_str(),
-                must_remove,
-                &self.to_delete_docids,
-            )?;
-        }
+        remove_from_word_docids(
+            self.wtxn,
+            word_docids,
+            &self.to_delete_docids,
+            &mut words_to_keep,
+            &mut words_to_delete,
+        )?;
+        remove_from_word_docids(
+            self.wtxn,
+            exact_word_docids,
+            &self.to_delete_docids,
+            &mut words_to_keep,
+            &mut words_to_delete,
+        )?;
 
         // We construct an FST set that contains the words to delete from the words FST.
-        let words_to_delete =
-            words.iter().filter_map(
-                |(word, must_remove)| {
-                    if *must_remove {
-                        Some(word.as_str())
-                    } else {
-                        None
-                    }
-                },
-            );
-        let words_to_delete = fst::Set::from_iter(words_to_delete)?;
+        let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
 
         let new_words_fst = {
             // We retrieve the current words FST from the database.
@@ -532,23 +501,24 @@ fn remove_from_word_prefix_docids(
 fn remove_from_word_docids(
     txn: &mut heed::RwTxn,
     db: &heed::Database<Str, RoaringBitmapCodec>,
-    word: &str,
-    must_remove: &mut bool,
     to_remove: &RoaringBitmap,
+    words_to_keep: &mut BTreeSet<String>,
+    words_to_remove: &mut BTreeSet<String>,
 ) -> Result<()> {
     // We create an iterator to be able to get the content and delete the word docids.
     // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
     // the LMDB B-Tree two times but only once.
-    let mut iter = db.prefix_iter_mut(txn, word)?;
-    if let Some((key, mut docids)) = iter.next().transpose()? {
-        if key == word {
-            let previous_len = docids.len();
-            docids -= to_remove;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-                *must_remove = true;
-            } else if docids.len() != previous_len {
+    let mut iter = db.iter_mut(txn)?;
+    while let Some((key, mut docids)) = iter.next().transpose()? {
+        let previous_len = docids.len();
+        docids -= to_remove;
+        if docids.is_empty() {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.del_current()? };
+            words_to_remove.insert(key.to_owned());
+        } else {
+            words_to_keep.insert(key.to_owned());
+            if docids.len() != previous_len {
                 let key = key.to_owned();
                 // safety: we don't keep references from inside the LMDB database.
                 unsafe { iter.put_current(&key, &docids)? };
@@ -627,7 +597,7 @@ mod tests {
 
     use super::*;
     use crate::index::tests::TempIndex;
-    use crate::{db_snap, Filter};
+    use crate::{db_snap, Filter, Search};
 
     fn delete_documents<'t>(
         wtxn: &mut RwTxn<'t, '_>,
@@ -1199,4 +1169,52 @@ mod tests {
             DeletionStrategy::AlwaysSoft,
         );
     }
+
+    #[test]
+    fn delete_words_exact_attributes() {
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_primary_key(S("id"));
+                settings.set_searchable_fields(vec![S("text"), S("exact")]);
+                settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
+            })
+            .unwrap();
+
+        index
+            .add_documents(documents!([
+                { "id": 0, "text": "hello" },
+                { "id": 1, "exact": "hello"}
+            ]))
+            .unwrap();
+        db_snap!(index, word_docids, 1, @r###"
+        hello            [0, ]
+        "###);
+        db_snap!(index, exact_word_docids, 1, @r###"
+        hello            [1, ]
+        "###);
+        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
+
+        let mut wtxn = index.write_txn().unwrap();
+        let deleted_internal_ids =
+            delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
+        wtxn.commit().unwrap();
+
+        db_snap!(index, word_docids, 2, @r###"
+        hello            [0, ]
+        "###);
+        db_snap!(index, exact_word_docids, 2, @"");
+        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
+
+        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
+        let txn = index.read_txn().unwrap();
+        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
+        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
+
+        let mut s = Search::new(&txn, &index);
+        s.query("hello");
+        let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
+        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+    }
 }
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
@@ -325,8 +325,6 @@ fn send_and_extract_flattened_documents_data(
                 // send docid_word_positions_chunk to DB writer
                 let docid_word_positions_chunk =
                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
-                let _ = lmdb_writer_sx
-                    .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone())));
 
                 let _ =
                     lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair)));

diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -4,7 +4,6 @@ use std::result::Result as StdResult;
 
 use roaring::RoaringBitmap;
 
-use super::read_u32_ne_bytes;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::update::index_documents::transform::Operation;
 use crate::Result;
@@ -22,10 +21,6 @@ pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Co
     }
 }
 
-pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
-    read_u32_ne_bytes(slice).collect()
-}
-
 pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
     buffer.clear();
     buffer.reserve(bitmap.serialized_size());

diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
@@ -14,8 +14,8 @@ pub use grenad_helpers::{
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
-    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
-    roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
+    merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap,
+    MergeFn,
 };
 
 use crate::MAX_WORD_LENGTH;