diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 64b294541..2cecd1abe 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -47,30 +47,6 @@ impl<'a> ExternalDocumentsIds<'a> { } } - pub fn delete_ids>(&mut self, other: fst::Set) -> fst::Result<()> { - let other = fst::Map::from(other.into_fst()); - let union_op = self.soft.op().add(&other).r#union(); - - let mut iter = union_op.into_stream(); - let mut new_soft_builder = fst::MapBuilder::memory(); - while let Some((external_id, docids)) = iter.next() { - if docids.iter().any(|v| v.index == 1) { - // If the `other` set returns a value here it means - // that it must be marked as deleted. - new_soft_builder.insert(external_id, DELETED_ID)?; - } else { - let value = docids.iter().find(|v| v.index == 0).unwrap().value; - new_soft_builder.insert(external_id, value)?; - } - } - - drop(iter); - - // We save this new map as the new soft map. - self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; - self.merge_soft_into_hard() - } - /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they /// don't contain any soft deleted document id. pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { @@ -173,76 +149,3 @@ impl Default for ExternalDocumentsIds<'static> { fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option { indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn simple_insert_delete_ids() { - let mut external_documents_ids = ExternalDocumentsIds::default(); - - let new_ids = fst::Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3), ("d", 4)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(1)); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), Some(3)); - assert_eq!(external_documents_ids.get("d"), Some(4)); - - let new_ids = fst::Map::from_iter(vec![("e", 5), ("f", 6), ("g", 7)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(1)); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), Some(3)); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), Some(6)); - assert_eq!(external_documents_ids.get("g"), Some(7)); - - let del_ids = fst::Set::from_iter(vec!["a", "c", "f"]).unwrap(); - external_documents_ids.delete_ids(del_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), None); - assert_eq!(external_documents_ids.get("b"), Some(2)); - assert_eq!(external_documents_ids.get("c"), None); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), None); - assert_eq!(external_documents_ids.get("g"), Some(7)); - - let new_ids = fst::Map::from_iter(vec![("a", 5), ("b", 6), ("h", 8)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - - assert_eq!(external_documents_ids.get("a"), Some(5)); - assert_eq!(external_documents_ids.get("b"), Some(6)); - assert_eq!(external_documents_ids.get("c"), None); - assert_eq!(external_documents_ids.get("d"), Some(4)); - assert_eq!(external_documents_ids.get("e"), Some(5)); - assert_eq!(external_documents_ids.get("f"), None); - assert_eq!(external_documents_ids.get("g"), Some(7)); - assert_eq!(external_documents_ids.get("h"), Some(8)); - } - - #[test] - fn strange_delete_insert_ids() { - let mut external_documents_ids = ExternalDocumentsIds::default(); - - let new_ids = - fst::Map::from_iter(vec![("1", 0), ("123", 1), ("30", 2), ("456", 3)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - assert_eq!(external_documents_ids.get("1"), Some(0)); - assert_eq!(external_documents_ids.get("123"), Some(1)); - assert_eq!(external_documents_ids.get("30"), Some(2)); - assert_eq!(external_documents_ids.get("456"), Some(3)); - - let deleted_ids = fst::Set::from_iter(vec!["30"]).unwrap(); - external_documents_ids.delete_ids(deleted_ids).unwrap(); - assert_eq!(external_documents_ids.get("30"), None); - - let new_ids = fst::Map::from_iter(vec![("30", 2)]).unwrap(); - external_documents_ids.insert_ids(&new_ids).unwrap(); - assert_eq!(external_documents_ids.get("30"), Some(2)); - } -} diff --git a/milli/src/index.rs b/milli/src/index.rs index 48b892c77..19ecf9d33 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1181,6 +1181,7 @@ impl Index { #[cfg(test)] pub(crate) mod tests { + use std::collections::HashSet; use std::ops::Deref; use big_s::S; @@ -1195,7 +1196,7 @@ pub(crate) mod tests { self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; - use crate::{db_snap, obkv_to_json, Index}; + use crate::{db_snap, obkv_to_json, Index, Search, SearchResult}; pub(crate) struct TempIndex { pub inner: Index, @@ -2187,4 +2188,96 @@ pub(crate) mod tests { "###); db_snap!(index, soft_deleted_documents_ids, 2, @"[]"); } + + #[test] + fn bug_3021_fourth() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 11 }, + { "primary_key": 4 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 11 0 + 4 1 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 0 }, + { "primary_key": 1 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[1, ]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.disable_soft_deletion(true); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, 2, 3, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 3 + 11 0 + 4 2 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[]"); + + index + .add_documents(documents!([ + { "primary_key": 4, "a": 1 }, + { "primary_key": 1, "a": 0 }, + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 4, ]"); + db_snap!(index, external_documents_ids, @r###" + soft: + hard: + 1 4 + 11 0 + 4 1 + "###); + db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]"); + + let rtxn = index.read_txn().unwrap(); + let search = Search::new(&rtxn, &index); + let SearchResult { matching_words: _, candidates: _, mut documents_ids } = + search.execute().unwrap(); + let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); + documents_ids.sort_unstable(); + let docs = index.documents(&rtxn, documents_ids).unwrap(); + let mut all_ids = HashSet::new(); + for (_docid, obkv) in docs { + let id = obkv.get(primary_key_id).unwrap(); + assert!(all_ids.insert(id)); + } + } } diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 6c0f66685..f6fe627d6 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -6,16 +6,14 @@ use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use serde_json::Value; use time::OffsetDateTime; use super::facet::delete::FacetsDelete; use super::ClearDocuments; -use crate::error::{InternalError, UserError}; +use crate::error::InternalError; use crate::facet::FacetType; use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::index::{db_name, main_key}; use crate::{ ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, @@ -169,6 +167,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let percentage_used_by_soft_deleted_documents = estimated_size_used_by_soft_deleted * 100 / map_size; + // We always soft-delete the documents, even if they will be permanently + // deleted immediately after. + self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; + // if we have more than 10% of disk space available and the soft deleted // documents uses less than 10% of the total space available, // we skip the deletion. Eg. @@ -183,7 +185,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { && percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { - self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -191,23 +192,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { }); } - // There is more than documents to delete than the threshold we needs to delete them all self.to_delete_docids = soft_deleted_docids; - // and we can reset the soft deleted bitmap - self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; - - let primary_key = - self.index.primary_key(self.wtxn)?.ok_or(InternalError::DatabaseMissingEntry { - db_name: db_name::MAIN, - key: Some(main_key::PRIMARY_KEY_KEY), - })?; - - // Since we already checked if the DB was empty, if we can't find the primary key, then - // something is wrong, and we must return an error. - let id_field = match fields_ids_map.id(primary_key) { - Some(field) => field, - None => return Err(UserError::MissingPrimaryKey.into()), - }; let Index { env: _env, @@ -231,33 +216,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { documents, } = self.index; - // Retrieve the words and the external documents ids contained in the documents. + // Retrieve the words contained in the documents. let mut words = Vec::new(); - let mut external_ids = Vec::new(); for docid in &self.to_delete_docids { - // We create an iterator to be able to get the content and delete the document - // content itself. It's faster to acquire a cursor to get and delete, - // as we avoid traversing the LMDB B-Tree two times but only once. - let key = BEU32::new(docid); - let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; - if let Some((_key, obkv)) = iter.next().transpose()? { - if let Some(content) = obkv.get(id_field) { - let external_id = match serde_json::from_slice(content).unwrap() { - Value::String(string) => SmallString32::from(string.as_str()), - Value::Number(number) => SmallString32::from(number.to_string()), - document_id => { - return Err(UserError::InvalidDocumentId { document_id }.into()) - } - }; - external_ids.push(external_id); - } - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } - drop(iter); + documents.delete(self.wtxn, &BEU32::new(docid))?; - // We iterate through the words positions of the document id, - // retrieve the word and delete the positions. + // We iterate through the words positions of the document id, retrieve the word and delete the positions. + // We create an iterator to be able to get the content and delete the key-value itself. + // It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once. let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?; while let Some(result) = iter.next() { let ((_docid, word), _positions) = result?; @@ -267,17 +233,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { unsafe { iter.del_current()? }; } } - - // We create the FST map of the external ids that we must delete. - external_ids.sort_unstable(); - let external_ids_to_delete = fst::Set::from_iter(external_ids)?; - // We acquire the current external documents ids map... + // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?; - // ...and remove the to-delete external ids. - new_external_documents_ids.delete_ids(external_ids_to_delete)?; - - // We write the new external ids into the main database. + // We then remove the soft-deleted docids from it + new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + // and write it back to the main database. let new_external_documents_ids = new_external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; @@ -514,6 +475,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; + self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; + Ok(DetailedDocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -1085,15 +1048,16 @@ mod tests { id ); } + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); // get internal docids from deleted external document ids - let results = index.external_documents_ids(&wtxn).unwrap(); + let results = index.external_documents_ids(&rtxn).unwrap(); for id in deleted_external_ids { assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id); } - - wtxn.commit().unwrap(); - + drop(rtxn); db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); }