Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #557 #572
Browse files Browse the repository at this point in the history
557: Fasten documents deletion and update r=Kerollmops a=irevoire

When a document deletion occurs, instead of deleting the document we mark it as deleted in the new “soft deleted” bitmap. It is then removed from the search and all the other endpoints.

I ran the benchmarks against main:
```
% ./compare.sh indexing_main_83ad1aaf.json indexing_fasten-document-deletion_abab51fb.json
group                                                                     indexing_fasten-document-deletion_abab51fb    indexing_main_83ad1aaf
-----                                                                     ------------------------------------------    ----------------------
indexing/-geo-delete-facetedNumber-facetedGeo-searchable-                 1.05      2.0±0.40ms        ? ?/sec           1.00  1904.9±190.00µs        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-           1.00     10.3±2.64ms        ? ?/sec           961.61      9.9±0.12s        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-nested-    1.00     15.1±3.90ms        ? ?/sec           554.63      8.4±0.12s        ? ?/sec
indexing/-songs-delete-facetedString-facetedNumber-searchable-            1.00     45.1±7.53ms        ? ?/sec           710.15     32.0±0.10s        ? ?/sec
indexing/-wiki-delete-searchable-                                         1.00    277.8±7.97ms        ? ?/sec           1946.57    540.8±3.15s        ? ?/sec
indexing/Indexing geo_point                                               1.00      12.0±0.20s        ? ?/sec           1.03      12.4±0.19s        ? ?/sec
indexing/Indexing movies in three batches                                 1.00      19.3±0.30s        ? ?/sec           1.01      19.4±0.16s        ? ?/sec
indexing/Indexing movies with default settings                            1.00      18.8±0.09s        ? ?/sec           1.00      18.9±0.10s        ? ?/sec
indexing/Indexing nested movies with default settings                     1.00      25.9±0.19s        ? ?/sec           1.00      25.9±0.12s        ? ?/sec
indexing/Indexing nested movies without any facets                        1.00      24.8±0.17s        ? ?/sec           1.00      24.8±0.18s        ? ?/sec
indexing/Indexing songs in three batches with default settings            1.00      65.9±0.96s        ? ?/sec           1.03      67.8±0.82s        ? ?/sec
indexing/Indexing songs with default settings                             1.00      58.8±1.11s        ? ?/sec           1.02      59.9±2.09s        ? ?/sec
indexing/Indexing songs without any facets                                1.00      53.4±0.72s        ? ?/sec           1.01      54.2±0.88s        ? ?/sec
indexing/Indexing songs without faceted numbers                           1.00      57.9±1.17s        ? ?/sec           1.01      58.3±1.20s        ? ?/sec
indexing/Indexing wiki                                                    1.00   1065.2±13.26s        ? ?/sec           1.00   1065.8±12.66s        ? ?/sec
indexing/Indexing wiki in three batches                                   1.00    1182.4±6.20s        ? ?/sec           1.01    1190.8±8.48s        ? ?/sec
```

Most results are unchanged: we lose about 0.1ms on the geo deletion benchmark (I don’t get why), and on the other datasets we are between 500 and 1900 times faster when we delete documents.


572: Add reindexing benchmarks r=Kerollmops a=irevoire

With #557 coming, we should add benchmarks that measure our impact on the reindexing process.

Co-authored-by: Tamo <tamo@meilisearch.com>
  • Loading branch information
bors[bot] and irevoire committed Jul 5, 2022
3 parents 62692c1 + 250be9f + 2700d8d commit df679f4
Show file tree
Hide file tree
Showing 11 changed files with 628 additions and 284 deletions.
213 changes: 213 additions & 0 deletions benchmarks/benches/indexing.rs
Expand Up @@ -147,6 +147,58 @@ fn indexing_songs_default(c: &mut Criterion) {
});
}

/// Benchmarks indexing the `SMOL_SONGS` dataset into an index that already
/// contains that very same dataset.
///
/// The first closure handed to `iter_with_setup` prepares a populated index,
/// the second one indexes the dataset a second time and closes the index.
fn reindexing_songs_default(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("indexing");
    bench_group.sample_size(BENCHMARK_ITERATION);
    bench_group.bench_function("Reindexing songs with default settings", |bencher| {
        bencher.iter_with_setup(
            // Setup: create the index and fill it once with the songs.
            move || {
                let primary_key = "id";
                let searchable = ["title", "album", "artist"];
                let filterable =
                    ["released-timestamp", "duration-float", "genre", "country", "artist"];
                let sortable = [];

                let index =
                    setup_index_with_settings(&primary_key, &searchable, &filterable, &sortable);

                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let songs = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
                indexing.add_documents(songs).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                index
            },
            // Routine: index the same songs again on top of the populated index.
            move |populated_index| {
                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = populated_index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &populated_index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let songs = utils::documents_from(datasets_paths::SMOL_SONGS, "csv");
                indexing.add_documents(songs).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                populated_index.prepare_for_closing().wait();
            },
        )
    });
}

fn deleting_songs_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
Expand Down Expand Up @@ -378,6 +430,59 @@ fn indexing_wiki(c: &mut Criterion) {
});
}

/// Benchmarks indexing the `SMOL_WIKI_ARTICLES` dataset into an index that
/// already contains that very same dataset.
///
/// Both passes run with `autogenerate_docids` enabled. The first closure
/// handed to `iter_with_setup` prepares a populated index, the second one
/// indexes the dataset a second time and closes the index.
fn reindexing_wiki(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("indexing");
    bench_group.sample_size(BENCHMARK_ITERATION);
    bench_group.bench_function("Reindexing wiki", |bencher| {
        bencher.iter_with_setup(
            // Setup: create the index and fill it once with the articles.
            move || {
                let primary_key = "id";
                let searchable = ["title", "body"];
                let filterable = [];
                let sortable = [];

                let index =
                    setup_index_with_settings(&primary_key, &searchable, &filterable, &sortable);

                let indexer_config = IndexerConfig::default();
                let documents_config =
                    IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
                let mut txn = index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let articles = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
                indexing.add_documents(articles).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                index
            },
            // Routine: index the same articles again on top of the populated index.
            move |populated_index| {
                let indexer_config = IndexerConfig::default();
                let documents_config =
                    IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
                let mut txn = populated_index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &populated_index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let articles = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv");
                indexing.add_documents(articles).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                populated_index.prepare_for_closing().wait();
            },
        )
    });
}

fn deleting_wiki_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
Expand Down Expand Up @@ -541,6 +646,57 @@ fn indexing_movies_default(c: &mut Criterion) {
});
}

/// Benchmarks indexing the `MOVIES` dataset into an index that already
/// contains that very same dataset.
///
/// The first closure handed to `iter_with_setup` prepares a populated index,
/// the second one indexes the dataset a second time and closes the index.
fn reindexing_movies_default(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("indexing");
    bench_group.sample_size(BENCHMARK_ITERATION);
    bench_group.bench_function("Reindexing movies with default settings", |bencher| {
        bencher.iter_with_setup(
            // Setup: create the index and fill it once with the movies.
            move || {
                let primary_key = "id";
                let searchable = ["title", "overview"];
                let filterable = ["released_date", "genres"];
                let sortable = [];

                let index =
                    setup_index_with_settings(&primary_key, &searchable, &filterable, &sortable);

                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let movies = utils::documents_from(datasets_paths::MOVIES, "json");
                indexing.add_documents(movies).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                index
            },
            // Routine: index the same movies again on top of the populated index.
            move |populated_index| {
                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = populated_index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &populated_index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let movies = utils::documents_from(datasets_paths::MOVIES, "json");
                indexing.add_documents(movies).unwrap();
                indexing.execute().unwrap();
                txn.commit().unwrap();

                populated_index.prepare_for_closing().wait();
            },
        )
    });
}

fn deleting_movies_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
Expand Down Expand Up @@ -881,6 +1037,59 @@ fn indexing_geo(c: &mut Criterion) {
});
}

/// Benchmarks indexing the `SMOL_ALL_COUNTRIES` dataset into an index that
/// already contains that very same dataset.
///
/// The first closure handed to `iter_with_setup` prepares a populated index,
/// the second one indexes the dataset a second time and closes the index.
fn reindexing_geo(c: &mut Criterion) {
    let mut bench_group = c.benchmark_group("indexing");
    bench_group.sample_size(BENCHMARK_ITERATION);
    bench_group.bench_function("Reindexing geo_point", |bencher| {
        bencher.iter_with_setup(
            // Setup: create the index and fill it once with the geo documents.
            move || {
                let primary_key = "geonameid";
                let searchable = ["name", "alternatenames", "elevation"];
                let filterable = ["_geo", "population", "elevation"];
                let sortable = ["_geo", "population", "elevation"];

                let index =
                    setup_index_with_settings(&primary_key, &searchable, &filterable, &sortable);

                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let places = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
                indexing.add_documents(places).unwrap();
                indexing.execute().unwrap();

                txn.commit().unwrap();

                index
            },
            // Routine: index the same documents again on top of the populated index.
            move |populated_index| {
                let indexer_config = IndexerConfig::default();
                let documents_config = IndexDocumentsConfig::default();
                let mut txn = populated_index.write_txn().unwrap();
                let mut indexing = IndexDocuments::new(
                    &mut txn,
                    &populated_index,
                    &indexer_config,
                    documents_config,
                    |_| (),
                )
                .unwrap();

                let places = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl");
                indexing.add_documents(places).unwrap();
                indexing.execute().unwrap();

                txn.commit().unwrap();

                populated_index.prepare_for_closing().wait();
            },
        )
    });
}

fn deleting_geo_in_batches_default(c: &mut Criterion) {
let mut group = c.benchmark_group("indexing");
group.sample_size(BENCHMARK_ITERATION);
Expand Down Expand Up @@ -939,20 +1148,24 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
// Registers every benchmark function of this file in the `benches` group.
// Each dataset keeps its benchmarks together: indexing, then reindexing,
// then deletion, then the remaining indexing variants.
criterion_group!(
benches,
indexing_songs_default,
reindexing_songs_default,
deleting_songs_in_batches_default,
indexing_songs_without_faceted_numbers,
indexing_songs_without_faceted_fields,
indexing_songs_in_three_batches_default,
indexing_wiki,
reindexing_wiki,
deleting_wiki_in_batches_default,
indexing_wiki_in_three_batches,
indexing_movies_default,
reindexing_movies_default,
deleting_movies_in_batches_default,
indexing_movies_in_three_batches,
indexing_nested_movies_default,
deleting_nested_movies_in_batches_default,
indexing_nested_movies_without_faceted_fields,
indexing_geo,
reindexing_geo,
deleting_geo_in_batches_default
);
// Generates the benchmark entry point that runs the `benches` group.
criterion_main!(benches);
2 changes: 2 additions & 0 deletions milli/src/error.rs
Expand Up @@ -83,6 +83,8 @@ pub enum FieldIdMapMissingEntry {

#[derive(Error, Debug)]
pub enum UserError {
#[error("A soft deleted internal document id have been used: `{document_id}`.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("A document cannot contain more than 65,535 fields.")]
AttributeLimitReached,
#[error(transparent)]
Expand Down
11 changes: 9 additions & 2 deletions milli/src/external_documents_ids.rs
Expand Up @@ -5,26 +5,30 @@ use std::{fmt, str};

use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;

/// Sentinel stored as a map value in place of a real internal document id;
/// `ExternalDocumentsIds::get` treats an entry carrying it as absent.
const DELETED_ID: u64 = u64::MAX;

/// Mapping from external document ids (raw bytes) to internal document ids,
/// split over two `fst` maps plus a bitmap of soft deleted internal ids.
pub struct ExternalDocumentsIds<'a> {
/// Map consulted by `get` only when `soft` has no entry for the key.
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
/// Map consulted first by `get`; its entries take precedence over `hard`.
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
/// Internal document ids that `get` must report as absent even though one
/// of the maps still references them.
soft_deleted_docids: RoaringBitmap,
}

impl<'a> ExternalDocumentsIds<'a> {
/// Builds an `ExternalDocumentsIds` from its three parts.
///
/// * `hard` - map consulted when `soft` has no entry for a key.
/// * `soft` - map consulted first on lookups.
/// * `soft_deleted_docids` - internal ids that lookups must report as absent.
pub fn new(
    hard: fst::Map<Cow<'a, [u8]>>,
    soft: fst::Map<Cow<'a, [u8]>>,
    soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> {
    // Every field must be set: the struct has no default for the bitmap.
    ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}

/// Consumes `self` and returns an equivalent value whose two maps own
/// their bytes, so that it no longer borrows from `'a`.
pub fn into_static(self) -> ExternalDocumentsIds<'static> {
    let ExternalDocumentsIds { hard, soft, soft_deleted_docids } = self;

    // Turn each map's `Cow` data into its owned form.
    let hard = hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap();
    let soft = soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap();

    ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}

Expand All @@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> {
/// Returns the internal document id associated with `external_id`.
///
/// The `soft` map is consulted first and `hard` is only used as a fallback.
/// `None` is returned when neither map knows the id, when the stored value
/// is `DELETED_ID`, or when the internal id is part of the soft deleted
/// bitmap.
///
/// # Panics
///
/// Panics if the stored value is neither `DELETED_ID` nor representable as
/// a `u32`.
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
    let external_id = external_id.as_ref();
    let id = self.soft.get(external_id).or_else(|| self.hard.get(external_id))?;
    if id == DELETED_ID {
        return None;
    }

    // Convert once, with a range check, and reuse the result for the bitmap
    // lookup: a plain `as u32` cast would silently truncate an out-of-range
    // value and test an unrelated document id.
    let docid: u32 = id.try_into().expect("internal document ids must fit in a u32");
    if self.soft_deleted_docids.contains(docid) {
        None
    } else {
        Some(docid)
    }
}
Expand Down Expand Up @@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
}
}
}
Expand Down

0 comments on commit df679f4

Please sign in to comment.