Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a fuzzer #57

Merged
merged 6 commits into from Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/fuzzer.yml
@@ -0,0 +1,26 @@
name: Run the indexing fuzzer

on:
  push:
    branches: ['main']
  pull_request:
    branches: ['main']

jobs:
  fuzz:
    # Job name updated: this job runs the fuzzer, it does not merely set
    # up an action.
    name: Run the fuzzer
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v3
      # actions-rs/toolchain is archived and unmaintained;
      # dtolnay/rust-toolchain is the recommended drop-in replacement
      # for installing a stable toolchain.
      - uses: dtolnay/rust-toolchain@stable

      # Run the fuzzer example for 5 minutes (argument is in seconds).
      - name: Run the fuzzer
        run: |
          cargo run --release --example fuzz $((60 * 5))

1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -25,6 +25,7 @@ thiserror = "1.0.50"

[dev-dependencies]
anyhow = "1.0.75"
arbitrary = { version = "1.3.2", features = ["derive"] }
clap = { version = "4.4.10", features = ["derive"] }
env_logger = "0.10.1"
heed = { version = "0.20.0-alpha.9", default-features = false }
Expand Down
6 changes: 3 additions & 3 deletions README.md
Expand Up @@ -31,11 +31,11 @@ Arroy was built by [@Kerollmops](https://github.com/Kerollmops) and [@irevoire](
- Multithreaded tree building using rayon
- Additional features compared to Annoy
- Filter when querying
- Incrementally update the tree without rebuilding it from scratch ([planned](https://github.com/meilisearch/arroy/issues/21))
- Store and Modify different indexes atomically using LMDB (indexes are identified by an `u16`)
- Incrementally update the tree without rebuilding it from scratch
- Store and modify different indexes atomically using LMDB (indexes are identified by an `u16`)
- Modify the items list **in place** while performing queries using LMDB
- Storage based on LMDB
- Safer to use API, i.e., Check dimensions, distances, etc
- Safer to use API, i.e., check dimensions, distances, etc
- The database size does not depend on the highest item ID but on the number of items
- Generic over your random number generator

Expand Down
121 changes: 121 additions & 0 deletions examples/fuzz.rs
@@ -0,0 +1,121 @@
use std::time::{Duration, Instant};
use std::{fmt, panic};

use arbitrary::{Arbitrary, Unstructured};
use arroy::distances::Euclidean;
use arroy::{Database, Result, Writer};
use heed::EnvOpenOptions;
use rand::rngs::StdRng;
use rand::{Fill, SeedableRng};

/// Size of the LMDB memory map: 20 GiB.
const TWENTY_GIB: usize = 20 * 1024 * 1024 * 1024;

/// Number of operations generated per batch.
const UPDATE_PER_BATCHES: usize = 50;
/// Number of batches generated per fuzzing iteration.
const NB_BATCHES: usize = 5;
/// Document ids (and thus distinct vectors) are drawn from `0..NB_DIFFERENT_VECTORS`.
const NB_DIFFERENT_VECTORS: u32 = 5;

/// A fuzzed document: an id plus its embedding vector.
#[derive(Clone)]
struct Document {
    // Drawn from `0..NB_DIFFERENT_VECTORS` by the `Arbitrary` impl below.
    id: u32,
    // 2-dimensional vector; the first coordinate mirrors the id.
    vec: Vec<f32>,
}

impl fmt::Debug for Document {
    /// Print only the document id, keeping `dbg!` dumps of whole
    /// operation batches compact.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.id)
    }
}

impl<'a> Arbitrary<'a> for Document {
    /// Draws an id in `0..NB_DIFFERENT_VECTORS` and derives the vector
    /// deterministically from it, so the fuzzer works over a small,
    /// collision-prone set of points.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // `int_in_range` picks a value directly from the unstructured bytes;
        // the previous `u.choose(&(0..N).collect::<Vec<_>>())` allocated a
        // Vec of every candidate id on each call just to pick one.
        let id = u.int_in_range(0..=NB_DIFFERENT_VECTORS - 1)?;
        Ok(Document { id, vec: vec![id as f32, 0.0] })
    }
}

/// A single fuzzed mutation applied to the index.
#[derive(Debug, Clone, Arbitrary)]
enum Operation {
    // Insert (or overwrite) the document's vector.
    Add(Document),
    // Delete the document by id; the vector payload is unused here.
    Delete(Document),
}

fn main() -> Result<()> {
let timer = std::env::args()
.nth(1)
.map(|s| Duration::from_secs(s.parse().expect("Expected a whole number of seconds")));

let dir = tempfile::tempdir().unwrap();
let env = EnvOpenOptions::new().map_size(TWENTY_GIB).open(dir.path())?;
let mut wtxn = env.write_txn()?;
let database: Database<Euclidean> = env.create_database(&mut wtxn, None)?;
wtxn.commit()?;

let mut rng_points = StdRng::seed_from_u64(42);
let rng_arroy = rng_points.clone();

let total_duration = Instant::now();
let mut instant = Instant::now();
let mut smol_iterations = 0;

for iteration in 0.. {
// logging progression
smol_iterations += 1;
let elapsed = instant.elapsed();
if elapsed >= Duration::from_secs(1) {
println!(
"Ran {smol_iterations} iterations in {elapsed:.1?} for a grand total of {iteration} iterations"
);
instant = Instant::now();
smol_iterations = 0;

if timer.map_or(false, |duration| duration < total_duration.elapsed()) {
return Ok(());
}
}

let mut v = [0_u8; 10_000];
v.try_fill(&mut rng_points).unwrap();

let mut data = Unstructured::new(&v);
let batches =
<[[Operation; UPDATE_PER_BATCHES]; NB_BATCHES]>::arbitrary(&mut data).unwrap();

for operations in batches {
let ops = operations.clone();
let ret = panic::catch_unwind(|| -> arroy::Result<()> {
let mut rng_arroy = rng_arroy.clone();
let mut wtxn = env.write_txn()?;
let writer = Writer::<Euclidean>::new(database, 0, 2)?;

for op in operations {
match op {
Operation::Add(doc) => writer.add_item(&mut wtxn, doc.id, &doc.vec)?,
Operation::Delete(doc) => drop(writer.del_item(&mut wtxn, doc.id)?),
}
}
writer.build(&mut wtxn, &mut rng_arroy, None)?;
wtxn.commit()?;
Ok(())
});
if let Err(e) = ret {
#[cfg(feature = "plot")]
{
use arroy::Reader;

let mut buffer = Vec::new();

let rtxn = env.read_txn()?;
let reader = Reader::<Euclidean>::open(&rtxn, 0, database)?;
reader.plot_internals_tree_nodes(&rtxn, &mut buffer)?;
std::fs::write("plot.dot", &buffer);

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, stable)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, stable)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (windows-latest, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (windows-latest, stable)

unused `Result` that must be used
println!("Plotted your database to `plot.dot`");
}
dbg!(&ops);
dbg!(e);
return Ok(());
}
}
}

Ok(())
}
4 changes: 2 additions & 2 deletions src/parallel.rs
Expand Up @@ -79,8 +79,8 @@ impl<'a, DE: BytesEncode<'a>> TmpNodes<DE> {
/// Panic if the node wasn't inserted in the tmp_nodes before calling this method.
pub fn remove(&mut self, item: ItemId) -> heed::Result<()> {
self.remove_from_db(item);
// In the current algorithm, we're supposed to find the node in the two last positions.
if let Some(el) = self.ids.iter_mut().rev().take(2).find(|i| **i == item) {
// In the current algorithm, we're supposed to find the node in the last positions.
if let Some(el) = self.ids.iter_mut().rev().find(|i| **i == item) {
*el = u32::MAX;
} else {
unreachable!();
Expand Down
5 changes: 4 additions & 1 deletion src/writer.rs
Expand Up @@ -745,10 +745,13 @@ impl<D: Distance> Writer<D> {
nb_tree_nodes: u64,
nb_items: u64,
) -> Result<()> {
if roots.is_empty() {
return Ok(());
}
let nb_trees = match nb_trees {
Some(nb_trees) => nb_trees,
None => {
// 1. Estimate the number of nodes per tree
// 1. Estimate the number of nodes per tree; the division is safe because we ensured there was at least one root node above.
let nodes_per_tree = nb_tree_nodes / roots.len() as u64;
// 2. Estimate the number of trees we need so that we have AT LEAST as many tree-nodes as items
(nb_items / nodes_per_tree) as usize
Expand Down