Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a fuzzer #57

Merged
merged 6 commits into from Jan 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
26 changes: 26 additions & 0 deletions .github/workflows/fuzzer.yml
@@ -0,0 +1,26 @@
name: Run the indexing fuzzer

on:
  push:
    branches: ['main']
  pull_request:
    branches: ['main']

jobs:
  fuzz:
    # Job name updated: this job runs the fuzzer, it does not merely set
    # up an action.
    name: Run the fuzzer
    runs-on: ubuntu-latest
    timeout-minutes: 60
    steps:
      - uses: actions/checkout@v3
      # actions-rs/toolchain is archived and unmaintained;
      # dtolnay/rust-toolchain is the recommended drop-in replacement
      # for installing a stable toolchain.
      - uses: dtolnay/rust-toolchain@stable

      # Run the fuzzer example for 5 minutes (argument is in seconds).
      - name: Run the fuzzer
        run: |
          cargo run --release --example fuzz $((60 * 5))

1 change: 1 addition & 0 deletions Cargo.toml
Expand Up @@ -25,6 +25,7 @@ thiserror = "1.0.50"

[dev-dependencies]
anyhow = "1.0.75"
arbitrary = { version = "1.3.2", features = ["derive"] }
clap = { version = "4.4.10", features = ["derive"] }
env_logger = "0.10.1"
heed = { version = "0.20.0-alpha.9", default-features = false }
Expand Down
6 changes: 3 additions & 3 deletions README.md
Expand Up @@ -31,11 +31,11 @@ Arroy was built by [@Kerollmops](https://github.com/Kerollmops) and [@irevoire](
- Multithreaded tree building using rayon
- Additional features compared to Annoy
- Filter when querying
- Incrementally update the tree without rebuilding it from scratch ([planned](https://github.com/meilisearch/arroy/issues/21))
- Store and Modify different indexes atomically using LMDB (indexes are identified by an `u16`)
- Incrementally update the tree without rebuilding it from scratch
- Store and modify different indexes atomically using LMDB (indexes are identified by an `u16`)
- Modify the items list **in place** while performing queries using LMDB
- Storage based on LMDB
- Safer to use API, i.e., Check dimensions, distances, etc
- Safer to use API, i.e., check dimensions, distances, etc
- The database size does not depend on the highest item ID but on the number of items
- Generic over your random number generator

Expand Down
121 changes: 121 additions & 0 deletions examples/fuzz.rs
@@ -0,0 +1,121 @@
use std::time::{Duration, Instant};
use std::{fmt, panic};

use arbitrary::{Arbitrary, Unstructured};
use arroy::distances::Euclidean;
use arroy::{Database, Result, Writer};
use heed::EnvOpenOptions;
use rand::rngs::StdRng;
use rand::{Fill, SeedableRng};

/// Size of the LMDB memory map: 20 GiB.
const TWENTY_GIB: usize = 20 * 1024 * 1024 * 1024;

/// Number of operations generated per batch.
const UPDATE_PER_BATCHES: usize = 50;
/// Number of batches generated per fuzzing iteration.
const NB_BATCHES: usize = 5;
/// Document ids (and thus distinct vectors) are drawn from `0..NB_DIFFERENT_VECTORS`.
const NB_DIFFERENT_VECTORS: u32 = 5;

/// A fuzzed document: an id plus its embedding vector.
#[derive(Clone)]
struct Document {
    // Drawn from `0..NB_DIFFERENT_VECTORS` by the `Arbitrary` impl below.
    id: u32,
    // 2-dimensional vector; the first coordinate mirrors the id.
    vec: Vec<f32>,
}

impl fmt::Debug for Document {
    /// Print only the document id, keeping `dbg!` dumps of whole
    /// operation batches compact.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.id)
    }
}

impl<'a> Arbitrary<'a> for Document {
    /// Draws an id in `0..NB_DIFFERENT_VECTORS` and derives the vector
    /// deterministically from it, so the fuzzer works over a small,
    /// collision-prone set of points.
    fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {
        // `int_in_range` picks a value directly from the unstructured bytes;
        // the previous `u.choose(&(0..N).collect::<Vec<_>>())` allocated a
        // Vec of every candidate id on each call just to pick one.
        let id = u.int_in_range(0..=NB_DIFFERENT_VECTORS - 1)?;
        Ok(Document { id, vec: vec![id as f32, 0.0] })
    }
}

/// A single fuzzed mutation applied to the index.
#[derive(Debug, Clone, Arbitrary)]
enum Operation {
    // Insert (or overwrite) the document's vector.
    Add(Document),
    // Delete the document by id; the vector payload is unused here.
    Delete(Document),
}

fn main() -> Result<()> {
let timer = std::env::args()
.nth(1)
.map(|s| Duration::from_secs(s.parse().expect("Expected a whole number of seconds")));

let dir = tempfile::tempdir().unwrap();
let env = EnvOpenOptions::new().map_size(TWENTY_GIB).open(dir.path())?;
let mut wtxn = env.write_txn()?;
let database: Database<Euclidean> = env.create_database(&mut wtxn, None)?;
wtxn.commit()?;

let mut rng_points = StdRng::seed_from_u64(42);
let rng_arroy = rng_points.clone();

let total_duration = Instant::now();
let mut instant = Instant::now();
let mut smol_iterations = 0;

for iteration in 0.. {
// logging progression
smol_iterations += 1;
let elapsed = instant.elapsed();
if elapsed >= Duration::from_secs(1) {
println!(
"Ran {smol_iterations} iterations in {elapsed:.1?} for a grand total of {iteration} iterations"
);
instant = Instant::now();
smol_iterations = 0;

if timer.map_or(false, |duration| duration < total_duration.elapsed()) {
return Ok(());
}
}

let mut v = [0_u8; 10_000];
v.try_fill(&mut rng_points).unwrap();

let mut data = Unstructured::new(&v);
let batches =
<[[Operation; UPDATE_PER_BATCHES]; NB_BATCHES]>::arbitrary(&mut data).unwrap();

for operations in batches {
let ops = operations.clone();
let ret = panic::catch_unwind(|| -> arroy::Result<()> {
let mut rng_arroy = rng_arroy.clone();
let mut wtxn = env.write_txn()?;
let writer = Writer::<Euclidean>::new(database, 0, 2)?;

for op in operations {
match op {
Operation::Add(doc) => writer.add_item(&mut wtxn, doc.id, &doc.vec)?,
Operation::Delete(doc) => drop(writer.del_item(&mut wtxn, doc.id)?),
}
}
writer.build(&mut wtxn, &mut rng_arroy, None)?;
wtxn.commit()?;
Ok(())
});
if let Err(e) = ret {
#[cfg(feature = "plot")]
{
use arroy::Reader;

let mut buffer = Vec::new();

let rtxn = env.read_txn()?;
let reader = Reader::<Euclidean>::open(&rtxn, 0, database)?;
reader.plot_internals_tree_nodes(&rtxn, &mut buffer)?;
std::fs::write("plot.dot", &buffer);

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, stable)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (macos-latest-xlarge, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, stable)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (ubuntu-latest, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (windows-latest, beta)

unused `Result` that must be used

Check warning on line 110 in examples/fuzz.rs

View workflow job for this annotation

GitHub Actions / test (windows-latest, stable)

unused `Result` that must be used
println!("Plotted your database to `plot.dot`");
}
dbg!(&ops);
dbg!(e);
return Ok(());
}
}
}

Ok(())
}
4 changes: 2 additions & 2 deletions src/parallel.rs
Expand Up @@ -79,8 +79,8 @@ impl<'a, DE: BytesEncode<'a>> TmpNodes<DE> {
/// Panic if the node wasn't inserted in the tmp_nodes before calling this method.
pub fn remove(&mut self, item: ItemId) -> heed::Result<()> {
self.remove_from_db(item);
// In the current algorithm, we're supposed to find the node in the two last positions.
if let Some(el) = self.ids.iter_mut().rev().take(2).find(|i| **i == item) {
// In the current algorithm, we're supposed to find the node in the last positions.
if let Some(el) = self.ids.iter_mut().rev().find(|i| **i == item) {
*el = u32::MAX;
} else {
unreachable!();
Expand Down
5 changes: 4 additions & 1 deletion src/writer.rs
Expand Up @@ -745,10 +745,13 @@ impl<D: Distance> Writer<D> {
nb_tree_nodes: u64,
nb_items: u64,
) -> Result<()> {
if roots.is_empty() {
return Ok(());
}
let nb_trees = match nb_trees {
Some(nb_trees) => nb_trees,
None => {
// 1. Estimate the number of nodes per tree
// 1. Estimate the number of nodes per tree; the division is safe because we ensured there was at least one root node above.
let nodes_per_tree = nb_tree_nodes / roots.len() as u64;
// 2. Estimate the number of trees we need so that we have AT LEAST as many tree-nodes as items
(nb_items / nodes_per_tree) as usize
Expand Down