Skip to content

Commit

Permalink
Merge pull request #62 from meilisearch/improve-the-tmp-node-deletion
Browse files Browse the repository at this point in the history
Improve the tmp node deletion
  • Loading branch information
irevoire committed Jan 16, 2024
2 parents 73d2efb + 9ecaa95 commit 929efa1
Show file tree
Hide file tree
Showing 8 changed files with 112 additions and 48 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/fuzzer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ jobs:
# Run fuzzer
- name: Run the fuzzer
run: |
cargo run --release --example fuzz $((60 * 5))
cargo run --release --features assert-reader-validity --example fuzz $((60 * 5))
7 changes: 7 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ default = []
# Enabling this feature provide a method on the reader that can plot its root node in the dot format.
plot = []

# Enabling this feature provide a method on the reader that assert its own validity.
assert-reader-validity = []

[[example]]
name = "graph"
required-features = ["plot"]

[[example]]
name = "fuzz"
required-features = ["assert-reader-validity"]
9 changes: 5 additions & 4 deletions examples/fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use std::{fmt, panic};

use arbitrary::{Arbitrary, Unstructured};
use arroy::distances::Euclidean;
use arroy::{Database, Result, Writer};
use arroy::{Database, Reader, Result, Writer};
use heed::EnvOpenOptions;
use rand::rngs::StdRng;
use rand::{Fill, SeedableRng};
Expand Down Expand Up @@ -95,19 +95,20 @@ fn main() -> Result<()> {
}
writer.build(&mut wtxn, &mut rng_arroy, None)?;
wtxn.commit()?;
let rtxn = env.read_txn()?;
let reader = Reader::<Euclidean>::open(&rtxn, 0, database)?;
reader.assert_validity(&rtxn).unwrap();
Ok(())
});
if let Err(e) = ret {
#[cfg(feature = "plot")]
{
use arroy::Reader;

let mut buffer = Vec::new();

let rtxn = env.read_txn()?;
let reader = Reader::<Euclidean>::open(&rtxn, 0, database)?;
reader.plot_internals_tree_nodes(&rtxn, &mut buffer)?;
std::fs::write("plot.dot", &buffer);
std::fs::write("plot.dot", &buffer).unwrap();
println!("Plotted your database to `plot.dot`");
}
dbg!(&ops);
Expand Down
24 changes: 6 additions & 18 deletions src/parallel.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,25 +70,13 @@ impl<'a, DE: BytesEncode<'a>> TmpNodes<DE> {
Ok(())
}

/// Mark a node to delete from the DB.
pub fn remove_from_db(&mut self, item: ItemId) {
self.deleted.insert(item);
/// Delete the tmp_nodes and the node in the database.
pub fn remove(&mut self, item: ItemId) {
let deleted = self.deleted.insert(item);
debug_assert!(deleted);
}

/// Mark a node to delete from the DB and delete it from the tmp nodes as well.
/// Panic if the node wasn't inserted in the tmp_nodes before calling this method.
pub fn remove(&mut self, item: ItemId) -> heed::Result<()> {
self.remove_from_db(item);
// In the current algorithm, we're supposed to find the node in the last positions.
if let Some(el) = self.ids.iter_mut().rev().find(|i| **i == item) {
*el = u32::MAX;
} else {
unreachable!();
}
Ok(())
}

/// Converts it into a readers to be able to read the nodes.
/// Converts it into a readers to read the nodes.
pub fn into_bytes_reader(self) -> Result<TmpNodesReader> {
let file = self.file.into_inner().map_err(|iie| iie.into_error())?;
let mmap = unsafe { Mmap::map(&file)? };
Expand Down Expand Up @@ -121,11 +109,11 @@ impl TmpNodesReader {
self.ids
.iter()
.zip(self.bounds.windows(2))
.filter(|(&id, _)| !self.deleted.contains(id))
.map(|(id, bounds)| {
let [start, end] = [bounds[0], bounds[1]];
(*id, &self.mmap[start..end])
})
.filter(|(id, _)| *id != ItemId::MAX)
}
}

Expand Down
82 changes: 82 additions & 0 deletions src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,88 @@ impl<'t, D: Distance> Reader<'t, D> {
}
}
}

/// Verify that the whole reader is correctly formed:
/// - We can access all the items.
/// - All the tree nodes are part of a tree.
/// - No tree shares the same tree node.
/// - We're effectively working with trees and not graphs (i.e., an item or tree node cannot be linked twice in the tree)
#[cfg(any(test, feature = "assert-reader-validity"))]
pub fn assert_validity(&self, rtxn: &RoTxn) -> Result<()> {
// First, get all the items
let mut item_ids = RoaringBitmap::new();
for result in self
.database
.remap_types::<PrefixCodec, DecodeIgnore>()
.prefix_iter(rtxn, &Prefix::item(self.index))?
.remap_key_type::<KeyCodec>()
{
let (i, _) = result?;
item_ids.push(i.node.unwrap_item());
}
// Second, get all the tree nodes
let mut tree_ids = RoaringBitmap::new();
for result in self
.database
.remap_types::<PrefixCodec, DecodeIgnore>()
.prefix_iter(rtxn, &Prefix::tree(self.index))?
.remap_key_type::<KeyCodec>()
{
let (i, _) = result?;
tree_ids.push(i.node.unwrap_tree());
}

// The get all the items AND tree nodes PER trees
for root in self.roots.iter() {
let (trees, items) = self.gather_items_and_tree_ids(rtxn, NodeId::tree(root))?;
// Ensure that every tree can access all items
assert_eq!(item_ids, items, "A tree cannot access to all items");
// We can remove the already explored tree nodes
assert!(tree_ids.is_superset(&trees), "A tree contains an invalid tree node. Either doesn't exist or was already used in another tree");
tree_ids -= trees;
}

assert!(tree_ids.is_empty(), "There is {tree_ids:?} tree nodes floating around");
Ok(())
}

/// Return first the number of tree nodes and second the items accessible from a node.
/// And ensure that an item or tree node is never linked twice in the tree
#[cfg(any(test, feature = "assert-reader-validity"))]
fn gather_items_and_tree_ids(
&self,
rtxn: &RoTxn,
node_id: NodeId,
) -> Result<(RoaringBitmap, RoaringBitmap)> {
match self.database.get(rtxn, &Key::new(self.index, node_id))?.unwrap() {
Node::Leaf(_) => Ok((
RoaringBitmap::new(),
RoaringBitmap::from_sorted_iter(Some(node_id.item)).unwrap(),
)),
Node::Descendants(Descendants { descendants }) => Ok((
RoaringBitmap::from_sorted_iter(Some(node_id.item)).unwrap(),
descendants.into_owned(),
)),
Node::SplitPlaneNormal(SplitPlaneNormal { normal: _, left, right }) => {
let left = self.gather_items_and_tree_ids(rtxn, left)?;
let right = self.gather_items_and_tree_ids(rtxn, right)?;

let total_trees_size = left.0.len() + right.0.len();
let total_items_size = left.1.len() + right.1.len();

let mut trees = left.0 | right.0;
let items = left.1 | right.1;

// We should never find the same tree node or item ID in a single tree.
assert_eq!(total_trees_size, trees.len());
assert_eq!(total_items_size, items.len());

trees.insert(node_id.item);

Ok((trees, items))
}
}
}
}

pub fn item_leaf<'a, D: Distance>(
Expand Down
7 changes: 6 additions & 1 deletion src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use rand::SeedableRng;
use tempfile::TempDir;

use crate::roaring::RoaringBitmapCodec;
use crate::{Database, Distance, MetadataCodec, NodeCodec, NodeMode};
use crate::{Database, Distance, MetadataCodec, NodeCodec, NodeMode, Reader};

mod reader;
mod writer;
Expand Down Expand Up @@ -36,6 +36,10 @@ impl<D: Distance> fmt::Display for DatabaseHandle<D> {
current_index = Some(key.index);

if old_index != current_index {
let reader =
Reader::<D>::open(&rtxn, current_index.unwrap(), self.database).unwrap();
reader.assert_validity(&rtxn).unwrap();

writeln!(f, "==================")?;
writeln!(f, "Dumping index {}", current_index.unwrap())?;
}
Expand Down Expand Up @@ -77,6 +81,7 @@ impl<D: Distance> fmt::Display for DatabaseHandle<D> {
}
}
}

Ok(())
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,8 +113,6 @@ Tree 5: Descendants(Descendants { descendants: [0, 1, 2, 4, 13, 16, 19, 27, 33,
Tree 6: SplitPlaneNormal(SplitPlaneNormal { left: Tree(4), right: Tree(5), normal: [0.0550, -0.0317, -0.1815, 0.1748, 0.2592, 0.1276, 0.0648, -0.3539, -0.1585, 0.0911, -0.1816, -0.0939, 0.0021, 0.0709, 0.0202, 0.0530, -0.1488, -0.2081, -0.0009, -0.2521, -0.0654, 0.0246, 0.4066, 0.3606, -0.2069, 0.1391, 0.0840, 0.1356, 0.1600, -0.3060] })
Tree 7: SplitPlaneNormal(SplitPlaneNormal { left: Tree(3), right: Tree(6), normal: [-0.0346, 0.4691, 0.1765, -0.0148, 0.0259, 0.1307, -0.1240, -0.1665, 0.0875, 0.1291, 0.2901, -0.1181, 0.1028, 0.1873, -0.1748, 0.1071, 0.3026, 0.1918, -0.2636, -0.1360, -0.0094, 0.1939, -0.2513, 0.0142, -0.2946, -0.0155, -0.0352, -0.2004, 0.1297, -0.1179] })
Tree 8: SplitPlaneNormal(SplitPlaneNormal { left: Tree(2), right: Tree(7), normal: [-0.0833, -0.0190, -0.0905, 0.1619, -0.0567, 0.2069, -0.2559, -0.1137, 0.0169, -0.0864, -0.0911, -0.1466, 0.0229, 0.1060, -0.4102, 0.1031, 0.2331, -0.1867, -0.0179, 0.4030, -0.0541, 0.1071, -0.1108, -0.3500, 0.2832, 0.1696, 0.0784, 0.1984, 0.1857, -0.1249] })
Tree 9: Descendants(Descendants { descendants: [1, 7, 10, 13, 30, 31, 39, 97] })
Tree 10: Descendants(Descendants { descendants: [0, 3, 4, 9, 11, 27, 29, 45, 50, 56, 57, 60, 64, 65, 67, 73, 84, 85, 88, 91] })
Tree 11: Descendants(Descendants { descendants: [0, 1, 3, 4, 7, 9, 10, 11, 13, 27, 29, 30, 31, 39, 45, 50, 56, 57, 60, 64, 65, 67, 73, 84, 85, 88, 91, 97] })
Tree 14: Descendants(Descendants { descendants: [25, 32, 33, 49, 81, 99] })
Tree 15: SplitPlaneNormal(SplitPlaneNormal { left: Tree(103), right: Tree(14), normal: [-0.3533, -0.1026, 0.1114, 0.1360, -0.1732, 0.1633, -0.0302, -0.1458, -0.0030, -0.2101, -0.1642, 0.0048, 0.1560, 0.1240, -0.1897, 0.0059, 0.2732, -0.0767, -0.0073, -0.0686, -0.1380, -0.2031, -0.1076, -0.3940, 0.3151, -0.1967, -0.0063, 0.3692, 0.0094, 0.1459] })
Expand All @@ -132,8 +130,6 @@ Tree 28: SplitPlaneNormal(SplitPlaneNormal { left: Tree(26), right: Tree(111), n
Tree 29: Descendants(Descendants { descendants: [7, 9, 11, 12, 31, 32, 33, 35, 39, 52, 58, 69, 74, 81, 86, 89, 92, 97, 99] })
Tree 30: SplitPlaneNormal(SplitPlaneNormal { left: Tree(28), right: Tree(29), normal: [-0.1005, -0.0667, 0.1734, 0.2457, 0.1330, 0.0145, -0.1141, -0.0260, -0.1916, -0.1144, -0.1136, -0.1094, 0.0732, 0.0961, -0.3974, -0.0322, 0.0921, -0.0225, 0.0918, 0.1908, -0.0333, -0.2674, -0.0643, -0.5829, 0.1107, -0.0245, -0.0154, 0.0397, 0.2926, 0.2113] })
Tree 31: SplitPlaneNormal(SplitPlaneNormal { left: Tree(25), right: Tree(30), normal: [0.0357, 0.2985, -0.2477, -0.0879, 0.2653, 0.2303, 0.1487, 0.0825, -0.0937, 0.1187, 0.2846, -0.2087, -0.2004, 0.2058, 0.1070, 0.1817, -0.1476, 0.0179, 0.0776, 0.1242, 0.0699, 0.0648, -0.0274, 0.2480, -0.4002, 0.2701, -0.0571, -0.2140, 0.0239, -0.1267] })
Tree 32: Descendants(Descendants { descendants: [0, 2, 4, 13, 19, 24, 25, 27, 41, 42, 56, 60, 77, 85, 91, 94] })
Tree 33: Descendants(Descendants { descendants: [1, 29, 45, 51, 57, 63, 64, 65, 93] })
Tree 34: Descendants(Descendants { descendants: [0, 1, 2, 4, 13, 19, 24, 25, 27, 29, 41, 42, 45, 51, 56, 57, 60, 63, 64, 65, 77, 85, 91, 93, 94] })
Tree 35: SplitPlaneNormal(SplitPlaneNormal { left: Tree(31), right: Tree(34), normal: [-0.1861, -0.2141, -0.1520, 0.0974, 0.0900, 0.2616, -0.0688, -0.2960, -0.0378, -0.0949, -0.3277, -0.0323, -0.0636, 0.2488, -0.4443, 0.3214, 0.0959, -0.3218, -0.0695, 0.0767, 0.0436, 0.1616, 0.0567, 0.1653, 0.1340, 0.0084, 0.0783, -0.0016, 0.1253, -0.1028] })
Tree 36: Descendants(Descendants { descendants: [8, 15, 18, 21, 36, 42, 53, 59, 71, 87, 89, 93, 98] })
Expand All @@ -154,8 +150,6 @@ Tree 51: Descendants(Descendants { descendants: [1, 14, 29, 30, 35, 38, 57, 63,
Tree 53: SplitPlaneNormal(SplitPlaneNormal { left: Tree(51), right: Tree(117), normal: [-0.2008, 0.1922, 0.3079, -0.0453, -0.3244, 0.1055, -0.0466, -0.1035, 0.1446, -0.0438, 0.1305, 0.3916, -0.0534, -0.0555, 0.1314, -0.1056, 0.2684, 0.3731, -0.1052, -0.2472, -0.1977, -0.0885, 0.0235, 0.0173, 0.0686, -0.2534, 0.1249, 0.0815, -0.1741, 0.1279] })
Tree 54: SplitPlaneNormal(SplitPlaneNormal { left: Tree(50), right: Tree(53), normal: [-0.2175, -0.1158, -0.0781, 0.0622, -0.2080, 0.0082, 0.1641, 0.0493, -0.0815, 0.2456, -0.1406, 0.2167, 0.4664, -0.1600, -0.0316, -0.3137, -0.0300, 0.0707, -0.0475, 0.1332, -0.0198, 0.0259, -0.3368, -0.2182, 0.3876, 0.0855, -0.0989, 0.1142, 0.0094, 0.1087] })
Tree 55: SplitPlaneNormal(SplitPlaneNormal { left: Tree(45), right: Tree(54), normal: [-0.0297, 0.0635, -0.0090, 0.0742, 0.0765, 0.2930, -0.2378, -0.0111, -0.1069, -0.1084, -0.2281, -0.1307, -0.1940, 0.0743, -0.2224, 0.2621, 0.4020, -0.1358, -0.1421, 0.2556, 0.1564, 0.0275, 0.0829, -0.2441, 0.1231, 0.2096, 0.0415, 0.3321, 0.2420, 0.0255] })
Tree 56: Descendants(Descendants { descendants: [1, 3, 4, 7, 27, 30, 33, 52, 56, 81, 88, 93, 99] })
Tree 57: Descendants(Descendants { descendants: [11, 14, 29, 39, 50, 57, 67, 68, 86, 94, 95, 97] })
Tree 58: Descendants(Descendants { descendants: [1, 3, 4, 7, 11, 14, 27, 29, 30, 33, 39, 50, 52, 56, 57, 67, 68, 81, 86, 88, 93, 94, 95, 97, 99] })
Tree 59: Descendants(Descendants { descendants: [5, 9, 15, 23, 31, 32, 45, 63, 64, 65, 69, 77, 83, 89, 92] })
Tree 60: SplitPlaneNormal(SplitPlaneNormal { left: Tree(58), right: Tree(59), normal: [0.1786, -0.1600, -0.2775, -0.0898, 0.1079, 0.1067, -0.0025, 0.0785, -0.0136, -0.0227, -0.0216, 0.1138, -0.0032, -0.1261, -0.1610, -0.1209, -0.4143, 0.2001, 0.4793, -0.2008, -0.0190, -0.2578, 0.2425, 0.1568, -0.0856, 0.2075, -0.2392, -0.0009, -0.1147, 0.0704] })
Expand All @@ -172,10 +166,6 @@ Tree 72: Descendants(Descendants { descendants: [21, 23, 41, 43, 46, 49, 53, 59,
Tree 73: SplitPlaneNormal(SplitPlaneNormal { left: Tree(71), right: Tree(72), normal: [-0.0215, 0.0861, 0.0260, -0.2042, -0.1901, -0.1088, 0.0154, -0.2001, 0.3676, -0.0737, 0.4196, 0.0262, -0.2664, 0.1847, 0.0711, -0.0195, 0.0464, 0.0459, 0.1556, -0.2079, -0.0035, -0.3500, 0.1718, -0.1083, -0.0923, -0.2715, 0.1083, -0.0153, -0.2891, -0.1430] })
Tree 74: SplitPlaneNormal(SplitPlaneNormal { left: Tree(68), right: Tree(73), normal: [0.1338, 0.3609, -0.1010, -0.1138, 0.0836, 0.0482, -0.0207, 0.0315, 0.1349, 0.1157, 0.2265, -0.2023, -0.2901, 0.1725, 0.2272, 0.2300, -0.0068, -0.0465, 0.0117, 0.1397, 0.0488, -0.0288, 0.1493, 0.4854, -0.2355, 0.0637, -0.0046, -0.2136, -0.2705, -0.1199] })
Tree 75: SplitPlaneNormal(SplitPlaneNormal { left: Tree(67), right: Tree(74), normal: [0.0874, -0.1312, 0.0401, 0.1433, -0.0151, -0.0312, 0.0888, 0.0681, -0.2158, -0.0702, 0.0673, 0.1867, 0.3739, -0.1279, 0.3181, -0.3485, -0.1212, 0.2294, 0.2856, -0.2235, 0.0543, -0.2036, 0.1905, -0.0273, -0.0616, 0.2454, -0.1052, -0.0761, -0.3185, -0.0837] })
Tree 76: Descendants(Descendants { descendants: [1, 19, 63, 74, 81, 91] })
Tree 77: Descendants(Descendants { descendants: [0, 4, 7, 11, 12, 17, 25, 31, 33, 48, 56, 58, 60, 66, 70, 93, 99] })
Tree 78: Descendants(Descendants { descendants: [0, 1, 4, 7, 11, 12, 17, 19, 25, 31, 33, 48, 56, 58, 60, 63, 66, 70, 74, 81, 91, 93, 99] })
Tree 79: Descendants(Descendants { descendants: [32, 35, 36, 49, 75, 96] })
Tree 80: Descendants(Descendants { descendants: [0, 1, 4, 7, 11, 12, 17, 19, 25, 31, 32, 33, 35, 36, 48, 49, 56, 58, 60, 63, 66, 70, 74, 75, 81, 91, 93, 96, 99] })
Tree 81: Descendants(Descendants { descendants: [27, 85] })
Tree 83: Descendants(Descendants { descendants: [3, 5, 6, 8, 9, 13, 15, 18, 20, 24, 40, 45, 46, 57, 61, 65, 73, 77, 79, 82, 83, 87, 89, 92, 95] })
Expand Down
19 changes: 5 additions & 14 deletions src/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,7 @@ impl<D: Distance> Writer<D> {
Ok((current_node, descendants.into_owned()))
} else if !self.fit_in_descendant(new_descendants.len()) {
// if it doesn't fit in one descendent we need to craft a new whole subtree
tmp_nodes.remove_from_db(current_node.item);
tmp_nodes.remove(current_node.item);
let new_id = self.make_tree_in_file(
frozen_reader,
rng,
Expand All @@ -529,7 +529,7 @@ impl<D: Distance> Writer<D> {

Ok((new_id, new_descendants))
} else if new_descendants.len() == 1 {
tmp_nodes.remove_from_db(current_node.item);
tmp_nodes.remove(current_node.item);
let item = new_descendants.iter().next().unwrap();
Ok((NodeId::item(item), new_descendants))
} else {
Expand Down Expand Up @@ -582,22 +582,13 @@ impl<D: Distance> Writer<D> {
if self.fit_in_descendant(total_items.len()) {
// Since we're shrinking we KNOW that new_left and new_right are descendants
// thus we can delete them directly knowing there is no sub-tree to look at.

// deleting and getting the elements available in our children
if new_left.mode == NodeMode::Tree {
if new_left != left {
tmp_nodes.remove(new_left.item)?;
} else {
tmp_nodes.remove_from_db(new_left.item);
}
tmp_nodes.remove(new_left.item);
}
if new_right.mode == NodeMode::Tree {
if new_right != right {
tmp_nodes.remove(new_right.item)?;
} else {
tmp_nodes.remove_from_db(new_right.item);
}
tmp_nodes.remove(new_right.item);
}

tmp_nodes.put(
current_node.item,
&Node::Descendants(Descendants {
Expand Down

0 comments on commit 929efa1

Please sign in to comment.