diff --git a/include/pisa/compress.hpp b/include/pisa/compress.hpp index e8c21a00..62ee243e 100644 --- a/include/pisa/compress.hpp +++ b/include/pisa/compress.hpp @@ -15,7 +15,8 @@ void compress( std::string const& output_filename, ScorerParams const& scorer_params, std::optional quantization_bits, - bool check + bool check, + bool in_memory ); } // namespace pisa diff --git a/src/compress.cpp b/src/compress.cpp index a910f8fe..ca4d090a 100644 --- a/src/compress.cpp +++ b/src/compress.cpp @@ -113,32 +113,38 @@ void compress_index( std::string const& seq_type, std::optional const& wand_data_filename, ScorerParams const& scorer_params, - std::optional quantization_bits + std::optional quantization_bits, + bool in_memory ) { std::optional> quantizing_scorer{}; - if constexpr (std::is_same_v) { - WandType wdata; - mio::mmap_source wdata_source; - if (quantization_bits.has_value()) { - ensure(wand_data_filename.has_value()) - .or_panic("Bug: Asked for quantized but no wand data"); - std::error_code error; - wdata_source.map(*wand_data_filename, error); - if (error) { - spdlog::error("error mapping file: {}, exiting...", error.message()); - std::abort(); + + // Performs the compression using an intermediate buffer. + if (!in_memory) { + if constexpr (std::is_same_v) { + WandType wdata; + mio::mmap_source wdata_source; + if (quantization_bits.has_value()) { + ensure(wand_data_filename.has_value()) + .or_panic("Bug: Asked for quantized but no wand data"); + std::error_code error; + wdata_source.map(*wand_data_filename, error); + if (error) { + spdlog::error("error mapping file: {}, exiting...", error.message()); + std::abort(); + } + mapper::map(wdata, wdata_source, mapper::map_flags::warmup); + auto scorer = scorer::from_params(scorer_params, wdata); + LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int()); + quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer); } - mapper::map(wdata, wdata_source, mapper::map_flags::warmup); - auto scorer = scorer::from_params(scorer_params, wdata); - LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int()); - quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer); + compress_index_streaming( + input, params, *output_filename, std::move(quantizing_scorer), check + ); + return; } - compress_index_streaming( - input, params, *output_filename, std::move(quantizing_scorer), check - ); - return; } + // Performs the compression in memory. spdlog::info("Processing {} documents", input.num_docs()); double tick = get_time_usecs(); @@ -217,7 +223,8 @@ void compress( std::string const& output_filename, ScorerParams const& scorer_params, std::optional quantization_bits, - bool check + bool check, + bool in_memory ) { binary_freq_collection input(input_basename.c_str()); global_parameters params; @@ -227,7 +234,7 @@ void compress( } \ else if (index_encoding == BOOST_PP_STRINGIZE(T)) { \ compress_index>( \ - input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits \ + input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits, in_memory \ ); \ /**/ BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES); diff --git a/test/test_compress.cpp b/test/test_compress.cpp index 81fdf664..9cbd1056 100644 --- a/test/test_compress.cpp +++ b/test/test_compress.cpp @@ -26,6 +26,7 @@ TEST_CASE("Compress index", "[index][compress]") { "block_simple16", "block_simdbp" ); + const bool in_memory = GENERATE(true, false); pisa::TemporaryDirectory tmp; pisa::compress( PISA_SOURCE_DIR "/test/test_data/test_collection", @@ -34,7 +35,8 @@ TEST_CASE("Compress index", "[index][compress]") { (tmp.path() / encoding).string(), ScorerParams(""), // no scorer std::nullopt, // no quantization - true // check=true + true, // check=true + in_memory ); } @@ -74,6 +76,7 @@ TEST_CASE("Compress quantized index", "[index][compress]") { "block_simdbp" ); CAPTURE(encoding); + const bool in_memory = GENERATE(true, false); pisa::compress( input, @@ -82,6 +85,7 @@ TEST_CASE("Compress quantized index", "[index][compress]") { (tmp.path() / encoding).string(), scorer_params, pisa::Size(8), - true // check=true + true, // check=true, + in_memory ); } diff --git a/tools/app.hpp b/tools/app.hpp index d241db88..a33bb44e 100644 --- a/tools/app.hpp +++ b/tools/app.hpp @@ -270,11 +270,13 @@ namespace arg { ->required(); app->add_option("-o,--output", m_output, "Output inverted index")->required(); app->add_flag("--check", m_check, "Check the correctness of the index"); + app->add_flag("--in-memory", m_in_memory, "Compress the index in memory, without using an intermediate buffer"); } [[nodiscard]] auto input_basename() const -> std::string { return m_input_basename; } [[nodiscard]] auto output() const -> std::string { return m_output; } [[nodiscard]] auto check() const -> bool { return m_check; } + [[nodiscard]] auto in_memory() const -> bool { return m_in_memory; } /// Transform paths for `shard`. void apply_shard(Shard_Id shard) { @@ -286,6 +288,7 @@ namespace arg { std::string m_input_basename{}; std::string m_output{}; bool m_check = false; + bool m_in_memory = false; }; struct CreateWandData { diff --git a/tools/compress_inverted_index.cpp b/tools/compress_inverted_index.cpp index 556abc42..fb7625fb 100644 --- a/tools/compress_inverted_index.cpp +++ b/tools/compress_inverted_index.cpp @@ -18,6 +18,7 @@ int main(int argc, char** argv) { args.output(), args.scorer_params(), args.quantization_bits(), - args.check() + args.check(), + args.in_memory() ); } diff --git a/tools/shards.cpp b/tools/shards.cpp index 0d2d05a3..bac9f69f 100644 --- a/tools/shards.cpp +++ b/tools/shards.cpp @@ -119,7 +119,8 @@ int main(int argc, char** argv) { shard_args.output(), shard_args.scorer_params(), shard_args.quantization_bits(), - shard_args.check() + shard_args.check(), + shard_args.in_memory() ); } return 0;