Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option that restores the ability to perform index compression in memory #580

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/pisa/compress.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ void compress(
std::string const& output_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits,
bool check
bool check,
bool in_memory
);

} // namespace pisa
51 changes: 29 additions & 22 deletions src/compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,32 +113,38 @@ void compress_index(
std::string const& seq_type,
std::optional<std::string> const& wand_data_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits
std::optional<Size> quantization_bits,
bool in_memory
) {
std::optional<QuantizingScorer<WandType>> quantizing_scorer{};
if constexpr (std::is_same_v<typename CollectionType::index_layout_tag, BlockIndexTag>) {
WandType wdata;
mio::mmap_source wdata_source;
if (quantization_bits.has_value()) {
ensure(wand_data_filename.has_value())
.or_panic("Bug: Asked for quantized but no wand data");
std::error_code error;
wdata_source.map(*wand_data_filename, error);
if (error) {
spdlog::error("error mapping file: {}, exiting...", error.message());
std::abort();

// Performs the compression using an intermediate buffer.
if (!in_memory) {
if constexpr (std::is_same_v<typename CollectionType::index_layout_tag, BlockIndexTag>) {
WandType wdata;
mio::mmap_source wdata_source;
if (quantization_bits.has_value()) {
ensure(wand_data_filename.has_value())
.or_panic("Bug: Asked for quantized but no wand data");
std::error_code error;
wdata_source.map(*wand_data_filename, error);
if (error) {
spdlog::error("error mapping file: {}, exiting...", error.message());
std::abort();
}
mapper::map(wdata, wdata_source, mapper::map_flags::warmup);
auto scorer = scorer::from_params(scorer_params, wdata);
LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int());
quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer);
}
mapper::map(wdata, wdata_source, mapper::map_flags::warmup);
auto scorer = scorer::from_params(scorer_params, wdata);
LinearQuantizer quantizer(wdata.index_max_term_weight(), quantization_bits->as_int());
quantizing_scorer = QuantizingScorer(std::move(scorer), quantizer);
compress_index_streaming<CollectionType, WandType>(
input, params, *output_filename, std::move(quantizing_scorer), check
);
return;
}
compress_index_streaming<CollectionType, WandType>(
input, params, *output_filename, std::move(quantizing_scorer), check
);
return;
}

// Performs the compression in memory.
spdlog::info("Processing {} documents", input.num_docs());
double tick = get_time_usecs();

Expand Down Expand Up @@ -217,7 +223,8 @@ void compress(
std::string const& output_filename,
ScorerParams const& scorer_params,
std::optional<Size> quantization_bits,
bool check
bool check,
bool in_memory
) {
binary_freq_collection input(input_basename.c_str());
global_parameters params;
Expand All @@ -227,7 +234,7 @@ void compress(
} \
else if (index_encoding == BOOST_PP_STRINGIZE(T)) { \
compress_index<pisa::BOOST_PP_CAT(T, _index), wand_data<wand_data_raw>>( \
input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits \
input, params, output_filename, check, index_encoding, wand_data_filename, scorer_params, quantization_bits, in_memory \
); \
/**/
BOOST_PP_SEQ_FOR_EACH(LOOP_BODY, _, PISA_INDEX_TYPES);
Expand Down
6 changes: 4 additions & 2 deletions test/test_compress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ TEST_CASE("Compress index", "[index][compress]") {
(tmp.path() / encoding).string(),
ScorerParams(""), // no scorer
std::nullopt, // no quantization
true // check=true
true, // check=true
GENERATE(true, false) // in-memory=(true, false)
elshize marked this conversation as resolved.
Show resolved Hide resolved
);
}

Expand Down Expand Up @@ -82,6 +83,7 @@ TEST_CASE("Compress quantized index", "[index][compress]") {
(tmp.path() / encoding).string(),
scorer_params,
pisa::Size(8),
true // check=true
true, // check=true
GENERATE(true, false) // in-memory=(true, false)
);
}
3 changes: 3 additions & 0 deletions tools/app.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,11 +270,13 @@ namespace arg {
->required();
app->add_option("-o,--output", m_output, "Output inverted index")->required();
app->add_flag("--check", m_check, "Check the correctness of the index");
app->add_flag("--in-memory", m_in_memory, "Compress the index in memory, without using an intermediate buffer");
}

[[nodiscard]] auto input_basename() const -> std::string { return m_input_basename; }
[[nodiscard]] auto output() const -> std::string { return m_output; }
[[nodiscard]] auto check() const -> bool { return m_check; }
[[nodiscard]] auto in_memory() const -> bool { return m_in_memory; }

/// Transform paths for `shard`.
void apply_shard(Shard_Id shard) {
Expand All @@ -286,6 +288,7 @@ namespace arg {
std::string m_input_basename{};
std::string m_output{};
bool m_check = false;
bool m_in_memory = false;
};

struct CreateWandData {
Expand Down
3 changes: 2 additions & 1 deletion tools/compress_inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ int main(int argc, char** argv) {
args.output(),
args.scorer_params(),
args.quantization_bits(),
args.check()
args.check(),
args.in_memory()
);
}
3 changes: 2 additions & 1 deletion tools/shards.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,8 @@ int main(int argc, char** argv) {
shard_args.output(),
shard_args.scorer_params(),
shard_args.quantization_bits(),
shard_args.check()
shard_args.check(),
shard_args.in_memory()
);
}
return 0;
Expand Down
Loading