Skip to content

Commit

Permalink
apacheGH-35749: [C++] Handle run-end encoded filters in compute kerne…
Browse files Browse the repository at this point in the history
…ls (apache#35750)

### Rationale for this change

Boolean arrays (bitmaps) used to represent filters in Arrow take 1 bit per boolean value. If the filter contains long runs, it can be run-end encoded to save even more memory.

Using POPCNT, a bitmap can be scanned efficiently for runs of fewer than 64 logical values, but a run-end encoded array gives the length of each run directly, and a single run can go beyond the word size.

These two observations make the case that, for the right dataset, REE filters can be more efficiently processed in compute kernels.

### What changes are included in this PR?

 - [x] `GetFilterOutputSize` can count the number of emits from a REE filter
 - [x] `GetTakeIndices` can produce an array of logical indices from a REE filter
 - [x] `"array_filter"` can handle REE filters

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* Closes: apache#35749

Lead-authored-by: Felipe Oliveira Carvalho <felipekde@gmail.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
felipecrv and pitrou committed Jun 15, 2023
1 parent 95972cc commit 475b5b9
Show file tree
Hide file tree
Showing 10 changed files with 802 additions and 286 deletions.
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/api_vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,18 @@ namespace internal {
// These internal functions are implemented in kernels/vector_selection.cc

/// \brief Return the number of selected indices in the boolean filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
/// \return the number of indices selected by the filter, with nulls in the
/// filter handled according to \p null_selection
ARROW_EXPORT
int64_t GetFilterOutputSize(const ArraySpan& filter,
FilterOptions::NullSelectionBehavior null_selection);

/// \brief Compute uint64 selection indices for use with Take given a boolean
/// filter
///
/// \param filter a plain or run-end encoded boolean array with or without nulls
/// \param null_selection how to handle nulls in the filter
ARROW_EXPORT
Result<std::shared_ptr<ArrayData>> GetTakeIndices(
const ArraySpan& filter, FilterOptions::NullSelectionBehavior null_selection,
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/compute/kernel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,10 @@ std::shared_ptr<TypeMatcher> RunEndEncoded(
std::move(value_type_matcher));
}

// Convenience overload: wraps the given type id in a SameTypeId matcher and
// delegates to the overload that takes a matcher for the values field.
std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id) {
  auto value_type_matcher = SameTypeId(value_type_id);
  return RunEndEncoded(std::move(value_type_matcher));
}

std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> run_end_type_matcher,
std::shared_ptr<TypeMatcher> value_type_matcher) {
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/compute/kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,12 @@ ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndInteger();
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(
std::shared_ptr<TypeMatcher> value_type_matcher);

/// \brief Match run-end encoded types that use any valid run-end type and
/// encode specific value types
///
/// Convenience overload: the type id is wrapped in a SameTypeId matcher and
/// forwarded to the overload taking a matcher for the values field.
///
/// @param[in] value_type_id a type id that the type of the values field should match
ARROW_EXPORT std::shared_ptr<TypeMatcher> RunEndEncoded(Type::type value_type_id);

/// \brief Match run-end encoded types that encode specific run-end and value types
///
/// @param[in] run_end_type_matcher a matcher that is applied to the run_ends field
Expand Down
25 changes: 25 additions & 0 deletions cpp/src/arrow/compute/kernels/ree_util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,18 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the input values at positions i and j are equal.
///
/// Pre-conditions guaranteed by the callers:
/// - i and j are valid indices into the values buffer
/// - the values in i and j are valid
bool CompareValuesAt(int64_t i, int64_t j) const {
  if constexpr (!std::is_same_v<ArrowType, BooleanType>) {
    // Non-boolean values are read as ValueRepr elements of the values buffer.
    const auto* typed_values = reinterpret_cast<const ValueRepr*>(input_values_);
    return typed_values[i] == typed_values[j];
  } else {
    // Boolean values are bit-packed, so compare the individual bits.
    const bool lhs = bit_util::GetBit(input_values_, i);
    const bool rhs = bit_util::GetBit(input_values_, j);
    return lhs == rhs;
  }
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down Expand Up @@ -166,6 +178,11 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the fixed-width values at positions i and j are
/// byte-for-byte equal.
bool CompareValuesAt(int64_t i, int64_t j) const {
  // Each value occupies byte_width_ consecutive bytes in the values buffer.
  const auto* lhs = input_values_ + (i * byte_width_);
  const auto* rhs = input_values_ + (j * byte_width_);
  return memcmp(lhs, rhs, byte_width_) == 0;
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down Expand Up @@ -253,6 +270,14 @@ class ReadWriteValue<ArrowType, in_has_validity_buffer, out_has_validity_buffer,
return valid;
}

/// \brief Check whether the variable-length values at positions i and j are
/// equal (same length and same bytes).
///
/// \param i index of the first value; offsets i and i + 1 are read
/// \param j index of the second value; offsets j and j + 1 are read
/// \return true if both values have the same length and identical contents
bool CompareValuesAt(int64_t i, int64_t j) const {
  const offset_type len_i = input_offsets_[i + 1] - input_offsets_[i];
  const offset_type len_j = input_offsets_[j + 1] - input_offsets_[j];
  // memcmp() returns 0 when the byte ranges are identical, so the result must
  // be compared against 0; using it directly as a boolean inverts the answer.
  return len_i == len_j &&
         memcmp(input_values_ + input_offsets_[i], input_values_ + input_offsets_[j],
                static_cast<size_t>(len_i)) == 0;
}

/// \brief Ensure padding is zeroed in validity bitmap.
void ZeroValidityPadding(int64_t length) const {
DCHECK(output_values_);
Expand Down
7 changes: 3 additions & 4 deletions cpp/src/arrow/compute/kernels/vector_selection.cc
Original file line number Diff line number Diff line change
Expand Up @@ -332,8 +332,8 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
VectorKernel filter_base;
filter_base.init = FilterState::Init;
RegisterSelectionFunction("array_filter", array_filter_doc, filter_base,
/*selection_type=*/boolean(), filter_kernels,
GetDefaultFilterOptions(), registry);
std::move(filter_kernels), GetDefaultFilterOptions(),
registry);

DCHECK_OK(registry->AddFunction(MakeFilterMetaFunction()));

Expand All @@ -345,8 +345,7 @@ void RegisterVectorSelection(FunctionRegistry* registry) {
take_base.init = TakeState::Init;
take_base.can_execute_chunkwise = false;
RegisterSelectionFunction("array_take", array_take_doc, take_base,
/*selection_type=*/match::Integer(), take_kernels,
GetDefaultTakeOptions(), registry);
std::move(take_kernels), GetDefaultTakeOptions(), registry);

DCHECK_OK(registry->AddFunction(MakeTakeMetaFunction()));

Expand Down
Loading

0 comments on commit 475b5b9

Please sign in to comment.