Skip to content

Commit

Permalink
apacheGH-36420: [C++] Add An Enum Option For SetLookup Options (apach…
Browse files Browse the repository at this point in the history
…e#36739)

### Rationale for this change
As apache#36420 says, we want add an sql-compatible `is_in` variant, which has a different logic handling Null. After a dicussion with @ ianmcook and @ bkietz, we decide to support an enum option `null_matching_behavior` for SetLookup, which actually adds two semantics of null handling for `is_in` and doesn't add an new behavior for `index_in`.

The enum option `null_matching_behavior` will replace `skip_nulls` in the future.

### What changes are included in this PR?
Add an enum parameter `null_matching_behavior` for SetLookupOptions.

### Are these changes tested?
Two kinds of tests are implemented
- Replace default parameter with `null_matching_behavior` instead of `skip_nulls` for `is_in` and `index_in` tests 
- Add tests for `NullMatchingBehavior::EMIT_NULL` and `NullMatchingBehavior::INCONCLUSIVE` for `is_in`

Besides, since the `skip_nulls` is not deprecated now, I still preserve the old tests with `skip_nulls`. When the `skip_nulls` is totally deprecated, we can replace the test parameter `skip_nulls=false` with `null_matching_behavior=MATCH` and `skip_nulls=true` with `null_matching_behavior=SKIP` for these old tests.

### Are there any user-facing changes?
No. Currently we support backward compatibility. In the future, we plan to replace `skip_nulls` with `null_matching_behavior` completely.

* Closes: apache#36420

Lead-authored-by: Junming Chen <junming.chen.r@outlook.com>
Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
Co-authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
  • Loading branch information
3 people authored and loicalleyne committed Nov 13, 2023
1 parent 758c3c4 commit 36a3a38
Show file tree
Hide file tree
Showing 8 changed files with 942 additions and 52 deletions.
17 changes: 8 additions & 9 deletions c_glib/arrow-glib/compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3346,7 +3346,7 @@ garrow_set_lookup_options_get_property(GObject *object,
g_value_set_object(value, priv->value_set);
break;
case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS:
g_value_set_boolean(value, options->skip_nulls);
g_value_set_boolean(value, options->skip_nulls.has_value() && options->skip_nulls.value());
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
Expand Down Expand Up @@ -3398,13 +3398,11 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass)
*
* Since: 6.0.0
*/
spec = g_param_spec_boolean("skip-nulls",
"Skip NULLs",
"Whether NULLs are skipped or not",
options.skip_nulls,
static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class,
PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS,
auto skip_nulls = (options.skip_nulls.has_value() && options.skip_nulls.value());
spec =
g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not",
skip_nulls, static_cast<GParamFlags>(G_PARAM_READWRITE));
g_object_class_install_property(gobject_class, PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS,
spec);
}

Expand Down Expand Up @@ -6458,9 +6456,10 @@ garrow_set_lookup_options_new_raw(
arrow_copied_options.get());
auto value_set =
garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set));
auto skip_nulls = (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value());
auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS,
"value-set", value_set,
"skip-nulls", arrow_options->skip_nulls,
"skip-nulls", skip_nulls,
NULL);
return GARROW_SET_LOOKUP_OPTIONS(options);
}
Expand Down
52 changes: 49 additions & 3 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,29 @@ struct EnumTraits<compute::MapLookupOptions::Occurrence>
}
};

template <>
struct EnumTraits<compute::SetLookupOptions::NullMatchingBehavior>
: BasicEnumTraits<compute::SetLookupOptions::NullMatchingBehavior,
compute::SetLookupOptions::NullMatchingBehavior::MATCH,
compute::SetLookupOptions::NullMatchingBehavior::SKIP,
compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL,
compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE> {
static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; }
static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) {
switch (value) {
case compute::SetLookupOptions::NullMatchingBehavior::MATCH:
return "MATCH";
case compute::SetLookupOptions::NullMatchingBehavior::SKIP:
return "SKIP";
case compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL:
return "EMIT_NULL";
case compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE:
return "INCONCLUSIVE";
}
return "<INVALID>";
}
};

} // namespace internal

namespace compute {
Expand All @@ -286,6 +309,7 @@ using ::arrow::internal::checked_cast;

namespace internal {
namespace {
using ::arrow::internal::CoercedDataMember;
using ::arrow::internal::DataMember;
static auto kArithmeticOptionsType = GetFunctionOptionsType<ArithmeticOptions>(
DataMember("check_overflow", &ArithmeticOptions::check_overflow));
Expand Down Expand Up @@ -344,7 +368,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType<RoundToMultiple
DataMember("round_mode", &RoundToMultipleOptions::round_mode));
static auto kSetLookupOptionsType = GetFunctionOptionsType<SetLookupOptions>(
DataMember("value_set", &SetLookupOptions::value_set),
DataMember("skip_nulls", &SetLookupOptions::skip_nulls));
CoercedDataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior,
&SetLookupOptions::GetNullMatchingBehavior));
static auto kSliceOptionsType = GetFunctionOptionsType<SliceOptions>(
DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop),
DataMember("step", &SliceOptions::step));
Expand Down Expand Up @@ -540,8 +565,29 @@ constexpr char RoundToMultipleOptions::kTypeName[];
SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls)
: FunctionOptions(internal::kSetLookupOptionsType),
value_set(std::move(value_set)),
skip_nulls(skip_nulls) {}
SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {}
skip_nulls(skip_nulls) {
if (skip_nulls) {
this->null_matching_behavior = SetLookupOptions::SKIP;
} else {
this->null_matching_behavior = SetLookupOptions::MATCH;
}
}
SetLookupOptions::SetLookupOptions(
Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior)
: FunctionOptions(internal::kSetLookupOptionsType),
value_set(std::move(value_set)),
null_matching_behavior(std::move(null_matching_behavior)) {}
SetLookupOptions::SetLookupOptions()
: SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {}
SetLookupOptions::NullMatchingBehavior SetLookupOptions::GetNullMatchingBehavior() const {
if (!this->skip_nulls.has_value()) {
return this->null_matching_behavior;
} else if (this->skip_nulls.value()) {
return SetLookupOptions::SKIP;
} else {
return SetLookupOptions::MATCH;
}
}
constexpr char SetLookupOptions::kTypeName[];

SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step)
Expand Down
34 changes: 32 additions & 2 deletions cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -268,19 +268,49 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions {
/// Options for IsIn and IndexIn functions
class ARROW_EXPORT SetLookupOptions : public FunctionOptions {
public:
explicit SetLookupOptions(Datum value_set, bool skip_nulls = false);
/// How to handle null values.
enum NullMatchingBehavior {
/// MATCH, any null in `value_set` is successfully matched in
/// the input.
MATCH,
/// SKIP, any null in `value_set` is ignored and nulls in the input
/// produce null (IndexIn) or false (IsIn) values in the output.
SKIP,
/// EMIT_NULL, any null in `value_set` is ignored and nulls in the
/// input produce null (IndexIn and IsIn) values in the output.
EMIT_NULL,
/// INCONCLUSIVE, null values are regarded as unknown values, which is
/// sql-compatible. nulls in the input produce null (IndexIn and IsIn)
/// values in the output. Besides, if `value_set` contains a null,
/// non-null unmatched values in the input also produce null values
/// (IndexIn and IsIn) in the output.
INCONCLUSIVE
};

explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH);
SetLookupOptions();

// DEPRECATED(will be removed after removing of skip_nulls)
explicit SetLookupOptions(Datum value_set, bool skip_nulls);

static constexpr char const kTypeName[] = "SetLookupOptions";

/// The set of values to look up input values into.
Datum value_set;

NullMatchingBehavior null_matching_behavior;

// DEPRECATED(will be removed after removing of skip_nulls)
NullMatchingBehavior GetNullMatchingBehavior() const;

// DEPRECATED(use null_matching_behavior instead)
/// Whether nulls in `value_set` count for lookup.
///
/// If true, any null in `value_set` is ignored and nulls in the input
/// produce null (IndexIn) or false (IsIn) values in the output.
/// If false, any null in `value_set` is successfully matched in
/// the input.
bool skip_nulls;
std::optional<bool> skip_nulls;
};

/// Options for struct_field function
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/compute/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,8 +263,9 @@ TEST(Expression, ToString) {
auto in_12 = call("index_in", {field_ref("beta")},
compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")});

EXPECT_EQ(in_12.ToString(),
"index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})");
EXPECT_EQ(
in_12.ToString(),
"index_in(beta, {value_set=int32:[\n 1,\n 2\n], null_matching_behavior=MATCH})");

EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)");
EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)");
Expand Down
104 changes: 77 additions & 27 deletions cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ struct SetLookupState : public SetLookupStateBase {
explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {}

Status Init(const SetLookupOptions& options) {
this->null_matching_behavior = options.GetNullMatchingBehavior();
if (options.value_set.is_array()) {
const ArrayData& value_set = *options.value_set.array();
memo_index_to_value_index.reserve(value_set.length);
Expand All @@ -66,7 +67,8 @@ struct SetLookupState : public SetLookupStateBase {
} else {
return Status::Invalid("value_set should be an array or chunked array");
}
if (!options.skip_nulls && lookup_table->GetNull() >= 0) {
if (this->null_matching_behavior != SetLookupOptions::SKIP &&
lookup_table->GetNull() >= 0) {
null_index = memo_index_to_value_index[lookup_table->GetNull()];
}
value_set_type = options.value_set.type();
Expand Down Expand Up @@ -117,19 +119,23 @@ struct SetLookupState : public SetLookupStateBase {
// be mapped back to indices in the value_set.
std::vector<int32_t> memo_index_to_value_index;
int32_t null_index = -1;
SetLookupOptions::NullMatchingBehavior null_matching_behavior;
};

template <>
struct SetLookupState<NullType> : public SetLookupStateBase {
explicit SetLookupState(MemoryPool*) {}

Status Init(const SetLookupOptions& options) {
value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls;
Status Init(SetLookupOptions& options) {
null_matching_behavior = options.GetNullMatchingBehavior();
value_set_has_null = (options.value_set.length() > 0) &&
this->null_matching_behavior != SetLookupOptions::SKIP;
value_set_type = null();
return Status::OK();
}

bool value_set_has_null;
SetLookupOptions::NullMatchingBehavior null_matching_behavior;
};

// TODO: Put this concept somewhere reusable
Expand Down Expand Up @@ -270,14 +276,20 @@ struct IndexInVisitor {
: ctx(ctx), data(data), out(out), out_bitmap(out->buffers[0].data) {}

Status Visit(const DataType& type) {
DCHECK_EQ(type.id(), Type::NA);
DCHECK(false) << "IndexIn " << type;
return Status::NotImplemented("IndexIn has no implementation with value type ", type);
}

Status Visit(const NullType&) {
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());

if (data.length != 0) {
// skip_nulls is honored for consistency with other types
bit_util::SetBitsTo(out_bitmap, out->offset, out->length, state.value_set_has_null);
bit_util::SetBitsTo(out_bitmap, out->offset, out->length,
state.null_matching_behavior == SetLookupOptions::MATCH &&
state.value_set_has_null);

// Set all values to 0, which will be unmasked only if null is in the value_set
// and null_matching_behavior is equal to MATCH
std::memset(out->GetValues<int32_t>(1), 0x00, out->length * sizeof(int32_t));
}
return Status::OK();
Expand Down Expand Up @@ -305,7 +317,8 @@ struct IndexInVisitor {
bitmap_writer.Next();
},
[&]() {
if (state.null_index != -1) {
if (state.null_index != -1 &&
state.null_matching_behavior == SetLookupOptions::MATCH) {
bitmap_writer.Set();

// value_set included null
Expand Down Expand Up @@ -379,49 +392,86 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) {
return IndexInVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute();
}

// ----------------------------------------------------------------------

// IsIn writes the results into a preallocated boolean data bitmap
struct IsInVisitor {
KernelContext* ctx;
const ArraySpan& data;
ArraySpan* out;
uint8_t* out_boolean_bitmap;
uint8_t* out_null_bitmap;

IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out)
: ctx(ctx), data(data), out(out) {}
: ctx(ctx),
data(data),
out(out),
out_boolean_bitmap(out->buffers[1].data),
out_null_bitmap(out->buffers[0].data) {}

Status Visit(const DataType& type) {
DCHECK_EQ(type.id(), Type::NA);
DCHECK(false) << "IndexIn " << type;
return Status::NotImplemented("IsIn has no implementation with value type ", type);
}

Status Visit(const NullType&) {
const auto& state = checked_cast<const SetLookupState<NullType>&>(*ctx->state());
// skip_nulls is honored for consistency with other types
bit_util::SetBitsTo(out->buffers[1].data, out->offset, out->length,
state.value_set_has_null);

if (state.null_matching_behavior == SetLookupOptions::MATCH &&
state.value_set_has_null) {
bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, true);
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true);
} else if (state.null_matching_behavior == SetLookupOptions::SKIP ||
(!state.value_set_has_null &&
state.null_matching_behavior == SetLookupOptions::MATCH)) {
bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false);
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true);
} else {
bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false);
}
return Status::OK();
}

template <typename Type>
Status ProcessIsIn(const SetLookupState<Type>& state, const ArraySpan& input) {
using T = typename GetViewType<Type>::T;
FirstTimeBitmapWriter writer(out->buffers[1].data, out->offset, out->length);
FirstTimeBitmapWriter writer_boolean(out_boolean_bitmap, out->offset, out->length);
FirstTimeBitmapWriter writer_null(out_null_bitmap, out->offset, out->length);
bool value_set_has_null = state.null_index != -1;
VisitArraySpanInline<Type>(
input,
[&](T v) {
if (state.lookup_table->Get(v) != -1) {
writer.Set();
} else {
writer.Clear();
if (state.lookup_table->Get(v) != -1) { // true
writer_boolean.Set();
writer_null.Set();
} else if (state.null_matching_behavior == SetLookupOptions::INCONCLUSIVE &&
value_set_has_null) { // null
writer_boolean.Clear();
writer_null.Clear();
} else { // false
writer_boolean.Clear();
writer_null.Set();
}
writer.Next();
writer_boolean.Next();
writer_null.Next();
},
[&]() {
if (state.null_index != -1) {
writer.Set();
} else {
writer.Clear();
if (state.null_matching_behavior == SetLookupOptions::MATCH &&
value_set_has_null) { // true
writer_boolean.Set();
writer_null.Set();
} else if (state.null_matching_behavior == SetLookupOptions::SKIP ||
(!value_set_has_null && state.null_matching_behavior ==
SetLookupOptions::MATCH)) { // false
writer_boolean.Clear();
writer_null.Set();
} else { // null
writer_boolean.Clear();
writer_null.Clear();
}
writer.Next();
writer_boolean.Next();
writer_null.Next();
});
writer.Finish();
writer_boolean.Finish();
writer_null.Finish();
return Status::OK();
}

Expand Down Expand Up @@ -598,7 +648,7 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) {
ScalarKernel isin_base;
isin_base.init = InitSetLookup;
isin_base.exec = ExecIsIn;
isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL;
isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE;
auto is_in = std::make_shared<SetLookupFunction>("is_in", Arity::Unary(), is_in_doc);

AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get());
Expand Down

0 comments on commit 36a3a38

Please sign in to comment.