Skip to content

Commit

Permalink
ARROW-10959: [C++] Add scalar string join kernel
Browse files Browse the repository at this point in the history
@jorisvandenbossche I've implemented this kernel as a binary (arity) kernel, so the input list array *and* the separator input string array can both be an array (see python test).

I did not implement the case where the input list is a scalar, and the separator an array, since I don't think that's very common.

And note that the kernel is named `binary_join` because it takes string-like and binary-like inputs.

Closes apache#8990 from maartenbreddels/ARROW-10959

Lead-authored-by: Antoine Pitrou <antoine@python.org>
Co-authored-by: Maarten A. Breddels <maartenbreddels@gmail.com>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
2 people authored and michalursa committed Jun 13, 2021
1 parent 36c5a58 commit 6343359
Show file tree
Hide file tree
Showing 11 changed files with 587 additions and 28 deletions.
70 changes: 70 additions & 0 deletions cpp/src/arrow/array/array_binary_test.cc
Expand Up @@ -473,6 +473,70 @@ class TestStringBuilder : public TestBuilder {
CheckStringArray(*result_, strings, is_valid, reps);
}

void TestExtendCurrent() {
std::vector<std::string> strings = {"", "bbbb", "aaaaa", "", "ccc"};
std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};

int N = static_cast<int>(strings.size());
int reps = 10;

for (int j = 0; j < reps; ++j) {
for (int i = 0; i < N; ++i) {
if (!is_valid[i]) {
ASSERT_OK(builder_->AppendNull());
} else if (strings[i].length() > 3) {
ASSERT_OK(builder_->Append(strings[i].substr(0, 3)));
ASSERT_OK(builder_->ExtendCurrent(strings[i].substr(3)));
} else {
ASSERT_OK(builder_->Append(strings[i]));
}
}
}
Done();

ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(reps, result_->null_count());
ASSERT_EQ(reps * 12, result_->value_data()->size());

CheckStringArray(*result_, strings, is_valid, reps);
}

void TestExtendCurrentUnsafe() {
std::vector<std::string> strings = {"", "bbbb", "aaaaa", "", "ccc"};
std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};

int N = static_cast<int>(strings.size());
int reps = 13;
int64_t total_length = 0;
for (const auto& s : strings) {
total_length += static_cast<int64_t>(s.size());
}

ASSERT_OK(builder_->Reserve(N * reps));
ASSERT_OK(builder_->ReserveData(total_length * reps));

for (int j = 0; j < reps; ++j) {
for (int i = 0; i < N; ++i) {
if (!is_valid[i]) {
builder_->UnsafeAppendNull();
} else if (strings[i].length() > 3) {
builder_->UnsafeAppend(strings[i].substr(0, 3));
builder_->UnsafeExtendCurrent(strings[i].substr(3));
} else {
builder_->UnsafeAppend(strings[i]);
}
}
}
ASSERT_EQ(builder_->value_data_length(), total_length * reps);
Done();

ASSERT_EQ(reps * N, result_->length());
ASSERT_EQ(reps, result_->null_count());
ASSERT_EQ(reps * 12, result_->value_data()->size());

CheckStringArray(*result_, strings, is_valid, reps);
}

void TestVectorAppend() {
std::vector<std::string> strings = {"", "bb", "a", "", "ccc"};
std::vector<uint8_t> valid_bytes = {1, 1, 1, 0, 1};
Expand Down Expand Up @@ -608,6 +672,12 @@ TYPED_TEST(TestStringBuilder, TestScalarAppend) { this->TestScalarAppend(); }

TYPED_TEST(TestStringBuilder, TestScalarAppendUnsafe) { this->TestScalarAppendUnsafe(); }

TYPED_TEST(TestStringBuilder, TestExtendCurrent) { this->TestExtendCurrent(); }

TYPED_TEST(TestStringBuilder, TestExtendCurrentUnsafe) {
this->TestExtendCurrentUnsafe();
}

TYPED_TEST(TestStringBuilder, TestVectorAppend) { this->TestVectorAppend(); }

TYPED_TEST(TestStringBuilder, TestAppendCStringsWithValidBytes) {
Expand Down
33 changes: 33 additions & 0 deletions cpp/src/arrow/array/builder_binary.h
Expand Up @@ -77,6 +77,23 @@ class BaseBinaryBuilder : public ArrayBuilder {
return Append(value.data(), static_cast<offset_type>(value.size()));
}

/// Extend the last appended value by appending more data at the end
///
/// Unlike Append, this does not create a new offset.
Status ExtendCurrent(const uint8_t* value, offset_type length) {
// Safety check for UBSAN.
if (ARROW_PREDICT_TRUE(length > 0)) {
ARROW_RETURN_NOT_OK(ValidateOverflow(length));
ARROW_RETURN_NOT_OK(value_data_builder_.Append(value, length));
}
return Status::OK();
}

Status ExtendCurrent(util::string_view value) {
return ExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
static_cast<offset_type>(value.size()));
}

Status AppendNulls(int64_t length) final {
const int64_t num_bytes = value_data_builder_.length();
ARROW_RETURN_NOT_OK(Reserve(length));
Expand Down Expand Up @@ -133,12 +150,28 @@ class BaseBinaryBuilder : public ArrayBuilder {
UnsafeAppend(value.data(), static_cast<offset_type>(value.size()));
}

/// Like ExtendCurrent, but do not check capacity
void UnsafeExtendCurrent(const uint8_t* value, offset_type length) {
value_data_builder_.UnsafeAppend(value, length);
}

void UnsafeExtendCurrent(util::string_view value) {
UnsafeExtendCurrent(reinterpret_cast<const uint8_t*>(value.data()),
static_cast<offset_type>(value.size()));
}

void UnsafeAppendNull() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(false);
}

void UnsafeAppendEmptyValue() {
const int64_t num_bytes = value_data_builder_.length();
offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_bytes));
UnsafeAppendToBitmap(true);
}

/// \brief Append a sequence of strings in one shot.
///
/// \param[in] values a vector of strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/arrow/compute/function.cc
Expand Up @@ -210,8 +210,9 @@ Status Function::Validate() const {
if (arity_.is_varargs && arg_count == arity_.num_args + 1) {
return Status::OK();
}
return Status::Invalid("In function '", name_,
"': ", "number of argument names != function arity");
return Status::Invalid(
"In function '", name_,
"': ", "number of argument names for function documentation != function arity");
}
return Status::OK();
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/compute/kernels/codegen_internal.h
Expand Up @@ -1193,15 +1193,15 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
}

// similar to GenerateTypeAgnosticPrimitive, but for variable types
template <template <typename...> class Generator>
template <template <typename...> class Generator, typename... Args>
ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
switch (get_id.id) {
case Type::BINARY:
case Type::STRING:
return Generator<BinaryType>::Exec;
return Generator<BinaryType, Args...>::Exec;
case Type::LARGE_BINARY:
case Type::LARGE_STRING:
return Generator<LargeBinaryType>::Exec;
return Generator<LargeBinaryType, Args...>::Exec;
default:
DCHECK(false);
return ExecFail;
Expand Down

0 comments on commit 6343359

Please sign in to comment.