Skip to content

Commit

Permalink
apacheGH-37710: [C++][Integration] Add C++ Utf8View implementation (a…
Browse files Browse the repository at this point in the history
…pache#37792)

### Rationale for this change

After the PR changing the spec and schema ( apache#37526 ) is accepted, this PR will be undrafted. It adds the minimal addition of a C++ implementation and was extracted from the original C++ Utf8View pr ( apache#35628 ) for ease of review.

### What changes are included in this PR?

- The new types are available with new subclasses of DataType, Array, ArrayBuilder, ...
- The values of string view arrays can be visited as `std::string_view` as with StringArray
- String view arrays can be round tripped through IPC and integration JSON

* Closes: apache#37710

Relevant mailing list discussions: 
* https://lists.apache.org/thread/l8t1vj5x1wdf75mdw3wfjvnxrfy5xomy
* https://lists.apache.org/thread/3qhkomvvc69v3gkotbwldyko7yk9cs9k

Authored-by: Benjamin Kietzman <bengilgit@gmail.com>
Signed-off-by: Benjamin Kietzman <bengilgit@gmail.com>
  • Loading branch information
bkietz authored and loicalleyne committed Nov 13, 2023
1 parent 476b1df commit 0ccedb5
Show file tree
Hide file tree
Showing 69 changed files with 1,951 additions and 375 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/array/array_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ struct ScalarFromArraySlotImpl {
return Finish(a.GetString(index_));
}

Status Visit(const BinaryViewArray& a) { return Finish(a.GetString(index_)); }

Status Visit(const FixedSizeBinaryArray& a) { return Finish(a.GetString(index_)); }

Status Visit(const DayTimeIntervalArray& a) { return Finish(a.Value(index_)); }
Expand Down
28 changes: 28 additions & 0 deletions cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "arrow/array/validate.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/binary_view_util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"

Expand Down Expand Up @@ -89,6 +90,33 @@ LargeStringArray::LargeStringArray(int64_t length,

Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

BinaryViewArray::BinaryViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
SetData(std::move(data));
}

BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
SetData(
ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset));
}

std::string_view BinaryViewArray::GetView(int64_t i) const {
const std::shared_ptr<Buffer>* data_buffers = data_->buffers.data() + 2;
return util::FromBinaryView(raw_values_[i], data_buffers);
}

StringViewArray::StringViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
SetData(std::move(data));
}

Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
}
Expand Down
60 changes: 60 additions & 0 deletions cpp/src/arrow/array/array_binary.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
Expand Down Expand Up @@ -217,6 +218,65 @@ class ARROW_EXPORT LargeStringArray : public LargeBinaryArray {
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// BinaryView and StringView

/// Concrete Array class for variable-size binary view data using the
/// BinaryViewType::c_type struct to reference in-line or out-of-line string values
class ARROW_EXPORT BinaryViewArray : public FlatArray {
public:
using TypeClass = BinaryViewType;
using IteratorType = stl::ArrayIterator<BinaryViewArray>;
using c_type = BinaryViewType::c_type;

explicit BinaryViewArray(std::shared_ptr<ArrayData> data);

BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector data_buffers,
std::shared_ptr<Buffer> null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

// For API compatibility with BinaryArray etc.
std::string_view GetView(int64_t i) const;
std::string GetString(int64_t i) const { return std::string{GetView(i)}; }

const auto& values() const { return data_->buffers[1]; }
const c_type* raw_values() const { return raw_values_; }

std::optional<std::string_view> operator[](int64_t i) const {
return *IteratorType(*this, i);
}

IteratorType begin() const { return IteratorType(*this); }
IteratorType end() const { return IteratorType(*this, length()); }

protected:
using FlatArray::FlatArray;

void SetData(std::shared_ptr<ArrayData> data) {
FlatArray::SetData(std::move(data));
raw_values_ = data_->GetValuesSafe<c_type>(1);
}

const c_type* raw_values_;
};

/// Concrete Array class for variable-size string view (utf-8) data using
/// BinaryViewType::c_type to reference in-line or out-of-line string values
class ARROW_EXPORT StringViewArray : public BinaryViewArray {
public:
using TypeClass = StringViewType;

explicit StringViewArray(std::shared_ptr<ArrayData> data);

using BinaryViewArray::BinaryViewArray;

/// \brief Validate that this array contains only valid UTF8 entries
///
/// This check is also implied by ValidateFull()
Status ValidateUTF8() const;
};

// ----------------------------------------------------------------------
// Fixed width binary

Expand Down
138 changes: 121 additions & 17 deletions cpp/src/arrow/array/array_binary_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,21 @@

#include "arrow/array.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/validate.h"
#include "arrow/buffer.h"
#include "arrow/memory_pool.h"
#include "arrow/status.h"
#include "arrow/testing/builder.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/matchers.h"
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_builders.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/key_value_metadata.h"
#include "arrow/util/logging.h"
#include "arrow/visit_data_inline.h"

namespace arrow {
Expand Down Expand Up @@ -365,38 +369,134 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { this->TestValidateOffsets();

TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }

// Produce an Array of index/offset views from a std::vector of index/offset
// BinaryViewType::c_type
Result<std::shared_ptr<StringViewArray>> MakeBinaryViewArray(
BufferVector data_buffers, const std::vector<BinaryViewType::c_type>& views,
bool validate = true) {
auto length = static_cast<int64_t>(views.size());
auto arr = std::make_shared<StringViewArray>(
utf8_view(), length, Buffer::FromVector(views), std::move(data_buffers));
if (validate) {
RETURN_NOT_OK(arr->ValidateFull());
}
return arr;
}

TEST(StringViewArray, Validate) {
// Since this is a test of validation, we need to be able to construct invalid arrays.
auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)");
auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy");

// empty array is valid
EXPECT_THAT(MakeBinaryViewArray({}, {}), Ok());

// empty array with some data buffers is valid
EXPECT_THAT(MakeBinaryViewArray({buffer_s, buffer_y}, {}), Ok());

// inline views need not have a corresponding buffer
EXPECT_THAT(MakeBinaryViewArray({},
{
util::ToInlineBinaryView("hello"),
util::ToInlineBinaryView("world"),
util::ToInlineBinaryView("inline me"),
}),
Ok());

// non-inline views are expected to reference only buffers managed by the array
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Ok());

// views may not reference data buffers not present in the array
EXPECT_THAT(
MakeBinaryViewArray(
{}, {util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0)}),
Raises(StatusCode::IndexError));
// ... or ranges which overflow the referenced data buffer
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s}, {util::ToBinaryView(
"supe", static_cast<int32_t>(buffer_s->size() + 50), 0, 0)}),
Raises(StatusCode::IndexError));

// Additionally, the prefixes of non-inline views must match the data buffer
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s, buffer_y},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", static_cast<int32_t>(buffer_y->size()), 1, 0)}),
Raises(StatusCode::Invalid));

// Invalid string views which are masked by a null bit do not cause validation to fail
auto invalid_but_masked =
MakeBinaryViewArray(
{buffer_s},
{util::ToBinaryView("SUPE", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("yyyy", 50, 40, 30)},
/*validate=*/false)
.ValueOrDie()
->data();
invalid_but_masked->null_count = 2;
invalid_but_masked->buffers[0] = *AllocateEmptyBitmap(2);
EXPECT_THAT(internal::ValidateArrayFull(*invalid_but_masked), Ok());

// overlapping views are allowed
EXPECT_THAT(
MakeBinaryViewArray(
{buffer_s},
{
util::ToBinaryView("supe", static_cast<int32_t>(buffer_s->size()), 0, 0),
util::ToBinaryView("uper", static_cast<int32_t>(buffer_s->size() - 1), 0,
1),
util::ToBinaryView("perc", static_cast<int32_t>(buffer_s->size() - 2), 0,
2),
util::ToBinaryView("erca", static_cast<int32_t>(buffer_s->size() - 3), 0,
3),
}),
Ok());
}

template <typename T>
class TestUTF8Array : public ::testing::Test {
public:
using TypeClass = T;
using offset_type = typename TypeClass::offset_type;
using ArrayType = typename TypeTraits<TypeClass>::ArrayType;

Status ValidateUTF8(int64_t length, std::vector<offset_type> offsets,
std::string_view data, int64_t offset = 0) {
ArrayType arr(length, Buffer::Wrap(offsets), std::make_shared<Buffer>(data),
/*null_bitmap=*/nullptr, /*null_count=*/0, offset);
return arr.ValidateUTF8();
std::shared_ptr<DataType> type() const {
if constexpr (is_binary_view_like_type<TypeClass>::value) {
return TypeClass::is_utf8 ? utf8_view() : binary_view();
} else {
return TypeTraits<TypeClass>::type_singleton();
}
}

Status ValidateUTF8(const std::string& json) {
auto ty = TypeTraits<T>::type_singleton();
auto arr = ArrayFromJSON(ty, json);
return checked_cast<const ArrayType&>(*arr).ValidateUTF8();
Status ValidateUTF8(const Array& arr) {
return checked_cast<const ArrayType&>(arr).ValidateUTF8();
}

Status ValidateUTF8(std::vector<std::string> values) {
std::shared_ptr<Array> arr;
ArrayFromVector<T, std::string>(type(), values, &arr);
return ValidateUTF8(*arr);
}

void TestValidateUTF8() {
ASSERT_OK(ValidateUTF8(R"(["Voix", "ambiguë", "d’un", "cœur"])"));
ASSERT_OK(ValidateUTF8(1, {0, 4}, "\xf4\x8f\xbf\xbf")); // \U0010ffff
ASSERT_OK(
ValidateUTF8(*ArrayFromJSON(type(), R"(["Voix", "ambiguë", "d’un", "cœur"])")));
ASSERT_OK(ValidateUTF8({"\xf4\x8f\xbf\xbf"})); // \U0010ffff

ASSERT_RAISES(Invalid, ValidateUTF8(1, {0, 1}, "\xf4"));
ASSERT_RAISES(Invalid, ValidateUTF8({"\xf4"}));

// More tests in TestValidateData() above
// (ValidateFull() calls ValidateUTF8() internally)
}
};

TYPED_TEST_SUITE(TestUTF8Array, StringArrowTypes);
TYPED_TEST_SUITE(TestUTF8Array, StringOrStringViewArrowTypes);

TYPED_TEST(TestUTF8Array, TestValidateUTF8) { this->TestValidateUTF8(); }

Expand Down Expand Up @@ -883,11 +983,15 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
void SetUp() override { type_ = TypeTraits<TypeClass>::type_singleton(); }

void TestBasics() {
auto array = ArrayFromJSON(type_, R"(["foo", null, "bar"])");
auto array = ArrayFromJSON(
type_,
R"(["foo", null, "bar", "inline_me", "allocate_me_aaaaa", "allocate_me_bbbb"])");
BinaryAppender appender;
ArraySpanVisitor<TypeClass> visitor;
ASSERT_OK(visitor.Visit(*array->data(), &appender));
ASSERT_THAT(appender.data, ::testing::ElementsAreArray({"foo", "(null)", "bar"}));
ASSERT_THAT(appender.data,
::testing::ElementsAreArray({"foo", "(null)", "bar", "inline_me",
"allocate_me_aaaaa", "allocate_me_bbbb"}));
ARROW_UNUSED(visitor); // Workaround weird MSVC warning
}

Expand All @@ -904,7 +1008,7 @@ class TestBaseBinaryDataVisitor : public ::testing::Test {
std::shared_ptr<DataType> type_;
};

TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryArrowTypes);
TYPED_TEST_SUITE(TestBaseBinaryDataVisitor, BaseBinaryOrBinaryViewLikeArrowTypes);

TYPED_TEST(TestBaseBinaryDataVisitor, Basics) { this->TestBasics(); }

Expand Down
8 changes: 7 additions & 1 deletion cpp/src/arrow/array/array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -382,10 +382,12 @@ static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseType
float64(),
binary(),
large_binary(),
binary_view(),
fixed_size_binary(3),
decimal(16, 4),
utf8(),
large_utf8(),
utf8_view(),
list(utf8()),
list(int64()), // NOTE: Regression case for ARROW-9071/MakeArrayOfNull
list(large_utf8()),
Expand Down Expand Up @@ -601,12 +603,15 @@ static ScalarVector GetScalars() {
std::make_shared<DurationScalar>(60, duration(TimeUnit::SECOND)),
std::make_shared<BinaryScalar>(hello),
std::make_shared<LargeBinaryScalar>(hello),
std::make_shared<BinaryViewScalar>(hello),
std::make_shared<FixedSizeBinaryScalar>(
hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
std::make_shared<StringScalar>(hello),
std::make_shared<LargeStringScalar>(hello),
std::make_shared<StringViewScalar>(hello),
std::make_shared<StringViewScalar>(Buffer::FromString("long string; not inlined")),
std::make_shared<ListScalar>(ArrayFromJSON(int8(), "[1, 2, 3]")),
ScalarFromJSON(map(int8(), utf8()), R"([[1, "foo"], [2, "bar"]])"),
std::make_shared<LargeListScalar>(ArrayFromJSON(int8(), "[1, 1, 2, 2, 3, 3]")),
Expand Down Expand Up @@ -647,13 +652,14 @@ TEST_F(TestArray, TestMakeArrayFromScalar) {

for (int64_t length : {16}) {
for (auto scalar : scalars) {
ARROW_SCOPED_TRACE("scalar type: ", scalar->type->ToString());
ASSERT_OK_AND_ASSIGN(auto array, MakeArrayFromScalar(*scalar, length));
ASSERT_OK(array->ValidateFull());
ASSERT_EQ(array->length(), length);
ASSERT_EQ(array->null_count(), 0);

// test case for ARROW-13321
for (int64_t i : std::vector<int64_t>{0, length / 2, length - 1}) {
for (int64_t i : {int64_t{0}, length / 2, length - 1}) {
ASSERT_OK_AND_ASSIGN(auto s, array->GetScalar(i));
AssertScalarsEqual(*s, *scalar, /*verbose=*/true);
}
Expand Down

0 comments on commit 0ccedb5

Please sign in to comment.