Skip to content

Commit

Permalink
GH-40297: [C++] Add TensorFromJSON helper function (#40365)
Browse files Browse the repository at this point in the history
### Rationale for this change

To make tests easier to write and read, we should create a `TensorFromJSON()` helper function.

* GitHub Issue: apache#40297

Lead-authored-by: AlenkaF <frim.alenka@gmail.com>
Co-authored-by: Alenka Frim <AlenkaF@users.noreply.github.com>
Co-authored-by: Rok Mihevc <rok@mihevc.org>
Signed-off-by: AlenkaF <frim.alenka@gmail.com>
  • Loading branch information
AlenkaF and rok committed Mar 14, 2024
1 parent e268b31 commit bad7b1a
Show file tree
Hide file tree
Showing 4 changed files with 110 additions and 35 deletions.
52 changes: 17 additions & 35 deletions cpp/src/arrow/record_batch_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -705,17 +705,12 @@ TEST_F(TestRecordBatch, ToTensorSupportedNaN) {
std::vector<int64_t> shape = {9, 2};
const int64_t f32_size = sizeof(float);
std::vector<int64_t> f_strides = {f32_size, f32_size * shape[0]};
std::vector<float> f_values = {
static_cast<float>(NAN), 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40,
static_cast<float>(NAN), 60, 70, 80, 90};
auto data = Buffer::Wrap(f_values);

std::shared_ptr<Tensor> tensor_expected;
ASSERT_OK_AND_ASSIGN(tensor_expected, Tensor::Make(float32(), data, shape, f_strides));
std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
float32(), "[NaN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, NaN, 60, 70, 80, 90]",
shape, f_strides);

EXPECT_FALSE(tensor_expected->Equals(*tensor));
EXPECT_TRUE(tensor_expected->Equals(*tensor, EqualOptions().nans_equal(true)));

CheckTensor<FloatType>(tensor, 18, shape, f_strides);
}

Expand Down Expand Up @@ -752,15 +747,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape = {9, 3};
std::vector<int64_t> f_strides = {unit_size, unit_size * shape[0]};
std::vector<c_data_type> f_values = {1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 20, 30, 40, 50, 60, 70, 80, 90,
100, 100, 100, 100, 100, 100, 100, 100, 100};
auto data = Buffer::Wrap(f_values);

std::shared_ptr<Tensor> tensor_expected;
ASSERT_OK_AND_ASSIGN(
tensor_expected,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data, shape, f_strides));
std::shared_ptr<Tensor> tensor_expected = TensorFromJSON(
TypeTraits<DataType>::type_singleton(),
"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, "
"80, 90, 100, 100, 100, 100, 100, 100, 100, 100, 100]",
shape, f_strides);

EXPECT_TRUE(tensor_expected->Equals(*tensor));
CheckTensor<DataType>(tensor, 27, shape, f_strides);
Expand All @@ -773,15 +764,11 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape_sliced = {8, 3};
std::vector<int64_t> f_strides_sliced = {unit_size, unit_size * shape_sliced[0]};
std::vector<c_data_type> f_values_sliced = {2, 3, 4, 5, 6, 7, 8, 9,
20, 30, 40, 50, 60, 70, 80, 90,
100, 100, 100, 100, 100, 100, 100, 100};
auto data_sliced = Buffer::Wrap(f_values_sliced);

std::shared_ptr<Tensor> tensor_expected_sliced;
ASSERT_OK_AND_ASSIGN(tensor_expected_sliced,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data_sliced,
shape_sliced, f_strides_sliced));
std::shared_ptr<Tensor> tensor_expected_sliced =
TensorFromJSON(TypeTraits<DataType>::type_singleton(),
"[2, 3, 4, 5, 6, 7, 8, 9, 20, 30, 40, 50, 60, "
"70, 80, 90, 100, 100, 100, 100, 100, 100, 100, 100]",
shape_sliced, f_strides_sliced);

EXPECT_TRUE(tensor_expected_sliced->Equals(*tensor_sliced));
CheckTensor<DataType>(tensor_expected_sliced, 24, shape_sliced, f_strides_sliced);
Expand All @@ -793,15 +780,10 @@ TYPED_TEST_P(TestBatchToTensor, SupportedTypes) {

std::vector<int64_t> shape_sliced_1 = {5, 3};
std::vector<int64_t> f_strides_sliced_1 = {unit_size, unit_size * shape_sliced_1[0]};
std::vector<c_data_type> f_values_sliced_1 = {
2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100,
};
auto data_sliced_1 = Buffer::Wrap(f_values_sliced_1);

std::shared_ptr<Tensor> tensor_expected_sliced_1;
ASSERT_OK_AND_ASSIGN(tensor_expected_sliced_1,
Tensor::Make(TypeTraits<DataType>::type_singleton(), data_sliced_1,
shape_sliced_1, f_strides_sliced_1));
std::shared_ptr<Tensor> tensor_expected_sliced_1 =
TensorFromJSON(TypeTraits<DataType>::type_singleton(),
"[2, 3, 4, 5, 6, 20, 30, 40, 50, 60, 100, 100, 100, 100, 100]",
shape_sliced_1, f_strides_sliced_1);

EXPECT_TRUE(tensor_expected_sliced_1->Equals(*tensor_sliced_1));
CheckTensor<DataType>(tensor_expected_sliced_1, 15, shape_sliced_1, f_strides_sliced_1);
Expand Down
43 changes: 43 additions & 0 deletions cpp/src/arrow/testing/gtest_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,11 @@
#include "arrow/compute/api_vector.h"
#include "arrow/datum.h"
#include "arrow/ipc/json_simple.h"
#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/pretty_print.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/tensor.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/config.h"
Expand All @@ -62,6 +64,10 @@
#include "arrow/util/thread_pool.h"
#include "arrow/util/windows_compatibility.h"

#include <rapidjson/document.h>

namespace rj = arrow::rapidjson;

namespace arrow {

using internal::checked_cast;
Expand Down Expand Up @@ -425,6 +431,43 @@ std::shared_ptr<Table> TableFromJSON(const std::shared_ptr<Schema>& schema,
return *Table::FromRecordBatches(schema, std::move(batches));
}

std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data, std::string_view shape,
std::string_view strides,
std::string_view dim_names) {
std::shared_ptr<Array> array = ArrayFromJSON(type, data);

rj::Document json_shape;
json_shape.Parse(shape.data(), shape.length());
std::vector<int64_t> shape_vector;
for (auto& x : json_shape.GetArray()) {
shape_vector.emplace_back(x.GetInt64());
}
rj::Document json_strides;
json_strides.Parse(strides.data(), strides.length());
std::vector<int64_t> strides_vector;
for (auto& x : json_strides.GetArray()) {
strides_vector.emplace_back(x.GetInt64());
}
rj::Document json_dim_names;
json_dim_names.Parse(dim_names.data(), dim_names.length());
std::vector<std::string> dim_names_vector;
for (auto& x : json_dim_names.GetArray()) {
dim_names_vector.emplace_back(x.GetString());
}
return *Tensor::Make(type, array->data()->buffers[1], shape_vector, strides_vector,
dim_names_vector);
}

// Builds a Tensor from JSON-encoded values plus already-materialized shape,
// strides and dimension names.
//
// `data` is parsed with ArrayFromJSON(type, data); the resulting array's second
// buffer (buffers[1]) becomes the tensor storage. The Result returned by
// Tensor::Make is unwrapped without an explicit status check.
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
                                       std::string_view data,
                                       const std::vector<int64_t>& shape,
                                       const std::vector<int64_t>& strides,
                                       const std::vector<std::string>& dim_names) {
  const std::shared_ptr<Array> parsed_values = ArrayFromJSON(type, data);
  const auto& values_buffer = parsed_values->data()->buffers[1];
  auto maybe_tensor = Tensor::Make(type, values_buffer, shape, strides, dim_names);
  return std::move(maybe_tensor).ValueOrDie();
}

Result<std::shared_ptr<Table>> RunEndEncodeTableColumns(
const Table& table, const std::vector<int>& column_indices) {
const int num_columns = table.num_columns();
Expand Down
13 changes: 13 additions & 0 deletions cpp/src/arrow/testing/gtest_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,19 @@ ARROW_TESTING_EXPORT
std::shared_ptr<Table> TableFromJSON(const std::shared_ptr<Schema>&,
const std::vector<std::string>& json);

/// \brief Build a Tensor from JSON text (test helper)
///
/// \param[in] type element type, forwarded to ArrayFromJSON and Tensor::Make
/// \param[in] data JSON array holding the tensor values; it is parsed with
///   ArrayFromJSON(type, data)
/// \param[in] shape JSON array of integers, e.g. "[9, 2]"
/// \param[in] strides JSON array of integers; the default "[]" passes an empty
///   strides vector to Tensor::Make
/// \param[in] dim_names JSON array of strings; the default "[]" passes an empty
///   list of dimension names to Tensor::Make
/// \return the tensor produced by Tensor::Make. Inputs are expected to be
///   well-formed; no Status/Result is surfaced to the caller.
ARROW_TESTING_EXPORT
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data, std::string_view shape,
std::string_view strides = "[]",
std::string_view dim_names = "[]");

/// \brief Build a Tensor from JSON-encoded values and explicit layout vectors
/// (test helper)
///
/// \param[in] type element type, forwarded to ArrayFromJSON and Tensor::Make
/// \param[in] data JSON array holding the tensor values; it is parsed with
///   ArrayFromJSON(type, data)
/// \param[in] shape tensor shape, forwarded to Tensor::Make
/// \param[in] strides tensor strides, forwarded to Tensor::Make (empty by default)
/// \param[in] dim_names dimension names, forwarded to Tensor::Make (empty by
///   default)
/// \return the tensor produced by Tensor::Make; its Result is dereferenced
///   without a status check, so no error is surfaced to the caller.
ARROW_TESTING_EXPORT
std::shared_ptr<Tensor> TensorFromJSON(const std::shared_ptr<DataType>& type,
std::string_view data,
const std::vector<int64_t>& shape,
const std::vector<int64_t>& strides = {},
const std::vector<std::string>& dim_names = {});

ARROW_TESTING_EXPORT
Result<std::shared_ptr<Table>> RunEndEncodeTableColumns(
const Table& table, const std::vector<int>& column_indices);
Expand Down
37 changes: 37 additions & 0 deletions cpp/src/arrow/testing/gtest_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "arrow/array/builder_decimal.h"
#include "arrow/datum.h"
#include "arrow/record_batch.h"
#include "arrow/tensor.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/testing/random.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -134,4 +135,40 @@ TEST_F(TestAssertContainsNaN, DatumEqual) {
AssertDatumsEqual(expected_chunked, actual_chunked);
}

// Empty fixture that groups the TensorFromJSON helper tests; it adds no shared
// state or setup.
class TestTensorFromJSON : public ::testing::Test {};

// The vector-based TensorFromJSON overload must yield the same tensor as
// Tensor::Make over a hand-built buffer with explicit column-major strides.
TEST_F(TestTensorFromJSON, FromJSONAndArray) {
  constexpr int64_t kElementWidth = sizeof(int64_t);
  const std::vector<int64_t> dims = {9, 2};
  // First dimension is contiguous: stride of one element, then one full column.
  const std::vector<int64_t> column_major_strides = {kElementWidth,
                                                     kElementWidth * dims[0]};
  const std::vector<int64_t> raw_values = {1,  2,  3,  4,  5,  6,  7,  8,  9,
                                           10, 20, 30, 40, 50, 60, 70, 80, 90};

  ASSERT_OK_AND_ASSIGN(
      std::shared_ptr<Tensor> expected,
      Tensor::Make(int64(), Buffer::Wrap(raw_values), dims, column_major_strides));

  const std::shared_ptr<Tensor> actual = TensorFromJSON(
      int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
      dims, column_major_strides);

  EXPECT_TRUE(expected->Equals(*actual));
}

// The all-JSON TensorFromJSON overload (shape given as text, strides and
// dimension names left at their defaults) must match Tensor::Make called with
// only a shape.
TEST_F(TestTensorFromJSON, FromJSON) {
  const std::vector<int64_t> dims = {9, 2};
  const std::vector<int64_t> raw_values = {1,  2,  3,  4,  5,  6,  7,  8,  9,
                                           10, 20, 30, 40, 50, 60, 70, 80, 90};

  ASSERT_OK_AND_ASSIGN(std::shared_ptr<Tensor> expected,
                       Tensor::Make(int64(), Buffer::Wrap(raw_values), dims));

  const std::shared_ptr<Tensor> actual = TensorFromJSON(
      int64(), "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90]",
      "[9, 2]");

  EXPECT_TRUE(expected->Equals(*actual));
}

} // namespace arrow

0 comments on commit bad7b1a

Please sign in to comment.