From b6c4fbc2ce9281829b625d33a736262f3f9ae739 Mon Sep 17 00:00:00 2001 From: Anja Kefala Date: Wed, 20 Mar 2024 11:32:19 -0400 Subject: [PATCH] GH-40316: [Python] only allocate the ScalarMemoTable when used (#40565) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Mimalloc and jemalloc can allocate a [relatively large amount of memory for the ScalarMemoTable](https://github.com/apache/arrow/issues/40301). For this reason, the ScalarMemoTable should only be allocated when it is used (when `options.deduplicate_objects=True`). I tested this change, and for small tables it does improve memory allocation. `options.deduplicate_objects=False` After this change: 📦 Total memory allocated: 174.422MB 📊 Histogram of allocation size: min: 1.000B -------------------------------------------- < 6.000B : 3064 ▇▇ < 36.000B : 7533 ▇▇▇▇ < 222.000B : 9974 ▇▇▇▇▇ < 1.319KB : 53264 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ < 7.999KB : 5188 ▇▇▇ < 48.503KB : 742 ▇ < 294.066KB: 102 ▇ < 1.741MB : 22 ▇ < 10.556MB : 1 ▇ <=64.000MB : 1 ▇ -------------------------------------------- max: 64.000MB Before this change: 📦 Total memory allocated: 1.295GB 📊 Histogram of allocation size: min: 1.000B -------------------------------------------- < 6.000B : 3064 ▇▇ < 36.000B : 7543 ▇▇▇▇ < 222.000B : 10009 ▇▇▇▇▇ < 1.319KB : 53269 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ < 7.999KB : 5192 ▇▇▇ < 48.503KB : 761 ▇ < 294.066KB: 102 ▇ < 1.741MB : 22 ▇ < 10.556MB : 1 ▇ <=64.000MB : 19 ▇ -------------------------------------------- max: 64.000MB ### What changes are included in this PR? The allocation of `memo_table` and `unique_values` have been moved underneath an `if (options.deduplicate_objects)` block. Since they are used within a lambda, they have been changed to shared pointers, so that their values exist for the lifetime needed. ### Are these changes tested? `deduplicate_objects` has extensive existing tests: https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/pyarrow/tests/test_pandas.py#L3211 and https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/benchmarks/convert_pandas.py#L71 ### Are there any user-facing changes? Nope. * GitHub Issue: #40316 Lead-authored-by: anjakefala Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- .../src/arrow/python/arrow_to_pandas.cc | 58 +++++++++---------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 023ba5585e704..5bfd5853fae69 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -622,40 +622,40 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra using ArrayType = typename TypeTraits::ArrayType; using Scalar = typename MemoizationTraits::Scalar; - ::arrow::internal::ScalarMemoTable memo_table(options.pool); - std::vector unique_values; - int32_t memo_size = 0; - - auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { - int32_t memo_index; - RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); - if (memo_index == memo_size) { - // New entry - RETURN_NOT_OK(wrap_func(value, out_values)); - unique_values.push_back(*out_values); - ++memo_size; - } else { - // Duplicate entry - Py_INCREF(unique_values[memo_index]); - *out_values = unique_values[memo_index]; + auto convert_chunks = [&](auto&& wrap_func) -> Status { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); + RETURN_NOT_OK(internal::WriteArrayObjects(arr, wrap_func, out_values)); + out_values += arr.length(); } return Status::OK(); }; - auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { - return wrap_func(value, out_values); - }; - - for (int c = 0; c < data.num_chunks(); c++) { - const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); - if (options.deduplicate_objects) { - RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values)); - } else { - RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values)); - } - out_values += arr.length(); + if (options.deduplicate_objects) { + // GH-40316: only allocate a memo table if deduplication is enabled. + ::arrow::internal::ScalarMemoTable memo_table(options.pool); + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index; + RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + return convert_chunks(std::move(WrapMemoized)); + } else { + return convert_chunks(std::forward(wrap_func)); } - return Status::OK(); } Status ConvertStruct(PandasOptions options, const ChunkedArray& data,