apacheGH-40316: [Python] only allocate the ScalarMemoTable when used (a…

…pache#40565) ### Rationale for this change Mimalloc and jemalloc can allocate a [relatively large amount of memory for the ScalarMemoTable](apache#40301). For this reason, the ScalarMemoTable should only be allocated when it is used (when `options.deduplicate_objects=True`). I tested this change, and for small tables it does improve memory allocation. `options.deduplicate_objects=False` After this change: 📦 Total memory allocated: 174.422MB 📊 Histogram of allocation size: min: 1.000B -------------------------------------------- < 6.000B : 3064 ▇▇ < 36.000B : 7533 ▇▇▇▇ < 222.000B : 9974 ▇▇▇▇▇ < 1.319KB : 53264 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ < 7.999KB : 5188 ▇▇▇ < 48.503KB : 742 ▇ < 294.066KB: 102 ▇ < 1.741MB : 22 ▇ < 10.556MB : 1 ▇ <=64.000MB : 1 ▇ -------------------------------------------- max: 64.000MB Before this change: 📦 Total memory allocated: 1.295GB 📊 Histogram of allocation size: min: 1.000B -------------------------------------------- < 6.000B : 3064 ▇▇ < 36.000B : 7543 ▇▇▇▇ < 222.000B : 10009 ▇▇▇▇▇ < 1.319KB : 53269 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇ < 7.999KB : 5192 ▇▇▇ < 48.503KB : 761 ▇ < 294.066KB: 102 ▇ < 1.741MB : 22 ▇ < 10.556MB : 1 ▇ <=64.000MB : 19 ▇ -------------------------------------------- max: 64.000MB ### What changes are included in this PR? The allocation of `memo_table` and `unique_values` have been moved underneath an `if (options.deduplicate_objects)` block. Since they are used within a lambda, they have been changed to shared pointers, so that their values exist for the lifetime needed. ### Are these changes tested? `deduplicate_objects` has extensive existing tests: https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/pyarrow/tests/test_pandas.py#L3211 and https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/benchmarks/convert_pandas.py#L71 ### Are there any user-facing changes? Nope. * GitHub Issue: apache#40316 Lead-authored-by: anjakefala <anja.kefala@gmail.com> Co-authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
pitrou · Mar 20, 2024 · b6c4fbc · b6c4fbc
1 parent 0eefb07
commit b6c4fbc
Showing 1 changed file with 29 additions and 29 deletions.
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -622,40 +622,40 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using Scalar = typename MemoizationTraits<Type>::Scalar;
 
-  ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
-  std::vector<PyObject*> unique_values;
-  int32_t memo_size = 0;
-
-  auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
-    int32_t memo_index;
-    RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
-    if (memo_index == memo_size) {
-      // New entry
-      RETURN_NOT_OK(wrap_func(value, out_values));
-      unique_values.push_back(*out_values);
-      ++memo_size;
-    } else {
-      // Duplicate entry
-      Py_INCREF(unique_values[memo_index]);
-      *out_values = unique_values[memo_index];
+  auto convert_chunks = [&](auto&& wrap_func) -> Status {
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
+      RETURN_NOT_OK(internal::WriteArrayObjects(arr, wrap_func, out_values));
+      out_values += arr.length();
     }
     return Status::OK();
   };
 
-  auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) {
-    return wrap_func(value, out_values);
-  };
-
-  for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
-    if (options.deduplicate_objects) {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
-    } else {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
-    }
-    out_values += arr.length();
+  if (options.deduplicate_objects) {
+    // GH-40316: only allocate a memo table if deduplication is enabled.
+    ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
+    std::vector<PyObject*> unique_values;
+    int32_t memo_size = 0;
+
+    auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
+      int32_t memo_index;
+      RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
+      if (memo_index == memo_size) {
+        // New entry
+        RETURN_NOT_OK(wrap_func(value, out_values));
+        unique_values.push_back(*out_values);
+        ++memo_size;
+      } else {
+        // Duplicate entry
+        Py_INCREF(unique_values[memo_index]);
+        *out_values = unique_values[memo_index];
+      }
+      return Status::OK();
+    };
+    return convert_chunks(std::move(WrapMemoized));
+  } else {
+    return convert_chunks(std::forward<WrapFunction>(wrap_func));
   }
-  return Status::OK();
 }
 
 Status ConvertStruct(PandasOptions options, const ChunkedArray& data,