From b6c4fbc2ce9281829b625d33a736262f3f9ae739 Mon Sep 17 00:00:00 2001
From: Anja Kefala <anja.kefala@gmail.com>
Date: Wed, 20 Mar 2024 11:32:19 -0400
Subject: [PATCH] GH-40316: [Python] only allocate the ScalarMemoTable when
 used (#40565)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### Rationale for this change

Mimalloc and jemalloc can allocate a [relatively large amount of memory for the ScalarMemoTable](https://github.com/apache/arrow/issues/40301). For this reason, the ScalarMemoTable should only be allocated when it is used (when `options.deduplicate_objects=True`).

I tested this change, and for small tables it does improve memory allocation.

`options.deduplicate_objects=False`

After this change:

📦 Total memory allocated:
        174.422MB

📊 Histogram of allocation size:
        min: 1.000B
        --------------------------------------------
        < 6.000B   :  3064 ▇▇
        < 36.000B  :  7533 ▇▇▇▇
        < 222.000B :  9974 ▇▇▇▇▇
        < 1.319KB  : 53264 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
        < 7.999KB  :  5188 ▇▇▇
        < 48.503KB :   742 ▇
        < 294.066KB:   102 ▇
        < 1.741MB  :    22 ▇
        < 10.556MB :     1 ▇
        <=64.000MB :     1 ▇
        --------------------------------------------
        max: 64.000MB

Before this change:

📦 Total memory allocated:
        1.295GB

📊 Histogram of allocation size:
        min: 1.000B
        --------------------------------------------
        < 6.000B   :  3064 ▇▇
        < 36.000B  :  7543 ▇▇▇▇
        < 222.000B : 10009 ▇▇▇▇▇
        < 1.319KB  : 53269 ▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
        < 7.999KB  :  5192 ▇▇▇
        < 48.503KB :   761 ▇
        < 294.066KB:   102 ▇
        < 1.741MB  :    22 ▇
        < 10.556MB :     1 ▇
        <=64.000MB :    19 ▇
        --------------------------------------------
        max: 64.000MB

### What changes are included in this PR?

The allocation of `memo_table` and `unique_values` have been moved underneath an `if (options.deduplicate_objects)` block. Since they are used within a lambda, they have been changed to shared pointers, so that their values exist for the lifetime needed.

### Are these changes tested?

`deduplicate_objects` has extensive existing tests: https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/pyarrow/tests/test_pandas.py#L3211 and https://github.com/apache/arrow/blob/b235f83ed10bcad174b267113479a24ca045def5/python/benchmarks/convert_pandas.py#L71

### Are there any user-facing changes?

Nope.
* GitHub Issue: #40316

Lead-authored-by: anjakefala <anja.kefala@gmail.com>
Co-authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
---
 .../src/arrow/python/arrow_to_pandas.cc       | 58 +++++++++----------
 1 file changed, 29 insertions(+), 29 deletions(-)
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index 023ba5585e704..5bfd5853fae69 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -622,40 +622,40 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using Scalar = typename MemoizationTraits<Type>::Scalar;
 
-  ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
-  std::vector<PyObject*> unique_values;
-  int32_t memo_size = 0;
-
-  auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
-    int32_t memo_index;
-    RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
-    if (memo_index == memo_size) {
-      // New entry
-      RETURN_NOT_OK(wrap_func(value, out_values));
-      unique_values.push_back(*out_values);
-      ++memo_size;
-    } else {
-      // Duplicate entry
-      Py_INCREF(unique_values[memo_index]);
-      *out_values = unique_values[memo_index];
+  auto convert_chunks = [&](auto&& wrap_func) -> Status {
+    for (int c = 0; c < data.num_chunks(); c++) {
+      const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
+      RETURN_NOT_OK(internal::WriteArrayObjects(arr, wrap_func, out_values));
+      out_values += arr.length();
     }
     return Status::OK();
   };
 
-  auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) {
-    return wrap_func(value, out_values);
-  };
-
-  for (int c = 0; c < data.num_chunks(); c++) {
-    const auto& arr = arrow::internal::checked_cast<const ArrayType&>(*data.chunk(c));
-    if (options.deduplicate_objects) {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values));
-    } else {
-      RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values));
-    }
-    out_values += arr.length();
+  if (options.deduplicate_objects) {
+    // GH-40316: only allocate a memo table if deduplication is enabled.
+    ::arrow::internal::ScalarMemoTable<Scalar> memo_table(options.pool);
+    std::vector<PyObject*> unique_values;
+    int32_t memo_size = 0;
+
+    auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) {
+      int32_t memo_index;
+      RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index));
+      if (memo_index == memo_size) {
+        // New entry
+        RETURN_NOT_OK(wrap_func(value, out_values));
+        unique_values.push_back(*out_values);
+        ++memo_size;
+      } else {
+        // Duplicate entry
+        Py_INCREF(unique_values[memo_index]);
+        *out_values = unique_values[memo_index];
+      }
+      return Status::OK();
+    };
+    return convert_chunks(std::move(WrapMemoized));
+  } else {
+    return convert_chunks(std::forward<WrapFunction>(wrap_func));
   }
-  return Status::OK();
 }
 
 Status ConvertStruct(PandasOptions options, const ChunkedArray& data,