From 9bb7d7db848de788e96ccd6e49f88621add413a4 Mon Sep 17 00:00:00 2001 From: hrishisd Date: Mon, 25 Sep 2023 10:55:29 -0400 Subject: [PATCH] GH-37829: [Java] Avoid resizing data buffer twice when appending variable length vectors (#37844) ### Rationale for this change This change prevents avoidable `OversizedAllocationException`s when appending a variable-length vector with many small elements to a variable-length vector with a few large elements. When appending variable-length vectors, `VectorAppender` iteratively doubles the offset and validity buffers until they can accommodate the combined elements. In the previous implementation, each iteration would also double the data buffer's capacity. This behavior is appropriate for vectors of fixed-size types but can result in an oversized data buffers when appending many small elements to a variable length vector with a large data buffer. ### What changes are included in this PR? The new behavior only resizes the offset and validity buffers when resizing the target vector's buffers to ensure they can hold the total number of combined elements. The data buffer is resized based on the total required data size of the combined elements. ### Are these changes tested? Yes. I added a unit test that results in an `OversizedAllocationException` when run against the previous version of the code. ### Are there any user-facing changes? No. * Closes: #37829 Authored-by: hrishisd Signed-off-by: David Li --- .../arrow/vector/util/VectorAppender.java | 4 +-- .../arrow/vector/util/TestVectorAppender.java | 27 ++++++++++++++++++- .../util/TestVectorSchemaRootAppender.java | 2 +- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java index 9f73732ccfdd3..c5de380f9c173 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/util/VectorAppender.java @@ -116,7 +116,7 @@ public ValueVector visit(BaseVariableWidthVector deltaVector, Void value) { // make sure there is enough capacity while (targetVector.getValueCapacity() < newValueCount) { - targetVector.reAlloc(); + ((BaseVariableWidthVector) targetVector).reallocValidityAndOffsetBuffers(); } while (targetVector.getDataBuffer().capacity() < newValueCapacity) { ((BaseVariableWidthVector) targetVector).reallocDataBuffer(); @@ -170,7 +170,7 @@ public ValueVector visit(BaseLargeVariableWidthVector deltaVector, Void value) { // make sure there is enough capacity while (targetVector.getValueCapacity() < newValueCount) { - targetVector.reAlloc(); + ((BaseLargeVariableWidthVector) targetVector).reallocValidityAndOffsetBuffers(); } while (targetVector.getDataBuffer().capacity() < newValueCapacity) { ((BaseLargeVariableWidthVector) targetVector).reallocDataBuffer(); diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java index 25d26623d5c05..ab36ea2fd2129 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorAppender.java @@ -21,11 +21,14 @@ import static junit.framework.TestCase.assertTrue; import static org.junit.jupiter.api.Assertions.assertThrows; +import java.nio.charset.StandardCharsets; import java.util.Arrays; +import java.util.Collections; import java.util.List; import org.apache.arrow.memory.BufferAllocator; import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.BaseValueVector; import org.apache.arrow.vector.BigIntVector; import org.apache.arrow.vector.BitVector; import org.apache.arrow.vector.Float4Vector; @@ -63,7 +66,8 @@ public class TestVectorAppender { @Before public void prepare() { - allocator = new RootAllocator(1024 * 1024); + // Permit allocating 4 vectors of max size. + allocator = new RootAllocator(4 * BaseValueVector.MAX_ALLOCATION_SIZE); } @After @@ -185,6 +189,27 @@ public void testAppendEmptyVariableWidthVector() { } } + @Test + public void testAppendLargeAndSmallVariableVectorsWithinLimit() { + int sixteenthOfMaxAllocation = Math.toIntExact(BaseValueVector.MAX_ALLOCATION_SIZE / 16); + try (VarCharVector target = makeVarCharVec(1, sixteenthOfMaxAllocation); + VarCharVector delta = makeVarCharVec(sixteenthOfMaxAllocation, 1)) { + new VectorAppender(delta).visit(target, null); + new VectorAppender(target).visit(delta, null); + } + } + + private VarCharVector makeVarCharVec(int numElements, int bytesPerElement) { + VarCharVector v = new VarCharVector("text", allocator); + v.allocateNew((long) numElements * bytesPerElement, numElements); + for (int i = 0; i < numElements; i++) { + String s = String.join("", Collections.nCopies(bytesPerElement, "a")); + v.setSafe(i, s.getBytes(StandardCharsets.US_ASCII)); + } + v.setValueCount(numElements); + return v; + } + @Test public void testAppendLargeVariableWidthVector() { final int length1 = 5; diff --git a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java index ab0ee3a2075a3..6309d385870c9 100644 --- a/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java +++ b/java/vector/src/test/java/org/apache/arrow/vector/util/TestVectorSchemaRootAppender.java @@ -50,7 +50,7 @@ public void shutdown() { } @Test - public void testVectorScehmaRootAppend() { + public void testVectorSchemaRootAppend() { final int length1 = 5; final int length2 = 3; final int length3 = 2;