
[scudo] Allow to resize allocation ring buffer #82683

Conversation

@fmayer (Contributor) commented Feb 22, 2024

@llvmbot (Collaborator) commented Feb 22, 2024

@llvm/pr-subscribers-compiler-rt-sanitizer

Author: Florian Mayer (fmayer)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/82683.diff

4 Files Affected:

  • (modified) compiler-rt/lib/scudo/standalone/combined.h (+34-10)
  • (modified) compiler-rt/lib/scudo/standalone/include/scudo/interface.h (+2)
  • (modified) compiler-rt/lib/scudo/standalone/tests/combined_test.cpp (+27)
  • (modified) compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp (+4)
diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 79b5891fd35869..479e49c328e3d7 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -174,9 +174,11 @@ class Allocator {
         static_cast<uptr>(getFlags()->quarantine_size_kb << 10),
         static_cast<uptr>(getFlags()->thread_local_quarantine_size_kb << 10));
 
-    mapAndInitializeRingBuffer();
+    mapAndInitializeRingBuffer(getFlags()->allocation_ring_buffer_size);
   }
 
+  bool resizeRingBuffer(int Size) { return mapAndInitializeRingBuffer(Size); }
+
   // Initialize the embedded GWP-ASan instance. Requires the main allocator to
   // be functional, best called from PostInitCallback.
   void initGwpAsan() {
@@ -1531,11 +1533,15 @@ class Allocator {
         RBEntryStart)[N];
   }
 
-  void mapAndInitializeRingBuffer() {
-    if (getFlags()->allocation_ring_buffer_size <= 0)
-      return;
-    u32 AllocationRingBufferSize =
-        static_cast<u32>(getFlags()->allocation_ring_buffer_size);
+  bool mapAndInitializeRingBuffer(int RingBufferSize) {
+    if (RingBufferSize < 0 ||
+        static_cast<unsigned int>(RingBufferSize) >= UINT32_MAX)
+      return false;
+    if (RingBufferSize == 0) {
+      swapOutRingBuffer(nullptr);
+      return true;
+    }
+    u32 AllocationRingBufferSize = static_cast<u32>(RingBufferSize);
 
     // We store alloc and free stacks for each entry.
     constexpr u32 kStacksPerRingBufferEntry = 2;
@@ -1555,11 +1561,11 @@ class Allocator {
                   UINTPTR_MAX);
 
     if (AllocationRingBufferSize > kMaxU32Pow2 / kStacksPerRingBufferEntry)
-      return;
+      return false;
     u32 TabSize = static_cast<u32>(roundUpPowerOfTwo(kStacksPerRingBufferEntry *
                                                      AllocationRingBufferSize));
     if (TabSize > UINT32_MAX / kFramesPerStack)
-      return;
+      return false;
     u32 RingSize = static_cast<u32>(TabSize * kFramesPerStack);
     DCHECK(isPowerOfTwo(RingSize));
 
@@ -1585,12 +1591,30 @@ class Allocator {
     RB->StackDepotSize = StackDepotSize;
     RB->RawStackDepotMap = DepotMap;
 
-    atomic_store(&RingBufferAddress, reinterpret_cast<uptr>(RB),
-                 memory_order_release);
+    swapOutRingBuffer(RB);
     static_assert(sizeof(AllocationRingBuffer) %
                           alignof(typename AllocationRingBuffer::Entry) ==
                       0,
                   "invalid alignment");
+    return true;
+  }
+
+  void swapOutRingBuffer(AllocationRingBuffer *NewRB) {
+    AllocationRingBuffer *PrevRB = reinterpret_cast<AllocationRingBuffer *>(
+        atomic_exchange(&RingBufferAddress, reinterpret_cast<uptr>(NewRB),
+                        memory_order_acq_rel));
+    if (PrevRB) {
+      auto RawStackDepotMap = PrevRB->RawStackDepotMap;
+      if (RawStackDepotMap.isAllocated()) {
+        RawStackDepotMap.releaseAndZeroPagesToOS(
+            RawStackDepotMap.getBase(), RawStackDepotMap.getCapacity());
+      }
+      auto RawRingBufferMap = PrevRB->RawRingBufferMap;
+      if (RawRingBufferMap.isAllocated()) {
+        RawRingBufferMap.releaseAndZeroPagesToOS(
+            RawRingBufferMap.getBase(), RawRingBufferMap.getCapacity());
+      }
+    }
   }
 
   void unmapRingBuffer() {
diff --git a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
index a2dedea910cc08..8aca56b7172d9f 100644
--- a/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
+++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h
@@ -123,6 +123,8 @@ size_t __scudo_get_region_info_size(void);
 const char *__scudo_get_ring_buffer_addr(void);
 size_t __scudo_get_ring_buffer_size(void);
 
+bool __scudo_resize_ring_buffer(int);
+
 #ifndef M_DECAY_TIME
 #define M_DECAY_TIME -100
 #endif
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 13d627b116809b..7e7a0e59cfe6d0 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -915,6 +915,33 @@ SCUDO_TYPED_TEST(ScudoCombinedTest, StackDepot) {
   EXPECT_EQ(Depot->at(RingPosPtr + 2), 3u);
 }
 
+SCUDO_TYPED_TEST(ScudoCombinedTest, ResizeRingBuffer) {
+  auto *Allocator = this->Allocator.get();
+  auto Size = static_cast<int>(Allocator->getRingBufferSize());
+  auto DepotSize = Allocator->getStackDepotSize();
+  ASSERT_GT(Size, 0);
+  ASSERT_TRUE(Allocator->resizeRingBuffer(Size + 1024));
+  EXPECT_GT(Allocator->getRingBufferSize(), Size);
+  EXPECT_GT(Allocator->getStackDepotSize(), DepotSize);
+}
+
+SCUDO_TYPED_TEST(ScudoCombinedTest, ResizeRingBufferToZero) {
+  auto *Allocator = this->Allocator.get();
+  const char *PrevRB = Allocator->getRingBufferAddress();
+  auto PrevRBSize = Allocator->getRingBufferSize();
+  ASSERT_TRUE(Allocator->resizeRingBuffer(0));
+  EXPECT_EQ(Allocator->getRingBufferSize(), 0);
+  EXPECT_EQ(Allocator->getStackDepotSize(), 0);
+  EXPECT_EQ(Allocator->getStackDepotAddress(), nullptr);
+  EXPECT_EQ(Allocator->getRingBufferAddress(), nullptr);
+  // Make sure the old buffer is still accessible without a segfault.
+  // We DONTNEED the buffer, so we free the underlying pages, but we keep the
+  // VMA around to prevent races.
+  ASSERT_NE(PrevRB, nullptr);
+  ASSERT_GT(PrevRBSize, 0);
+  const_cast<volatile const char *>(PrevRB)[PrevRBSize - 1];
+}
+
 #if SCUDO_CAN_USE_PRIMARY64
 #if SCUDO_TRUSTY
 
diff --git a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
index e9d8c1e8d3db2f..004c9b9ef36cf6 100644
--- a/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
+++ b/compiler-rt/lib/scudo/standalone/wrappers_c_bionic.cpp
@@ -72,4 +72,8 @@ INTERFACE size_t __scudo_get_ring_buffer_size() {
   return Allocator.getRingBufferSize();
 }
 
+INTERFACE bool __scudo_resize_ring_buffer(int new_size) {
+  return Allocator.resizeRingBuffer(new_size);
+}
+
 #endif // SCUDO_ANDROID && _BIONIC

fmayer changed the base branch from users/fmayer/sprmain.scudo-allow-to-resize-allocation-ring-buffer to main on February 23, 2024 at 19:32
@fmayer (Author) commented Feb 28, 2024

PTAL

@ChiaHungDuan (Contributor) commented:

> PTAL

Sorry, I'm currently tied up with some other tasks. I will get back to this next Monday.

     u32 TabSize = static_cast<u32>(roundUpPowerOfTwo(kStacksPerRingBufferEntry *
                                                      AllocationRingBufferSize));
     if (TabSize > UINT32_MAX / kFramesPerStack)
-      return;
+      return false;
     u32 RingSize = static_cast<u32>(TabSize * kFramesPerStack);
     DCHECK(isPowerOfTwo(RingSize));
Contributor:

This should be a real check

fmayer (Author):

Sure (it seems clang optimizes this away anyway: https://godbolt.org/z/GM6sKM5YE): #84255

-        static_cast<u32>(getFlags()->allocation_ring_buffer_size);
+  bool mapAndInitializeRingBuffer(int RingBufferSize) {
+    if (RingBufferSize < 0 ||
+        static_cast<unsigned int>(RingBufferSize) >= UINT32_MAX)
Contributor:

This condition won't be triggered. What's the limit of RingBufferSize?

Comment on lines +1612 to +1614
// To allow resizeRingBuffer to be called in a multi-threaded context by apps,
// we do not actually unmap, but only madvise(DONTNEED) the pages. That way,
// straggler threads will not crash.
Contributor:

This is weird. We are leaking memory because we never unmap the old AllocationRingBuffer (we just leak the old mapping).

I think the right way to do this is to disable the AllocationRingBuffer (like how we disable the allocator) and do all the transition work (map the new buffer, copy the contents, etc.).

fmayer (Author):

We don't leak the memory; we DONTNEED it, so the kernel releases the actual memory. We only leak a few pages in the case where other threads touch them after we have released them. We do leak the mapping, that is correct.

We would need to lock-protect all the lock-free operations in order to disable and re-enable, and to make sure no thread currently has an operation in progress.

In the case where we know we are single-threaded, we can unmap the buffers afterwards. This is to support the case where apps want to change the buffer (I forwarded you an email on why they might want that).
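
For reference, the madvise(MADV_DONTNEED) behavior being relied on here can be demonstrated with a minimal standalone sketch (plain Linux semantics for private anonymous mappings; this is not part of the patch):

    #include <sys/mman.h>
    #include <cassert>
    #include <cstring>

    int main() {
      const size_t Size = 4096;
      void *Mem = mmap(nullptr, Size, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
      assert(Mem != MAP_FAILED);
      char *P = static_cast<char *>(Mem);
      memset(P, 0xAA, Size); // Populate the page.

      // Release the physical page but keep the VMA around. A later access
      // does not SIGSEGV: the kernel faults in a fresh zero-filled page.
      madvise(P, Size, MADV_DONTNEED);
      assert(P[Size - 1] == 0); // Still mapped; reads back as zero.
      return 0;
    }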

Contributor:

In addition, keeping the old buffer doesn't prevent the potential crash. Any access to the old buffer can cause problems, e.g. if some thread tries to read data from the old buffer after it has been discarded. Not to mention that the DONTNEED may happen at the same time some thread is still accessing the buffer, which seems like undefined behavior.

And yes, the buffer swap is complicated under the current design (thanks for the email, BTW). One easy way that comes to mind: we only reset the available size of the ring buffer, and the ring buffer is initialized up front with a value big enough for all of their cases. As a result, in most cases, they still use only a small amount of memory for the ring buffer.

I suppose we have the assumption that once we reset the available size, all writes that have happened or are in flight are viewed as stale (i.e., the buffer contains zero elements). This avoids the need to synchronize the write position and the available buffer size.

I'm not sure if we need to make some changes here and there in Android. What do you think?

fmayer (Author):

> And yes, the buffer swap is complicated under the current design (thanks for the email, BTW). One easy way that comes to mind: we only reset the available size of the ring buffer, and the ring buffer is initialized up front with a value big enough for all of their cases. As a result, in most cases, they still use only a small amount of memory for the ring buffer.

That is not possible with the current design of the StackDepot as a hash table.
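
Roughly why: the StackDepot is a fixed-size hash table whose slot computation bakes in the table size at creation. A simplified sketch of the indexing scheme (the names here are illustrative, not scudo's exact code):

    // Entries are placed at Hash & TabMask, where TabMask is derived from
    // the table size when the depot is created. Logically shrinking the
    // "available size" would change the mask and make existing entries
    // unfindable.
    u32 slotFor(u32 Hash, u32 TabMask) { return Hash & TabMask; }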

Contributor:

I'll take some time to review the StackDepot and share my thoughts here later.

Contributor:

I would think something like: apps that don't need to change the buffer size adopt the default behavior; otherwise, they set the buffer size and call malloc_set_track_allocation_stacks themselves.

Before the app starts, I think there are some places where we can call malloc_set_track_allocation_stacks, and maybe we can read the AndroidManifest.xml to determine whether we want to change the default ring buffer size (sorry, I'm not familiar with apps, so I'm not sure if AndroidManifest.xml can be used for this purpose).

We would need some changes for native processes as well (like a different place to call malloc_set_track_allocation_stacks).

Overall, I would guess it's still doable but may require some additional work in Android. I can ask around for others' opinions.

fmayer (Author):

An app developer explicitly asked for this to be a mallopt, not a manifest attribute.

Contributor:

This DONTNEED usage looks safe to me. A trailing access will either repopulate the page or get a substituted zero page, according to the spec. I'm not completely sure, but the latter might be the only actual possibility on Linux. In any case, the worst that may happen is we waste (leak) a page or two.

Contributor:

The case I'm concerned about is: one thread is writing to the old buffer while another thread calls DONTNEED on it. Both are writing ("either repopulate the page or substitute a zero page" is still a write) to the same memory, and they are not atomic operations. This is why I think it's undefined behavior.

About wasting (leaking) pages: right, this is another concern I have. Even if it's only for debugging, intentionally leaking memory in a memory allocator seems weird to me. From a maintenance perspective, it's confusing to people who are not familiar with this.

I know it's unlikely to cause problems, but I would prefer to consider any approaches that are safer and still reasonable.

Contributor:

I agree that the spec says the data is either repopulated or reads back as zero. However, if multiple threads are accessing this at the same time, I would presume that the repopulate-or-read-back-zero behavior is on a page-by-page basis. Is there something in the kernel that makes the operation atomic for the entire mapped range? The kernel might make this atomic by preventing all interrupts while executing the madvise.

Honestly, if this is only really used for debugging purposes, it might be better to simply let the old buffer leak and skip the madvise completely. Maybe even log that you are leaking the old data. Even the data going to zero requires that anyone ever touching this code remember that possibility, and it might result in undetected bugs in the future.

The best way would have been to copy the data for callers rather than handing out pointers to it. But since this is really a debugging facility, a simpler interface with some known issues is probably fine.

Comment on lines 1604 to 1607
static_assert(sizeof(AllocationRingBuffer) %
alignof(typename AllocationRingBuffer::Entry) ==
0,
"invalid alignment");
Contributor:

Please move this to the class definition of AllocationRingBuffer

@@ -1564,11 +1570,11 @@ class Allocator {
UINTPTR_MAX);
Contributor:

Please move this to the class definition of StackDepot

fmayer (Author):

This doesn't belong to the StackDepot; it belongs here, to make sure that even with the biggest possible TabSize and RingSize we don't overflow our 64-bit integer.

Contributor:

There's nothing here that relates to any constexpr arguments or variables. That's a constraint on the StackDepot (especially the static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0)).

Besides, I'm not sure if the overflow is intended:

UINT32_MAX * sizeof(atomic_u64) * UINT32_MAX * sizeof(atomic_u32)

I guess the assertion will never be triggered.

fmayer (Author):

If you read the comment, this assumes the maximum RingSize and TabSize, and the static_assert proves that we don't need to bother with overflow logic because it can never overflow.

fmayer (Author):

(Except of course it doesn't work, because that will also overflow. Let me remove it for now.)

Contributor:

I mean, from the statements

static_assert(sizeof(StackDepot) % alignof(atomic_u64) == 0);
static_assert(sizeof(StackDepot) + UINT32_MAX * sizeof(atomic_u64) *
                                       UINT32_MAX * sizeof(atomic_u32) < UINTPTR_MAX);

they are checking things independent of RingSize and TabSize. The checks don't need the values of those two variables; any values of RingSize/TabSize will static-assert the same thing. They are general constraints on the StackDepot.

The expression results in a number larger than uint32_t, but it'll wrap around, so the value always ends up in the range of uint32_t. When it is compared with UINTPTR_MAX, it's almost always true on 32-bit and always true on 64-bit; the static_assert doesn't help with this.

In addition, even if the StackDepot meets the requirement, you still can't allocate a ring buffer with UINT32_MAX entries. This seems to be an unreasonable assertion.

Contributor:

Just did a quick glance at this: another problem in that expression is that we may want to change it from

sizeof(StackDepot) + UINT32_MAX * sizeof(atomic_u64) * UINT32_MAX * sizeof(atomic_u32)

to

sizeof(StackDepot) + UINT32_MAX * sizeof(atomic_u64) + UINT32_MAX * sizeof(atomic_u32)

which changes the middle "*" to "+". But still, UINT32_MAX may not be a good bound.
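
For illustration, one possible shape of the amended assertion with the wrap-around also addressed (the explicit u64 casts and the assertion message are assumptions, and, as noted above, UINT32_MAX is likely still not the right bound):

    // Hypothetical form: 64-bit arithmetic avoids the wrap-around, and "+"
    // replaces the middle "*" since the depot table and the ring are two
    // separate arrays rather than a product.
    static_assert(sizeof(StackDepot) +
                          static_cast<u64>(UINT32_MAX) * sizeof(atomic_u64) +
                          static_cast<u64>(UINT32_MAX) * sizeof(atomic_u32) <
                      UINTPTR_MAX,
                  "maximal StackDepot does not fit in uptr");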

Comment on lines +126 to +127
bool __scudo_resize_ring_buffer(int);

Contributor:

I vaguely remember that we had a discussion over email about how we would implement this. I think it'll be set through mallopt, right? Besides, I'm not sure we have agreed to implement it like this. For example, could we just grow the buffer when it meets a certain condition? Or could we use an environment variable to set the size before the debugged app starts?

fmayer (Author):

Yes, this will be used in mallopt, but we need to do additional work in libc (see the AOSP draft CL linked in the description).
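
For context, a hypothetical sketch of the call surface this patch exposes (only __scudo_resize_ring_buffer is added here; the mallopt plumbing lives in the AOSP CL, and the handler name below is illustrative):

    #include "scudo/interface.h"

    // Illustrative libc-side handler: forward a hypothetical mallopt option
    // to scudo. Returns false if the requested size is negative or too
    // large; a size of 0 discards the ring buffer entirely.
    bool handle_resize_ring_buffer(int new_size) {
      return __scudo_resize_ring_buffer(new_size);
    }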

Comment on lines +1544 to +1547
if (RingBufferSize == 0) {
swapOutRingBuffer(nullptr);
return true;
}
Contributor:

Now I feel like we may want to emit a hint when TrackAllocationStacks is set but the ring buffer is nullptr.

fmayer added a commit to fmayer/llvm-project that referenced this pull request Apr 11, 2024
@fmayer (Author) commented Apr 19, 2024

We decided to not pursue this further.

fmayer closed this on Apr 19, 2024