[scudo][standalone] Compact pointers for Caches/Batches
This CL introduces configuration options that allow pointers to be
compacted in the thread-specific caches and transfer batches. This makes
it possible for them to use 32 bits of space instead of 64 bits with the
64-bit Primary, cutting the size of the caches and batches nearly in half
(and with it the memory used by size class 0). The cost is an additional
read from the region information on the fast path.
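
The idea, in isolation: a compact pointer is the pointer's offset from its
region base, shifted down by a power-of-2 scale. A minimal sketch of the
arithmetic outside of scudo (Base, Scale, and the function names here are
illustrative, not the scudo API):

#include <cstdint>

using uptr = uintptr_t;
using u32 = uint32_t;

// Encode: store the scaled offset of Ptr within its region. This fits in
// 32 bits as long as the region spans at most 2^(32 + Scale) bytes.
inline u32 compact(uptr Base, uptr Ptr, uptr Scale) {
  return static_cast<u32>((Ptr - Base) >> Scale);
}

// Decode: Ptr = Base + (CompactPtr << Scale), the formula quoted in the
// allocator_config.h comment below.
inline uptr decompact(uptr Base, u32 CompactPtr, uptr Scale) {
  return Base + (static_cast<uptr>(CompactPtr) << Scale);
}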

This is not a new idea: it is already used in the sanitizer_common 64-bit
primary. The difference here is that it is configurable via the allocator
config, with the possibility of not compacting at all.

This CL enables compacting pointers in the Android and Fuchsia default
configurations.

Differential Revision: https://reviews.llvm.org/D96435
Kostya Kortchinsky committed Feb 25, 2021
1 parent ec4408a commit 2c56776
Showing 9 changed files with 177 additions and 74 deletions.
42 changes: 41 additions & 1 deletion compiler-rt/lib/scudo/standalone/allocator_config.h
@@ -21,6 +21,35 @@

namespace scudo {

+ // The combined allocator uses a structure as a template argument that
+ // specifies the configuration options for the various subcomponents of the
+ // allocator.
+ //
+ // struct ExampleConfig {
+ //   // SizeClassMap to use with the Primary.
+ //   using SizeClassMap = DefaultSizeClassMap;
+ //   // Indicates possible support for Memory Tagging.
+ //   static const bool MaySupportMemoryTagging = false;
+ //   // Defines the Primary allocator to use.
+ //   typedef SizeClassAllocator64<ExampleConfig> Primary;
+ //   // Log2 of the size of a size class region, as used by the Primary.
+ //   static const uptr PrimaryRegionSizeLog = 30U;
+ //   // Defines the type and scale of a compact pointer. A compact pointer can
+ //   // be understood as the offset of a pointer within the region it belongs
+ //   // to, in increments of a power-of-2 scale.
+ //   // eg: Ptr = Base + (CompactPtr << Scale).
+ //   typedef u32 PrimaryCompactPtrT;
+ //   static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
+ //   // Defines the minimal & maximal release interval that can be set.
+ //   static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
+ //   static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
+ //   // Defines the type of cache used by the Secondary. Some additional
+ //   // configuration entries can be necessary depending on the Cache.
+ //   typedef MapAllocatorNoCache SecondaryCache;
+ //   // Thread-Specific Data Registry used, shared or exclusive.
+ //   template <class A> using TSDRegistryT = TSDRegistrySharedT<A, 8U, 4U>;
+ // };

// Default configurations for various platforms.

struct DefaultConfig {
@@ -29,10 +58,13 @@ struct DefaultConfig {

#if SCUDO_CAN_USE_PRIMARY64
typedef SizeClassAllocator64<DefaultConfig> Primary;
- static const uptr PrimaryRegionSizeLog = 30U;
+ static const uptr PrimaryRegionSizeLog = 32U;
+ typedef uptr PrimaryCompactPtrT;
+ static const uptr PrimaryCompactPtrScale = 0;
#else
typedef SizeClassAllocator32<DefaultConfig> Primary;
static const uptr PrimaryRegionSizeLog = 19U;
+ typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;
@@ -55,9 +87,12 @@ struct AndroidConfig {
#if SCUDO_CAN_USE_PRIMARY64
typedef SizeClassAllocator64<AndroidConfig> Primary;
static const uptr PrimaryRegionSizeLog = 28U;
+ typedef u32 PrimaryCompactPtrT;
+ static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
#else
typedef SizeClassAllocator32<AndroidConfig> Primary;
static const uptr PrimaryRegionSizeLog = 18U;
+ typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000;
@@ -81,9 +116,12 @@ struct AndroidSvelteConfig {
#if SCUDO_CAN_USE_PRIMARY64
typedef SizeClassAllocator64<AndroidSvelteConfig> Primary;
static const uptr PrimaryRegionSizeLog = 27U;
+ typedef u32 PrimaryCompactPtrT;
+ static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
#else
typedef SizeClassAllocator32<AndroidSvelteConfig> Primary;
static const uptr PrimaryRegionSizeLog = 16U;
+ typedef uptr PrimaryCompactPtrT;
#endif
static const s32 PrimaryMinReleaseToOsIntervalMs = 1000;
static const s32 PrimaryMaxReleaseToOsIntervalMs = 1000;
@@ -107,6 +145,8 @@ struct FuchsiaConfig {

typedef SizeClassAllocator64<FuchsiaConfig> Primary;
static const uptr PrimaryRegionSizeLog = 30U;
+ typedef u32 PrimaryCompactPtrT;
+ static const uptr PrimaryCompactPtrScale = SCUDO_MIN_ALIGNMENT_LOG;
static const s32 PrimaryMinReleaseToOsIntervalMs = INT32_MIN;
static const s32 PrimaryMaxReleaseToOsIntervalMs = INT32_MAX;

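A quick way to see why u32 suffices for these configs: with
PrimaryRegionSizeLog = 28 and a scale of SCUDO_MIN_ALIGNMENT_LOG
(typically 4 on 64-bit targets), offsets within a region need at most
28 - 4 = 24 bits. A standalone sanity check, using assumed constants
rather than the real config headers:

// Hypothetical check, not part of the commit: the scaled region offset
// must fit in the 32-bit compact pointer type.
constexpr unsigned RegionSizeLog = 28;  // AndroidConfig value above
constexpr unsigned CompactPtrScale = 4; // assumed SCUDO_MIN_ALIGNMENT_LOG
static_assert(RegionSizeLog - CompactPtrScale <= 32,
              "compact pointer cannot address the whole region");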
30 changes: 15 additions & 15 deletions compiler-rt/lib/scudo/standalone/local_cache.h
@@ -17,24 +17,25 @@ namespace scudo {

template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
+ typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;

struct TransferBatch {
static const u32 MaxNumCached = SizeClassMap::MaxNumCachedHint;
- void setFromArray(void **Array, u32 N) {
+ void setFromArray(CompactPtrT *Array, u32 N) {
DCHECK_LE(N, MaxNumCached);
Count = N;
- memcpy(Batch, Array, sizeof(void *) * Count);
+ memcpy(Batch, Array, sizeof(Batch[0]) * Count);
}
void clear() { Count = 0; }
- void add(void *P) {
+ void add(CompactPtrT P) {
DCHECK_LT(Count, MaxNumCached);
Batch[Count++] = P;
}
- void copyToArray(void **Array) const {
-   memcpy(Array, Batch, sizeof(void *) * Count);
+ void copyToArray(CompactPtrT *Array) const {
+   memcpy(Array, Batch, sizeof(Batch[0]) * Count);
}
u32 getCount() const { return Count; }
- void *get(u32 I) const {
+ CompactPtrT get(u32 I) const {
DCHECK_LE(I, Count);
return Batch[I];
}
@@ -45,7 +46,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {

private:
u32 Count;
- void *Batch[MaxNumCached];
+ CompactPtrT Batch[MaxNumCached];
};

void initLinkerInitialized(GlobalStats *S, SizeClassAllocator *A) {
@@ -78,13 +79,10 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
// Count, while Chunks might be further off (depending on Count). That keeps
// the memory accesses in close quarters.
const uptr ClassSize = C->ClassSize;
- void *P = C->Chunks[--C->Count];
- // The jury is still out as to whether any kind of PREFETCH here increases
- // performance. It definitely decreases performance on Android though.
- // if (!SCUDO_ANDROID) PREFETCH(P);
+ CompactPtrT CompactP = C->Chunks[--C->Count];
Stats.add(StatAllocated, ClassSize);
Stats.sub(StatFree, ClassSize);
- return P;
+ return Allocator->decompactPtr(ClassId, CompactP);
}

void deallocate(uptr ClassId, void *P) {
@@ -97,7 +95,8 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
drain(C, ClassId);
// See comment in allocate() about memory accesses.
const uptr ClassSize = C->ClassSize;
- C->Chunks[C->Count++] = P;
+ C->Chunks[C->Count++] =
+     Allocator->compactPtr(ClassId, reinterpret_cast<uptr>(P));
Stats.sub(StatAllocated, ClassSize);
Stats.add(StatFree, ClassSize);
}
@@ -124,7 +123,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
u32 Count;
u32 MaxCount;
uptr ClassSize;
- void *Chunks[2 * TransferBatch::MaxNumCached];
+ CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
};
PerClass PerClassArray[NumClasses];
LocalStats Stats;
@@ -166,7 +165,8 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {

NOINLINE void drain(PerClass *C, uptr ClassId) {
const u32 Count = Min(C->MaxCount / 2, C->Count);
- TransferBatch *B = createBatch(ClassId, C->Chunks[0]);
+ TransferBatch *B =
+     createBatch(ClassId, Allocator->decompactPtr(ClassId, C->Chunks[0]));
if (UNLIKELY(!B))
reportOutOfMemory(
SizeClassAllocator::getSizeByClassId(SizeClassMap::BatchClassId));
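The space saving in TransferBatch and PerClass is direct: the arrays now
hold CompactPtrT instead of void *. A toy illustration of the halving,
using an assumed MaxNumCached value rather than the real
SizeClassMap::MaxNumCachedHint:

#include <cstdint>
#include <cstdio>

int main() {
  constexpr unsigned MaxNumCached = 14; // illustrative only
  // On a 64-bit target: 112 bytes per batch with raw pointers...
  printf("raw:     %zu bytes\n", sizeof(void *) * MaxNumCached);
  // ...versus 56 bytes with 32-bit compact pointers.
  printf("compact: %zu bytes\n", sizeof(uint32_t) * MaxNumCached);
}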
27 changes: 20 additions & 7 deletions compiler-rt/lib/scudo/standalone/primary32.h
@@ -41,6 +41,7 @@ namespace scudo {

template <typename Config> class SizeClassAllocator32 {
public:
+ typedef typename Config::PrimaryCompactPtrT CompactPtrT;
typedef typename Config::SizeClassMap SizeClassMap;
// The bytemap can only track UINT8_MAX - 1 classes.
static_assert(SizeClassMap::LargestClassId <= (UINT8_MAX - 1), "");
@@ -67,7 +68,7 @@ template <typename Config> class SizeClassAllocator32 {

u32 Seed;
const u64 Time = getMonotonicTime();
- if (UNLIKELY(!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed))))
+ if (!getRandom(reinterpret_cast<void *>(&Seed), sizeof(Seed)))
Seed = static_cast<u32>(
Time ^ (reinterpret_cast<uptr>(SizeClassInfoArray) >> 6));
for (uptr I = 0; I < NumClasses; I++) {
@@ -102,6 +103,14 @@ template <typename Config> class SizeClassAllocator32 {
PossibleRegions.unmapTestOnly();
}

+ CompactPtrT compactPtr(UNUSED uptr ClassId, uptr Ptr) const {
+   return static_cast<CompactPtrT>(Ptr);
+ }
+
+ void *decompactPtr(UNUSED uptr ClassId, CompactPtrT CompactPtr) const {
+   return reinterpret_cast<void *>(static_cast<uptr>(CompactPtr));
+ }

TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
@@ -359,17 +368,18 @@ template <typename Config> class SizeClassAllocator32 {
// Fill the transfer batches and put them in the size-class freelist. We
// need to randomize the blocks for security purposes, so we first fill a
// local array that we then shuffle before populating the batches.
- void *ShuffleArray[ShuffleArraySize];
+ CompactPtrT ShuffleArray[ShuffleArraySize];
DCHECK_LE(NumberOfBlocks, ShuffleArraySize);

uptr P = Region + Offset;
for (u32 I = 0; I < NumberOfBlocks; I++, P += Size)
- ShuffleArray[I] = reinterpret_cast<void *>(P);
+ ShuffleArray[I] = reinterpret_cast<CompactPtrT>(P);
// No need to shuffle the batches size class.
if (ClassId != SizeClassMap::BatchClassId)
shuffle(ShuffleArray, NumberOfBlocks, &Sci->RandState);
for (u32 I = 0; I < NumberOfBlocks;) {
- TransferBatch *B = C->createBatch(ClassId, ShuffleArray[I]);
+ TransferBatch *B =
+     C->createBatch(ClassId, reinterpret_cast<void *>(ShuffleArray[I]));
if (UNLIKELY(!B))
return nullptr;
const u32 N = Min(MaxCount, NumberOfBlocks - I);
@@ -435,7 +445,7 @@ template <typename Config> class SizeClassAllocator32 {
if (BlockSize < PageSize / 16U) {
if (!Force && BytesPushed < Sci->AllocatedUser / 16U)
return 0;
- // We want 8x% to 9x% free bytes (the larger the bock, the lower the %).
+ // We want 8x% to 9x% free bytes (the larger the block, the lower the %).
if ((BytesInFreeList * 100U) / Sci->AllocatedUser <
(100U - 1U - BlockSize / 16U))
return 0;
@@ -463,8 +473,11 @@ template <typename Config> class SizeClassAllocator32 {
auto SkipRegion = [this, First, ClassId](uptr RegionIndex) {
return (PossibleRegions[First + RegionIndex] - 1U) != ClassId;
};
- releaseFreeMemoryToOS(Sci->FreeList, Base, RegionSize, NumberOfRegions,
-     BlockSize, &Recorder, SkipRegion);
+ auto DecompactPtr = [](CompactPtrT CompactPtr) {
+   return reinterpret_cast<uptr>(CompactPtr);
+ };
+ releaseFreeMemoryToOS(Sci->FreeList, RegionSize, NumberOfRegions, BlockSize,
+     &Recorder, DecompactPtr, SkipRegion);
if (Recorder.getReleasedRangesCount() > 0) {
Sci->ReleaseInfo.PushedBlocksAtLastRelease = Sci->Stats.PushedBlocks;
Sci->ReleaseInfo.RangesReleased += Recorder.getReleasedRangesCount();
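For SizeClassAllocator32 the transformation is the identity, as the
compactPtr/decompactPtr definitions above show: blocks already fit in a
32-bit uptr, so its configs keep PrimaryCompactPtrT = uptr with no scale.
What the release path gains is the DecompactPtr callback, which lets the
shared release code stay agnostic of the pointer encoding; the 64-bit
primary (whose diff is not shown in this excerpt) supplies a scaled
decoding instead. A minimal sketch of that callback pattern, with generic
names rather than the scudo release.h interface:

#include <cstdint>
#include <vector>

using uptr = uintptr_t;

// The walker only ever sees decoded addresses; each primary supplies its
// own DecompactPtr: the identity for the 32-bit primary, or
// Base + (CompactPtr << Scale) for the 64-bit one.
template <typename CompactPtrT, typename Decompact, typename Visit>
void forEachFreeBlock(const std::vector<CompactPtrT> &FreeList,
                      Decompact DecompactPtr, Visit V) {
  for (CompactPtrT P : FreeList)
    V(DecompactPtr(P));
}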
