-
Notifications
You must be signed in to change notification settings - Fork 12k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[scudo] Make local cache be agnostic to the type of node in freelist #67379
Conversation
@llvm/pr-subscribers-compiler-rt-sanitizer ChangesThis change moves the That means we will be able to store more blocks in each node of freelist and therefore reduce the memory used by BatchClass (with little performance overhead). Note that we haven't enabled that in this patch. This is the first step of this transition. Full diff: https://github.com/llvm/llvm-project/pull/67379.diff 6 Files Affected:
diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
index 2b7a613066b83b3..c4d3ea1e4f05ba8 100644
--- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt
+++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt
@@ -56,6 +56,7 @@ if(ANDROID)
endif()
set(SCUDO_HEADERS
+ allocator_common.h
allocator_config.h
atomic_helpers.h
bytemap.h
diff --git a/compiler-rt/lib/scudo/standalone/allocator_common.h b/compiler-rt/lib/scudo/standalone/allocator_common.h
new file mode 100644
index 000000000000000..95f4776ac596dc0
--- /dev/null
+++ b/compiler-rt/lib/scudo/standalone/allocator_common.h
@@ -0,0 +1,85 @@
+//===-- allocator_common.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SCUDO_ALLOCATOR_COMMON_H_
+#define SCUDO_ALLOCATOR_COMMON_H_
+
+#include "common.h"
+#include "list.h"
+
+namespace scudo {
+
+template <class SizeClassAllocator> struct TransferBatch {
+ typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
+ typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
+
+ static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
+ void setFromArray(CompactPtrT *Array, u16 N) {
+ DCHECK_LE(N, MaxNumCached);
+ Count = N;
+ memcpy(Batch, Array, sizeof(Batch[0]) * Count);
+ }
+ void appendFromArray(CompactPtrT *Array, u16 N) {
+ DCHECK_LE(N, MaxNumCached - Count);
+ memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
+ // u16 will be promoted to int by arithmetic type conversion.
+ Count = static_cast<u16>(Count + N);
+ }
+ void appendFromTransferBatch(TransferBatch *B, u16 N) {
+ DCHECK_LE(N, MaxNumCached - Count);
+ DCHECK_GE(B->Count, N);
+ // Append from the back of `B`.
+ memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
+ // u16 will be promoted to int by arithmetic type conversion.
+ Count = static_cast<u16>(Count + N);
+ B->Count = static_cast<u16>(B->Count - N);
+ }
+ void clear() { Count = 0; }
+ void add(CompactPtrT P) {
+ DCHECK_LT(Count, MaxNumCached);
+ Batch[Count++] = P;
+ }
+ void moveToArray(CompactPtrT *Array) {
+ memcpy(Array, Batch, sizeof(Batch[0]) * Count);
+ clear();
+ }
+ u16 getCount() const { return Count; }
+ bool isEmpty() const { return Count == 0U; }
+ CompactPtrT get(u16 I) const {
+ DCHECK_LE(I, Count);
+ return Batch[I];
+ }
+ TransferBatch *Next;
+
+private:
+ CompactPtrT Batch[MaxNumCached];
+ u16 Count;
+};
+
+// A BatchGroup is used to collect blocks. Each group has a group id to
+// identify the group kind of contained blocks.
+template <class SizeClassAllocator> struct BatchGroup {
+ // `Next` is used by IntrusiveList.
+ BatchGroup *Next;
+ // The compact base address of each group
+ uptr CompactPtrGroupBase;
+ // Cache value of SizeClassAllocatorLocalCache::getMaxCached()
+ u16 MaxCachedPerBatch;
+ // Number of blocks pushed into this group. This is an increment-only
+ // counter.
+ uptr PushedBlocks;
+ // This is used to track how many bytes are not in-use since last time we
+ // tried to release pages.
+ uptr BytesInBGAtLastCheckpoint;
+ // Blocks are managed by TransferBatch in a list.
+ SinglyLinkedList<TransferBatch<SizeClassAllocator>> Batches;
+};
+
+} // namespace scudo
+
+#endif // SCUDO_ALLOCATOR_COMMON_H_
diff --git a/compiler-rt/lib/scudo/standalone/local_cache.h b/compiler-rt/lib/scudo/standalone/local_cache.h
index 1095eb5f186d1e1..8cb02abd16fd750 100644
--- a/compiler-rt/lib/scudo/standalone/local_cache.h
+++ b/compiler-rt/lib/scudo/standalone/local_cache.h
@@ -22,74 +22,6 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
typedef typename SizeClassAllocator::SizeClassMap SizeClassMap;
typedef typename SizeClassAllocator::CompactPtrT CompactPtrT;
- struct TransferBatch {
- static const u16 MaxNumCached = SizeClassMap::MaxNumCachedHint;
- void setFromArray(CompactPtrT *Array, u16 N) {
- DCHECK_LE(N, MaxNumCached);
- Count = N;
- memcpy(Batch, Array, sizeof(Batch[0]) * Count);
- }
- void appendFromArray(CompactPtrT *Array, u16 N) {
- DCHECK_LE(N, MaxNumCached - Count);
- memcpy(Batch + Count, Array, sizeof(Batch[0]) * N);
- // u16 will be promoted to int by arithmetic type conversion.
- Count = static_cast<u16>(Count + N);
- }
- void appendFromTransferBatch(TransferBatch *B, u16 N) {
- DCHECK_LE(N, MaxNumCached - Count);
- DCHECK_GE(B->Count, N);
- // Append from the back of `B`.
- memcpy(Batch + Count, B->Batch + (B->Count - N), sizeof(Batch[0]) * N);
- // u16 will be promoted to int by arithmetic type conversion.
- Count = static_cast<u16>(Count + N);
- B->Count = static_cast<u16>(B->Count - N);
- }
- void clear() { Count = 0; }
- void add(CompactPtrT P) {
- DCHECK_LT(Count, MaxNumCached);
- Batch[Count++] = P;
- }
- void copyToArray(CompactPtrT *Array) const {
- memcpy(Array, Batch, sizeof(Batch[0]) * Count);
- }
- u16 getCount() const { return Count; }
- bool isEmpty() const { return Count == 0U; }
- CompactPtrT get(u16 I) const {
- DCHECK_LE(I, Count);
- return Batch[I];
- }
- static u16 getMaxCached(uptr Size) {
- return Min(MaxNumCached, SizeClassMap::getMaxCachedHint(Size));
- }
- TransferBatch *Next;
-
- private:
- CompactPtrT Batch[MaxNumCached];
- u16 Count;
- };
-
- // A BatchGroup is used to collect blocks. Each group has a group id to
- // identify the group kind of contained blocks.
- struct BatchGroup {
- // `Next` is used by IntrusiveList.
- BatchGroup *Next;
- // The compact base address of each group
- uptr CompactPtrGroupBase;
- // Cache value of TransferBatch::getMaxCached()
- u16 MaxCachedPerBatch;
- // Number of blocks pushed into this group. This is an increment-only
- // counter.
- uptr PushedBlocks;
- // This is used to track how many bytes are not in-use since last time we
- // tried to release pages.
- uptr BytesInBGAtLastCheckpoint;
- // Blocks are managed by TransferBatch in a list.
- SinglyLinkedList<TransferBatch> Batches;
- };
-
- static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
- "BatchGroup uses the same class size as TransferBatch");
-
void init(GlobalStats *S, SizeClassAllocator *A) {
DCHECK(isEmpty());
Stats.init();
@@ -151,7 +83,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
}
void drain() {
- // Drain BatchClassId last as createBatch can refill it.
+ // Drain BatchClassId last as it may be needed while draining normal blocks.
for (uptr I = 0; I < NumClasses; ++I) {
if (I == BatchClassId)
continue;
@@ -163,19 +95,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
DCHECK(isEmpty());
}
- TransferBatch *createBatch(uptr ClassId, void *B) {
- if (ClassId != BatchClassId)
- B = allocate(BatchClassId);
+ void *getBatchClassBlock() {
+ void *B = allocate(BatchClassId);
if (UNLIKELY(!B))
reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
- return reinterpret_cast<TransferBatch *>(B);
- }
-
- BatchGroup *createGroup() {
- void *Ptr = allocate(BatchClassId);
- if (UNLIKELY(!Ptr))
- reportOutOfMemory(SizeClassAllocator::getSizeByClassId(BatchClassId));
- return reinterpret_cast<BatchGroup *>(Ptr);
+ return B;
}
LocalStats &getStats() { return Stats; }
@@ -203,6 +127,11 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
Str->append(" No block is cached.\n");
}
+ static u16 getMaxCached(uptr Size) {
+ return Min(SizeClassMap::MaxNumCachedHint,
+ SizeClassMap::getMaxCachedHint(Size));
+ }
+
private:
static const uptr NumClasses = SizeClassMap::NumClasses;
static const uptr BatchClassId = SizeClassMap::BatchClassId;
@@ -211,7 +140,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
u16 MaxCount;
// Note: ClassSize is zero for the transfer batch.
uptr ClassSize;
- CompactPtrT Chunks[2 * TransferBatch::MaxNumCached];
+ CompactPtrT Chunks[2 * SizeClassMap::MaxNumCachedHint];
};
PerClass PerClassArray[NumClasses] = {};
LocalStats Stats;
@@ -228,7 +157,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
for (uptr I = 0; I < NumClasses; I++) {
PerClass *P = &PerClassArray[I];
const uptr Size = SizeClassAllocator::getSizeByClassId(I);
- P->MaxCount = static_cast<u16>(2 * TransferBatch::getMaxCached(Size));
+ P->MaxCount = static_cast<u16>(2 * getMaxCached(Size));
if (I != BatchClassId) {
P->ClassSize = Size;
} else {
@@ -246,15 +175,14 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
NOINLINE bool refill(PerClass *C, uptr ClassId) {
initCacheMaybe(C);
- TransferBatch *B = Allocator->popBatch(this, ClassId);
- if (UNLIKELY(!B))
- return false;
- DCHECK_GT(B->getCount(), 0);
- C->Count = B->getCount();
- B->copyToArray(C->Chunks);
- B->clear();
- destroyBatch(ClassId, B);
- return true;
+
+ // TODO(chiahungduan): Pass the max number cached for each size class.
+ const u16 NumBlocksRefilled =
+ Allocator->popBlocks(this, ClassId, C->Chunks);
+ DCHECK_LE(NumBlocksRefilled,
+ getMaxCached(SizeClassAllocator::getSizeByClassId(ClassId)));
+ C->Count += NumBlocksRefilled;
+ return NumBlocksRefilled != 0;
}
NOINLINE void drain(PerClass *C, uptr ClassId) {
diff --git a/compiler-rt/lib/scudo/standalone/primary32.h b/compiler-rt/lib/scudo/standalone/primary32.h
index 533615ad3816d09..ed597e64dadd15e 100644
--- a/compiler-rt/lib/scudo/standalone/primary32.h
+++ b/compiler-rt/lib/scudo/standalone/primary32.h
@@ -9,6 +9,7 @@
#ifndef SCUDO_PRIMARY32_H_
#define SCUDO_PRIMARY32_H_
+#include "allocator_common.h"
#include "bytemap.h"
#include "common.h"
#include "list.h"
@@ -53,8 +54,11 @@ template <typename Config> class SizeClassAllocator32 {
"");
typedef SizeClassAllocator32<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
- typedef typename CacheT::TransferBatch TransferBatch;
- typedef typename CacheT::BatchGroup BatchGroup;
+ typedef TransferBatch<ThisT> TransferBatch;
+ typedef BatchGroup<ThisT> BatchGroup;
+
+ static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+ "BatchGroup uses the same class size as TransferBatch");
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
@@ -187,6 +191,20 @@ template <typename Config> class SizeClassAllocator32 {
return BlockSize > PageSize;
}
+ u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray) {
+ TransferBatch *B = popBatch(C, ClassId);
+ if (!B)
+ return 0;
+
+ const u16 Count = B->getCount();
+ B->moveToArray(ToArray);
+
+ if (ClassId != SizeClassMap::BatchClassId)
+ C->deallocate(SizeClassMap::BatchClassId, B);
+
+ return Count;
+ }
+
TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
@@ -520,8 +538,8 @@ template <typename Config> class SizeClassAllocator32 {
// from `CreateGroup` in `pushBlocksImpl`
BG->PushedBlocks = 1;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
- getSizeByClassId(SizeClassMap::BatchClassId));
+ BG->MaxCachedPerBatch =
+ CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
Sci->FreeListInfo.BlockList.push_front(BG);
}
@@ -600,17 +618,17 @@ template <typename Config> class SizeClassAllocator32 {
DCHECK_GT(Size, 0U);
auto CreateGroup = [&](uptr CompactPtrGroupBase) {
- BatchGroup *BG = C->createGroup();
+ BatchGroup *BG = reinterpret_cast<BatchGroup *>(C->getBatchClassBlock());
BG->Batches.clear();
- TransferBatch *TB = C->createBatch(ClassId, nullptr);
+ TransferBatch *TB =
+ reinterpret_cast<TransferBatch *>(C->getBatchClassBlock());
TB->clear();
BG->CompactPtrGroupBase = CompactPtrGroupBase;
BG->Batches.push_front(TB);
BG->PushedBlocks = 0;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch =
- TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+ BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
@@ -625,9 +643,7 @@ template <typename Config> class SizeClassAllocator32 {
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = C->createBatch(
- ClassId,
- reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+ CurBatch = reinterpret_cast<TransferBatch *>(C->getBatchClassBlock());
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
@@ -775,7 +791,7 @@ template <typename Config> class SizeClassAllocator32 {
}
const uptr Size = getSizeByClassId(ClassId);
- const u16 MaxCount = TransferBatch::getMaxCached(Size);
+ const u16 MaxCount = CacheT::getMaxCached(Size);
DCHECK_GT(MaxCount, 0U);
// The maximum number of blocks we should carve in the region is dictated
// by the maximum number of batches we want to fill, and the amount of
diff --git a/compiler-rt/lib/scudo/standalone/primary64.h b/compiler-rt/lib/scudo/standalone/primary64.h
index ed0c4deaac2c8bf..ca9a4ce70e04332 100644
--- a/compiler-rt/lib/scudo/standalone/primary64.h
+++ b/compiler-rt/lib/scudo/standalone/primary64.h
@@ -9,6 +9,7 @@
#ifndef SCUDO_PRIMARY64_H_
#define SCUDO_PRIMARY64_H_
+#include "allocator_common.h"
#include "bytemap.h"
#include "common.h"
#include "list.h"
@@ -55,8 +56,11 @@ template <typename Config> class SizeClassAllocator64 {
static const uptr GroupScale = GroupSizeLog - CompactPtrScale;
typedef SizeClassAllocator64<Config> ThisT;
typedef SizeClassAllocatorLocalCache<ThisT> CacheT;
- typedef typename CacheT::TransferBatch TransferBatch;
- typedef typename CacheT::BatchGroup BatchGroup;
+ typedef TransferBatch<ThisT> TransferBatch;
+ typedef BatchGroup<ThisT> BatchGroup;
+
+ static_assert(sizeof(BatchGroup) <= sizeof(TransferBatch),
+ "BatchGroup uses the same class size as TransferBatch");
static uptr getSizeByClassId(uptr ClassId) {
return (ClassId == SizeClassMap::BatchClassId)
@@ -203,6 +207,20 @@ template <typename Config> class SizeClassAllocator64 {
DCHECK_EQ(BlocksInUse, BatchClassUsedInFreeLists);
}
+ u16 popBlocks(CacheT *C, uptr ClassId, CompactPtrT *ToArray) {
+ TransferBatch *B = popBatch(C, ClassId);
+ if (!B)
+ return 0;
+
+ const u16 Count = B->getCount();
+ B->moveToArray(ToArray);
+
+ if (ClassId != SizeClassMap::BatchClassId)
+ C->deallocate(SizeClassMap::BatchClassId, B);
+
+ return Count;
+ }
+
TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
RegionInfo *Region = getRegionInfo(ClassId);
@@ -636,8 +654,8 @@ template <typename Config> class SizeClassAllocator64 {
// from `CreateGroup` in `pushBlocksImpl`
BG->PushedBlocks = 1;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch = TransferBatch::getMaxCached(
- getSizeByClassId(SizeClassMap::BatchClassId));
+ BG->MaxCachedPerBatch =
+ CacheT::getMaxCached(getSizeByClassId(SizeClassMap::BatchClassId));
Region->FreeListInfo.BlockList.push_front(BG);
}
@@ -715,17 +733,17 @@ template <typename Config> class SizeClassAllocator64 {
DCHECK_GT(Size, 0U);
auto CreateGroup = [&](uptr CompactPtrGroupBase) {
- BatchGroup *BG = C->createGroup();
+ BatchGroup *BG = reinterpret_cast<BatchGroup *>(C->getBatchClassBlock());
BG->Batches.clear();
- TransferBatch *TB = C->createBatch(ClassId, nullptr);
+ TransferBatch *TB =
+ reinterpret_cast<TransferBatch *>(C->getBatchClassBlock());
TB->clear();
BG->CompactPtrGroupBase = CompactPtrGroupBase;
BG->Batches.push_front(TB);
BG->PushedBlocks = 0;
BG->BytesInBGAtLastCheckpoint = 0;
- BG->MaxCachedPerBatch =
- TransferBatch::getMaxCached(getSizeByClassId(ClassId));
+ BG->MaxCachedPerBatch = CacheT::getMaxCached(getSizeByClassId(ClassId));
return BG;
};
@@ -740,9 +758,7 @@ template <typename Config> class SizeClassAllocator64 {
u16 UnusedSlots =
static_cast<u16>(BG->MaxCachedPerBatch - CurBatch->getCount());
if (UnusedSlots == 0) {
- CurBatch = C->createBatch(
- ClassId,
- reinterpret_cast<void *>(decompactPtr(ClassId, Array[I])));
+ CurBatch = reinterpret_cast<TransferBatch *>(C->getBatchClassBlock());
CurBatch->clear();
Batches.push_front(CurBatch);
UnusedSlots = BG->MaxCachedPerBatch;
@@ -873,7 +889,7 @@ template <typename Config> class SizeClassAllocator64 {
RegionInfo *Region)
REQUIRES(Region->MMLock) EXCLUDES(Region->FLLock) {
const uptr Size = getSizeByClassId(ClassId);
- const u16 MaxCount = TransferBatch::getMaxCached(Size);
+ const u16 MaxCount = CacheT::getMaxCached(Size);
const uptr RegionBeg = Region->RegionBeg;
const uptr MappedUser = Region->MemMapInfo.MappedUser;
diff --git a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
index 074977ff27e65f6..e368f521bea71a2 100644
--- a/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/primary_test.cpp
@@ -207,7 +207,7 @@ struct SmallRegionsConfig {
// For the 32-bit one, it requires actually exhausting memory, so we skip it.
TEST(ScudoPrimaryTest, Primary64OOM) {
using Primary = scudo::SizeClassAllocator64<SmallRegionsConfig>;
- using TransferBatch = Primary::CacheT::TransferBatch;
+ using TransferBatch = Primary::TransferBatch;
Primary Allocator;
Allocator.init(/*ReleaseToOsInterval=*/-1);
typename Primary::CacheT Cache;
@@ -233,8 +233,9 @@ TEST(ScudoPrimaryTest, Primary64OOM) {
while (!Batches.empty()) {
TransferBatch *B = Batches.back();
Batches.pop_back();
- B->copyToArray(Blocks);
- Allocator.pushBlocks(&Cache, ClassId, Blocks, B->getCount());
+ const scudo::u16 Count = B->getCount();
+ B->moveToArray(Blocks);
+ Allocator.pushBlocks(&Cache, ClassId, Blocks, Count);
Cache.deallocate(Primary::SizeClassMap::BatchClassId, B);
}
Cache.destroy(nullptr);
|
Add one note, The only change in
BatchGroup is the same as before (except one comment update) |
This change moves the `TransferBatch` and `BatchGroup` out of SizeClassAllocatorLocalCache. It allows us that the node in freelist can store more blocks instead of depending on the number of blocks cached. That means we will be able to store more blocks in each node of freelist and therefore reduce the memory used by BatchClass (with little performance overhead). Note that we haven't enabled that in this patch. This is the first step of this transition.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
One very small nit.
if (!B) | ||
return 0; | ||
|
||
const u16 Count = B->getCount(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There used to be a DCHECK that count was not zero. Should that DCHECK still be here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice catch!
5bc669b
to
1dad033
Compare
…reelist (llvm#67379)" This reverts commit b9c6737.
This change moves the
TransferBatch
andBatchGroup
out of SizeClassAllocatorLocalCache. It allows us that the node in freelist can store more blocks instead of depending on the number of blocks cached.That means we will be able to store more blocks in each node of freelist and therefore reduce the memory used by BatchClass (with little performance overhead). Note that we haven't enabled that in this patch. This is the first step of this transition.