
[scudo][standalone] Optimization pass

Summary:
This introduces a bunch of small optimizations with the purpose of
making the fastpath tighter:
- tag more conditions as `LIKELY`/`UNLIKELY`: as a rule of thumb we
  consider that every operation related to the secondary is unlikely
- attempt to reduce the number of potentially extraneous instructions
- reorganize the `Chunk` header to not straddle a word boundary and
  use more appropriate types
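
To make the last bullet concrete, here is a minimal, self-contained sketch of the
reorganized header layout, using standard integer typedefs as stand-ins for scudo's
`uptr`/`u8`/`u64` and assuming a Clang/GCC (Itanium ABI) target; the field order
matches the new `UnpackedHeader` in the diff below:
```
#include <cstdint>

// Stand-ins for scudo's internal typedefs (assumption for this sketch only).
typedef uintptr_t uptr;
typedef uint8_t u8;
typedef uint64_t u64;

// New field order: the first four fields add up to exactly 32 bits, so no
// bitfield straddles a 32-bit or 64-bit word boundary, and the whole header
// still packs into a single 64-bit PackedHeader.
struct UnpackedHeader {
  uptr ClassId : 8;
  u8 State : 2;
  u8 Origin : 2;
  uptr SizeOrUnusedBytes : 20;
  uptr Offset : 16;
  uptr Checksum : 16;
};

static_assert(sizeof(UnpackedHeader) == sizeof(u64),
              "the header must fit in a single 64-bit word");
```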

Note that the impact of some `LIKELY`/`UNLIKELY` annotations might be less
obvious as they sit in slow paths (for example in `secondary.cc`), but at
this point I am casting a pretty wide net, and it's consistent and doesn't
hurt.
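
For readers unfamiliar with the annotations, `LIKELY`/`UNLIKELY` boil down to the
usual `__builtin_expect` branch hints; a minimal sketch (not the verbatim scudo
definitions, which live in its internal headers), with a hypothetical helper
mirroring the deallocation fast path:
```
#include <cstdio>

// Typical GCC/Clang branch-prediction hints (a sketch; scudo carries its own
// definitions in its internal headers).
#define LIKELY(X) __builtin_expect(!!(X), 1)
#define UNLIKELY(X) __builtin_expect(!!(X), 0)

// Hypothetical helper mirroring the rule of thumb above: a non-zero ClassId
// means a Primary-backed block (the common case), so that branch is marked
// LIKELY and the compiler lays the Secondary path out of line.
void deallocateBlock(unsigned ClassId, void *BlockBegin) {
  if (LIKELY(ClassId))
    std::printf("primary deallocation: class %u of block %p\n", ClassId,
                BlockBegin);
  else
    std::printf("secondary deallocation of block %p\n", BlockBegin);
}
```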

This was mostly done for the benefit of Android, but other platforms
benefit from it too. An aarch64 Android benchmark gives:
- before:
```
  BM_youtube/min_time:15.000/repeats:4/manual_time_mean              445244 us       659385 us            4
  BM_youtube/min_time:15.000/repeats:4/manual_time_median            445007 us       658970 us            4
  BM_youtube/min_time:15.000/repeats:4/manual_time_stddev               885 us         1332 us            4
```
- after:
```
  BM_youtube/min_time:15.000/repeats:4/manual_time_mean              415697 us       621925 us            4
  BM_youtube/min_time:15.000/repeats:4/manual_time_median            415913 us       622061 us            4
  BM_youtube/min_time:15.000/repeats:4/manual_time_stddev               990 us         1163 us            4
```

Additionally, since `-Werror=conversion` is enabled on some of the platforms
we are built for, enable it upstream to catch issues early: a few sign
conversions had slipped through and needed additional casting.
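
As a hypothetical illustration of the kind of error the flag now produces (the
names here are made up; the real fixes in the diff below cast flag values such as
`getFlags()->quarantine_max_chunk_size` the same way):
```
#include <cstdint>

struct Options {
  uint32_t QuarantineMaxChunkSize;
};

// Stand-in for a signed flag value such as quarantine_max_chunk_size.
int getQuarantineFlag() { return 2048; }

void applyFlags(Options &O) {
  // With -Werror=conversion (Clang), the implicit int -> uint32_t sign
  // conversion below is a hard error:
  //   O.QuarantineMaxChunkSize = getQuarantineFlag();
  // The fix is an explicit cast:
  O.QuarantineMaxChunkSize = static_cast<uint32_t>(getQuarantineFlag());
}
```
Compiling this sketch with `clang++ -Werror=conversion -c` shows the behavior.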

Reviewers: hctim, morehouse, eugenis, vitalybuka

Reviewed By: vitalybuka

Subscribers: srhines, mgorny, javed.absar, kristof.beyls, delcypher, #sanitizers, llvm-commits

Tags: #llvm, #sanitizers

Differential Revision: https://reviews.llvm.org/D64664

llvm-svn: 366918
cryptoad committed Jul 24, 2019
1 parent 5cdacea commit 419f1a4185d551594dc453b258bc4b8417edcfeb
@@ -5,6 +5,7 @@ include_directories(../..)
set(SCUDO_CFLAGS)

list(APPEND SCUDO_CFLAGS
-Werror=conversion
-Wall
-nostdinc++)

@@ -29,15 +29,15 @@ INLINE u16 computeChecksum(u32 Seed, uptr Value, uptr *Array, uptr ArraySize) {
u32 Crc = static_cast<u32>(CRC32_INTRINSIC(Seed, Value));
for (uptr I = 0; I < ArraySize; I++)
Crc = static_cast<u32>(CRC32_INTRINSIC(Crc, Array[I]));
return static_cast<u16>((Crc & 0xffff) ^ (Crc >> 16));
return static_cast<u16>(Crc ^ (Crc >> 16));
#else
if (HashAlgorithm == Checksum::HardwareCRC32) {
u32 Crc = computeHardwareCRC32(Seed, Value);
for (uptr I = 0; I < ArraySize; I++)
Crc = computeHardwareCRC32(Crc, Array[I]);
return static_cast<u16>((Crc & 0xffff) ^ (Crc >> 16));
return static_cast<u16>(Crc ^ (Crc >> 16));
} else {
u16 Checksum = computeBSDChecksum(static_cast<u16>(Seed & 0xffff), Value);
u16 Checksum = computeBSDChecksum(static_cast<u16>(Seed), Value);
for (uptr I = 0; I < ArraySize; I++)
Checksum = computeBSDChecksum(Checksum, Array[I]);
return Checksum;
@@ -63,24 +63,24 @@ enum State : u8 { Available = 0, Allocated = 1, Quarantined = 2 };
typedef u64 PackedHeader;
// Update the 'Mask' constants to reflect changes in this structure.
struct UnpackedHeader {
u64 Checksum : 16;
u64 ClassId : 8;
u64 SizeOrUnusedBytes : 20;
uptr ClassId : 8;
u8 State : 2;
u8 Origin : 2;
u64 Offset : 16;
uptr SizeOrUnusedBytes : 20;
uptr Offset : 16;
uptr Checksum : 16;
};
typedef atomic_u64 AtomicPackedHeader;
COMPILER_CHECK(sizeof(UnpackedHeader) == sizeof(PackedHeader));

// Those constants are required to silence some -Werror=conversion errors when
// assigning values to the related bitfield variables.
constexpr uptr ChecksumMask = (1UL << 16) - 1;
constexpr uptr ClassIdMask = (1UL << 8) - 1;
constexpr u8 StateMask = (1U << 2) - 1;
constexpr u8 OriginMask = (1U << 2) - 1;
constexpr uptr SizeOrUnusedBytesMask = (1UL << 20) - 1;
constexpr uptr StateMask = (1UL << 2) - 1;
constexpr uptr OriginMask = (1UL << 2) - 1;
constexpr uptr OffsetMask = (1UL << 16) - 1;
constexpr uptr ChecksumMask = (1UL << 16) - 1;

constexpr uptr getHeaderSize() {
return roundUpTo(sizeof(PackedHeader), 1U << SCUDO_MIN_ALIGNMENT_LOG);
@@ -46,8 +46,8 @@ template <class Params> class Allocator {
Chunk::compareExchangeHeader(Allocator.Cookie, Ptr, &NewHeader, &Header);

void *BlockBegin = Allocator::getBlockBegin(Ptr, &NewHeader);
const uptr ClassId = Header.ClassId;
if (ClassId)
const uptr ClassId = NewHeader.ClassId;
if (LIKELY(ClassId))
Cache.deallocate(ClassId, BlockBegin);
else
Allocator.Secondary.deallocate(BlockBegin);
@@ -123,14 +123,16 @@ template <class Params> class Allocator {
Options.ZeroContents = getFlags()->zero_contents;
Options.DeallocTypeMismatch = getFlags()->dealloc_type_mismatch;
Options.DeleteSizeMismatch = getFlags()->delete_size_mismatch;
Options.QuarantineMaxChunkSize = getFlags()->quarantine_max_chunk_size;
Options.QuarantineMaxChunkSize =
static_cast<u32>(getFlags()->quarantine_max_chunk_size);

Stats.initLinkerInitialized();
Primary.initLinkerInitialized(getFlags()->release_to_os_interval_ms);
Secondary.initLinkerInitialized(&Stats);

Quarantine.init(getFlags()->quarantine_size_kb << 10,
getFlags()->thread_local_quarantine_size_kb << 10);
Quarantine.init(
static_cast<uptr>(getFlags()->quarantine_size_kb << 10),
static_cast<uptr>(getFlags()->thread_local_quarantine_size_kb << 10));
}

void reset() { memset(this, 0, sizeof(*this)); }
@@ -165,16 +167,17 @@ template <class Params> class Allocator {
return nullptr;
reportAlignmentTooBig(Alignment, MaxAlignment);
}
if (UNLIKELY(Alignment < MinAlignment))
if (Alignment < MinAlignment)
Alignment = MinAlignment;

// If the requested size happens to be 0 (more common than you might think),
// allocate 1 byte on top of the header. Then add the extra bytes required
// to fulfill the alignment requirements: we allocate enough to be sure that
// there will be an address in the block that will satisfy the alignment.
// allocate MinAlignment bytes on top of the header. Then add the extra
// bytes required to fulfill the alignment requirements: we allocate enough
// to be sure that there will be an address in the block that will satisfy
// the alignment.
const uptr NeededSize =
Chunk::getHeaderSize() + roundUpTo(Size ? Size : 1, MinAlignment) +
((Alignment > MinAlignment) ? (Alignment - Chunk::getHeaderSize()) : 0);
roundUpTo(Size, MinAlignment) +
((Alignment > MinAlignment) ? Alignment : Chunk::getHeaderSize());

// Takes care of extravagantly large sizes as well as integer overflows.
if (UNLIKELY(Size >= MaxAllowedMallocSize ||
@@ -186,9 +189,10 @@ template <class Params> class Allocator {

void *Block;
uptr ClassId;
uptr BlockEnd = 0;
if (PrimaryT::canAllocate(NeededSize)) {
uptr BlockEnd;
if (LIKELY(PrimaryT::canAllocate(NeededSize))) {
ClassId = SizeClassMap::getClassIdBySize(NeededSize);
DCHECK_NE(ClassId, 0U);
bool UnlockRequired;
auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
Block = TSD->Cache.allocate(ClassId);
@@ -205,17 +209,17 @@ template <class Params> class Allocator {
reportOutOfMemory(NeededSize);
}

// We only need to zero the contents for Primary backed allocations.
if ((ZeroContents || Options.ZeroContents) && ClassId)
// We only need to zero the contents for Primary backed allocations. This
// condition is not necessarily unlikely, but since memset is costly, we
// might as well mark it as such.
if (UNLIKELY((ZeroContents || Options.ZeroContents) && ClassId))
memset(Block, 0, PrimaryT::getSizeByClassId(ClassId));

Chunk::UnpackedHeader Header = {};
uptr UserPtr = reinterpret_cast<uptr>(Block) + Chunk::getHeaderSize();
// The following condition isn't necessarily "UNLIKELY".
if (!isAligned(UserPtr, Alignment)) {
if (UNLIKELY(!isAligned(UserPtr, Alignment))) {
const uptr AlignedUserPtr = roundUpTo(UserPtr, Alignment);
const uptr Offset = AlignedUserPtr - UserPtr;
Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
DCHECK_GT(Offset, 2 * sizeof(u32));
// The BlockMarker has no security purpose, but is specifically meant for
// the chunk iteration function that can be used in debugging situations.
@@ -224,16 +228,13 @@ template <class Params> class Allocator {
reinterpret_cast<u32 *>(Block)[0] = BlockMarker;
reinterpret_cast<u32 *>(Block)[1] = static_cast<u32>(Offset);
UserPtr = AlignedUserPtr;
Header.Offset = (Offset >> MinAlignmentLog) & Chunk::OffsetMask;
}
Header.ClassId = ClassId & Chunk::ClassIdMask;
Header.State = Chunk::State::Allocated;
Header.Origin = Origin & Chunk::OriginMask;
if (ClassId) {
Header.ClassId = ClassId & Chunk::ClassIdMask;
Header.SizeOrUnusedBytes = Size & Chunk::SizeOrUnusedBytesMask;
} else {
Header.SizeOrUnusedBytes =
(BlockEnd - (UserPtr + Size)) & Chunk::SizeOrUnusedBytesMask;
}
Header.SizeOrUnusedBytes = (ClassId ? Size : BlockEnd - (UserPtr + Size)) &
Chunk::SizeOrUnusedBytesMask;
void *Ptr = reinterpret_cast<void *>(UserPtr);
Chunk::storeHeader(Cookie, Ptr, &Header);

@@ -313,7 +314,7 @@ template <class Params> class Allocator {
const uptr OldSize = getSize(OldPtr, &OldHeader);
// If the new size is identical to the old one, or lower but within an
// acceptable range, we just keep the old chunk, and update its header.
if (NewSize == OldSize)
if (UNLIKELY(NewSize == OldSize))
return OldPtr;
if (NewSize < OldSize) {
const uptr Delta = OldSize - NewSize;
@@ -471,8 +472,7 @@ template <class Params> class Allocator {
// last and last class sizes, as well as the dynamic base for the Primary.
// The following is an over-approximation that works for our needs.
const uptr MaxSizeOrUnusedBytes = SizeClassMap::MaxSize - 1;
Header.SizeOrUnusedBytes =
MaxSizeOrUnusedBytes & Chunk::SizeOrUnusedBytesMask;
Header.SizeOrUnusedBytes = MaxSizeOrUnusedBytes;
if (UNLIKELY(Header.SizeOrUnusedBytes != MaxSizeOrUnusedBytes))
reportSanityCheckError("size (or unused bytes)");

@@ -484,15 +484,15 @@ template <class Params> class Allocator {

static INLINE void *getBlockBegin(const void *Ptr,
Chunk::UnpackedHeader *Header) {
return reinterpret_cast<void *>(reinterpret_cast<uptr>(Ptr) -
Chunk::getHeaderSize() -
(Header->Offset << MinAlignmentLog));
return reinterpret_cast<void *>(
reinterpret_cast<uptr>(Ptr) - Chunk::getHeaderSize() -
(static_cast<uptr>(Header->Offset) << MinAlignmentLog));
}

// Return the size of a chunk as requested during its allocation.
INLINE uptr getSize(const void *Ptr, Chunk::UnpackedHeader *Header) {
const uptr SizeOrUnusedBytes = Header->SizeOrUnusedBytes;
if (Header->ClassId)
if (LIKELY(Header->ClassId))
return SizeOrUnusedBytes;
return SecondaryT::getBlockEnd(getBlockBegin(Ptr, Header)) -
reinterpret_cast<uptr>(Ptr) - SizeOrUnusedBytes;
@@ -514,7 +514,7 @@ template <class Params> class Allocator {
Chunk::compareExchangeHeader(Cookie, Ptr, &NewHeader, Header);
void *BlockBegin = getBlockBegin(Ptr, &NewHeader);
const uptr ClassId = NewHeader.ClassId;
if (ClassId) {
if (LIKELY(ClassId)) {
bool UnlockRequired;
auto *TSD = TSDRegistry.getTSDAndLock(&UnlockRequired);
TSD->Cache.deallocate(ClassId, BlockBegin);
@@ -40,7 +40,7 @@ static void *allocateVmar(uptr Size, MapPlatformData *Data, bool AllowNoMem) {
_zx_vmar_root_self(),
ZX_VM_CAN_MAP_READ | ZX_VM_CAN_MAP_WRITE | ZX_VM_CAN_MAP_SPECIFIC, 0,
Size, &Data->Vmar, &Data->VmarBase);
if (Status != ZX_OK) {
if (UNLIKELY(Status != ZX_OK)) {
if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
return nullptr;
@@ -78,7 +78,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags,
} else {
// Otherwise, create a Vmo and set its name.
Status = _zx_vmo_create(Size, ZX_VMO_RESIZABLE, &Vmo);
if (Status != ZX_OK) {
if (UNLIKELY(Status != ZX_OK)) {
if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
return nullptr;
@@ -102,7 +102,7 @@ void *map(void *Addr, uptr Size, const char *Name, uptr Flags,
} else {
CHECK_EQ(_zx_handle_close(Vmo), ZX_OK);
}
if (Status != ZX_OK) {
if (UNLIKELY(Status != ZX_OK)) {
if (Status != ZX_ERR_NO_MEMORY || !AllowNoMem)
dieOnMapUnmapError(Status == ZX_ERR_NO_MEMORY);
return nullptr;
@@ -125,7 +125,7 @@ void unmap(void *Addr, uptr Size, uptr Flags, MapPlatformData *Data) {
const zx_handle_t Vmar = Data ? Data->Vmar : _zx_vmar_root_self();
const zx_status_t Status =
_zx_vmar_unmap(Vmar, reinterpret_cast<uintptr_t>(Addr), Size);
if (Status != ZX_OK)
if (UNLIKELY(Status != ZX_OK))
dieOnMapUnmapError();
}
if (Data) {
@@ -172,7 +172,7 @@ u32 getNumberOfCPUs() { return _zx_system_get_num_cpus(); }

bool getRandom(void *Buffer, uptr Length, bool Blocking) {
COMPILER_CHECK(MaxRandomLength <= ZX_CPRNG_DRAW_MAX_LEN);
if (!Buffer || !Length || Length > MaxRandomLength)
if (UNLIKELY(!Buffer || !Length || Length > MaxRandomLength))
return false;
_zx_cprng_draw(Buffer, Length);
return true;
@@ -22,18 +22,16 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
static const u32 MaxNumCached = SizeClassMap::MaxNumCachedHint;
void setFromArray(void **Array, u32 N) {
DCHECK_LE(N, MaxNumCached);
for (u32 I = 0; I < N; I++)
Batch[I] = Array[I];
Count = N;
memcpy(Batch, Array, sizeof(void *) * Count);
}
void clear() { Count = 0; }
void add(void *P) {
DCHECK_LT(Count, MaxNumCached);
Batch[Count++] = P;
}
void copyToArray(void **Array) const {
for (u32 I = 0; I < Count; I++)
Array[I] = Batch[I];
memcpy(Array, Batch, sizeof(void *) * Count);
}
u32 getCount() const { return Count; }
void *get(u32 I) const {
@@ -52,7 +50,7 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {

void initLinkerInitialized(GlobalStats *S, SizeClassAllocator *A) {
Stats.initLinkerInitialized();
if (S)
if (LIKELY(S))
S->link(&Stats);
Allocator = A;
}
@@ -64,12 +62,12 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {

void destroy(GlobalStats *S) {
drain();
if (S)
if (LIKELY(S))
S->unlink(&Stats);
}

void *allocate(uptr ClassId) {
CHECK_LT(ClassId, NumClasses);
DCHECK_LT(ClassId, NumClasses);
PerClass *C = &PerClassArray[ClassId];
if (C->Count == 0) {
if (UNLIKELY(!refill(C, ClassId)))
@@ -157,8 +155,8 @@ template <class SizeClassAllocator> struct SizeClassAllocatorLocalCache {
if (UNLIKELY(!B))
return false;
DCHECK_GT(B->getCount(), 0);
B->copyToArray(C->Chunks);
C->Count = B->getCount();
B->copyToArray(C->Chunks);
destroyBatch(ClassId, B);
return true;
}
@@ -25,12 +25,12 @@ class HybridMutex {
void init() { memset(this, 0, sizeof(*this)); }
bool tryLock();
NOINLINE void lock() {
if (tryLock())
if (LIKELY(tryLock()))
return;
// The compiler may try to fully unroll the loop, ending up in a
// NumberOfTries*NumberOfYields block of pauses mixed with tryLocks. This
// is large, ugly and unneeded, a compact loop is better for our purpose
// here. Use a pragma to tell the compiler not to unroll the loop.
// The compiler may try to fully unroll the loop, ending up in a
// NumberOfTries*NumberOfYields block of pauses mixed with tryLocks. This
// is large, ugly and unneeded, a compact loop is better for our purpose
// here. Use a pragma to tell the compiler not to unroll the loop.
#ifdef __clang__
#pragma nounroll
#endif
@@ -44,8 +44,8 @@ class HybridMutex {
void unlock();

private:
static constexpr u8 NumberOfTries = 10U;
static constexpr u8 NumberOfYields = 10U;
static constexpr u8 NumberOfTries = 8U;
static constexpr u8 NumberOfYields = 8U;

#if SCUDO_LINUX
atomic_u32 M;
@@ -74,7 +74,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
// See comment in the 64-bit primary about releasing smaller size classes.
Sci->CanRelease = (ReleaseToOsInterval > 0) &&
(I != SizeClassMap::BatchClassId) &&
(getSizeByClassId(I) >= (PageSize / 32));
(getSizeByClassId(I) >= (PageSize / 16));
}
ReleaseToOsIntervalMs = ReleaseToOsInterval;
}
@@ -99,9 +99,9 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
ScopedLock L(Sci->Mutex);
TransferBatch *B = Sci->FreeList.front();
if (B)
if (B) {
Sci->FreeList.pop_front();
else {
} else {
B = populateFreeList(C, ClassId, Sci);
if (UNLIKELY(!B))
return nullptr;
@@ -129,7 +129,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {

void enable() {
for (sptr I = static_cast<sptr>(NumClasses) - 1; I >= 0; I--)
getSizeClassInfo(I)->Mutex.unlock();
getSizeClassInfo(static_cast<uptr>(I))->Mutex.unlock();
}

template <typename F> void iterateOverBlocks(F Callback) {
@@ -356,7 +356,8 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
const s32 IntervalMs = ReleaseToOsIntervalMs;
if (IntervalMs < 0)
return;
if (Sci->ReleaseInfo.LastReleaseAtNs + IntervalMs * 1000000ULL >
if (Sci->ReleaseInfo.LastReleaseAtNs +
static_cast<uptr>(IntervalMs) * 1000000ULL >
getMonotonicTime()) {
return; // Memory was returned recently.
}
