[scudo][standalone] Merge Spin & Blocking mutex into a Hybrid one
Summary:
We ran into a problem on Fuchsia where yielding threads would never
be deboosted, ultimately resulting in several threads spinning on the
same TSD with no possibility for another thread to be scheduled,
deadlocking the process.

While this was fixed in Zircon, it led to discussions about whether
spinning without a break condition was a good decision, and we settled
on a new hybrid model that spins for a while and then blocks.

The number of spinning iterations is currently mostly arbitrary (based
on sanitizer_common values), but it can be tuned in the future.
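
For illustration, here is a minimal standalone sketch of the spin-then-block
idea in portable C++ (std::mutex and std::this_thread stand in for scudo's
primitives, and the 10x10 spin budget mirrors the arbitrary values above;
this is a hypothetical sketch, not the scudo implementation):

#include <mutex>
#include <thread>

class SpinThenBlockMutex {
public:
  void lock() {
    // Spin phase: a bounded number of attempts, so a descheduled owner
    // cannot keep waiters burning CPU forever.
    for (int Try = 0; Try < 10; Try++) {
      if (M.try_lock())
        return;
      for (int I = 0; I < 10; I++)
        std::this_thread::yield(); // stands in for yieldProcessor() pauses
    }
    // Block phase: let the scheduler put this thread to sleep until the
    // owner unlocks.
    M.lock();
  }
  void unlock() { M.unlock(); }

private:
  std::mutex M;
};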

Since we are touching `common.h`, we also use this change as a vehicle
for an Android optimization (the page size is fixed in Bionic, so use
a fixed value too).

Reviewers: morehouse, hctim, eugenis, dvyukov, vitalybuka

Reviewed By: hctim

Subscribers: srhines, delcypher, jfb, #sanitizers, llvm-commits

Tags: #llvm, #sanitizers

Differential Revision: https://reviews.llvm.org/D64358

llvm-svn: 365790
Kostya Kortchinsky committed Jul 11, 2019
1 parent 96dff91 commit aeb3826
Showing 17 changed files with 144 additions and 158 deletions.
8 changes: 8 additions & 0 deletions compiler-rt/lib/scudo/standalone/atomic_helpers.h
@@ -126,6 +126,14 @@ INLINE void atomic_store_relaxed(volatile T *A, typename T::Type V) {
atomic_store(A, V, memory_order_relaxed);
}

template <typename T>
INLINE typename T::Type atomic_compare_exchange(volatile T *A,
typename T::Type Cmp,
typename T::Type Xchg) {
atomic_compare_exchange_strong(A, &Cmp, Xchg, memory_order_acquire);
return Cmp;
}

} // namespace scudo

#endif // SCUDO_ATOMIC_H_
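
The new helper returns the value that was actually observed, so a caller
detects success by comparing the return value with the expected one (this is
how HybridMutex::tryLock() uses it in linux.cc below). A rough std::atomic
analogue, for illustration only:

#include <atomic>
#include <cstdint>

// Returns the value observed in A; it equals Cmp iff the exchange happened.
inline uint32_t compareExchange(std::atomic<uint32_t> &A, uint32_t Cmp,
                                uint32_t Xchg) {
  // On failure, compare_exchange_strong writes the observed value into Cmp.
  A.compare_exchange_strong(Cmp, Xchg, std::memory_order_acquire);
  return Cmp;
}

// Example: acquire a 0/1 lock word only if it was free.
inline bool tryAcquire(std::atomic<uint32_t> &State) {
  return compareExchange(State, 0u, 1u) == 0u;
}
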
6 changes: 3 additions & 3 deletions compiler-rt/lib/scudo/standalone/bytemap.h
@@ -45,8 +45,8 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
map(nullptr, sizeof(atomic_uptr) * Level1Size, "scudo:bytemap"));
}
void init() {
initLinkerInitialized();
Mutex.init();
initLinkerInitialized();
}

void reset() {
@@ -92,7 +92,7 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
u8 *getOrCreate(uptr Index) {
u8 *Res = get(Index);
if (!Res) {
SpinMutexLock L(&Mutex);
ScopedLock L(Mutex);
if (!(Res = get(Index))) {
Res = reinterpret_cast<u8 *>(map(nullptr, Level2Size, "scudo:bytemap"));
atomic_store(&Level1Map[Index], reinterpret_cast<uptr>(Res),
@@ -103,7 +103,7 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
}

atomic_uptr *Level1Map;
StaticSpinMutex Mutex;
HybridMutex Mutex;
};

} // namespace scudo
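
getOrCreate() above is a double-checked creation pattern: the common case reads
the level-2 pointer without taking the lock, and only would-be creators
serialize under the HybridMutex and re-check. A self-contained sketch of the
same pattern in standard C++ (new[] stands in for map(); hypothetical names):

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <mutex>

inline uint8_t *getOrCreate(std::atomic<uint8_t *> &Slot, std::mutex &Mu,
                            size_t Level2Size) {
  uint8_t *Res = Slot.load(std::memory_order_acquire); // lock-free fast path
  if (!Res) {
    std::lock_guard<std::mutex> L(Mu);
    Res = Slot.load(std::memory_order_relaxed); // re-check under the lock
    if (!Res) {
      Res = new uint8_t[Level2Size](); // zero-initialized, like a fresh mapping
      Slot.store(Res, std::memory_order_release);
    }
  }
  return Res;
}
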
5 changes: 3 additions & 2 deletions compiler-rt/lib/scudo/standalone/common.h
@@ -115,11 +115,12 @@ INLINE void yieldProcessor(u8 Count) {

// Platform specific functions.

void yieldPlatform();

extern uptr PageSizeCached;
uptr getPageSizeSlow();
INLINE uptr getPageSizeCached() {
// Bionic uses a hardcoded value.
if (SCUDO_ANDROID)
return 4096U;
if (LIKELY(PageSizeCached))
return PageSizeCached;
return getPageSizeSlow();
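
The Android special case relies on Bionic's page size being fixed, so the
branch folds to a compile-time constant; on other platforms the value is
queried once and cached. A portable sketch of the same idea (hypothetical
helper, not the scudo functions):

#include <unistd.h>

#include <cstdint>

inline uintptr_t pageSizeCached() {
#if defined(__ANDROID__)
  return 4096u; // Bionic hardcodes the page size, so skip the query entirely.
#else
  // Thread-safe one-time initialization; sysconf() runs at most once.
  static const uintptr_t Cached =
      static_cast<uintptr_t>(sysconf(_SC_PAGESIZE));
  return Cached;
#endif
}
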
21 changes: 9 additions & 12 deletions compiler-rt/lib/scudo/standalone/fuchsia.cc
@@ -23,11 +23,6 @@

namespace scudo {

void yieldPlatform() {
const zx_status_t Status = _zx_nanosleep(0);
CHECK_EQ(Status, ZX_OK);
}

uptr getPageSize() { return PAGE_SIZE; }

void NORETURN die() { __builtin_trap(); }
@@ -155,18 +150,20 @@ const char *getEnv(const char *Name) { return getenv(Name); }
// Note: we need to flag these methods with __TA_NO_THREAD_SAFETY_ANALYSIS
// because the Fuchsia implementation of sync_mutex_t has clang thread safety
// annotations. Were we to apply proper capability annotations to the top level
// BlockingMutex class itself, they would not be needed. As it stands, the
// HybridMutex class itself, they would not be needed. As it stands, the
// thread analysis thinks that we are locking the mutex and accidentally leaving
// it locked on the way out.
void BlockingMutex::lock() __TA_NO_THREAD_SAFETY_ANALYSIS {
bool HybridMutex::tryLock() __TA_NO_THREAD_SAFETY_ANALYSIS {
// Size and alignment must be compatible between both types.
COMPILER_CHECK(sizeof(sync_mutex_t) <= sizeof(OpaqueStorage));
COMPILER_CHECK(!(alignof(decltype(OpaqueStorage)) % alignof(sync_mutex_t)));
sync_mutex_lock(reinterpret_cast<sync_mutex_t *>(OpaqueStorage));
return sync_mutex_trylock(&M) == ZX_OK;
}

void HybridMutex::lockSlow() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_lock(&M);
}

void BlockingMutex::unlock() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_unlock(reinterpret_cast<sync_mutex_t *>(OpaqueStorage));
void HybridMutex::unlock() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_unlock(&M);
}

u64 getMonotonicTime() { return _zx_clock_get_monotonic(); }
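
On Fuchsia the spin phase still comes from the shared HybridMutex::lock() in
mutex.h; only tryLock(), lockSlow() and unlock() are platform specific and map
directly onto libsync. The same three-function contract could be satisfied by
any blocking primitive; a hypothetical pthread-based adapter, purely for
illustration (not part of this change, since Linux uses futexes directly):

#include <pthread.h>

class PthreadBackedMutex {
public:
  bool tryLock() { return pthread_mutex_trylock(&M) == 0; }
  void lock() {
    if (tryLock())
      return;
    lockSlow(); // a real HybridMutex would spin a bit before this call
  }
  void unlock() { pthread_mutex_unlock(&M); }

private:
  void lockSlow() { pthread_mutex_lock(&M); }

  pthread_mutex_t M = PTHREAD_MUTEX_INITIALIZER;
};
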
53 changes: 33 additions & 20 deletions compiler-rt/lib/scudo/standalone/linux.cc
@@ -37,24 +37,25 @@

namespace scudo {

void yieldPlatform() { sched_yield(); }

uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); }

void NORETURN die() { abort(); }

void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags,
UNUSED MapPlatformData *Data) {
int MmapFlags = MAP_PRIVATE | MAP_ANON;
if (Flags & MAP_NOACCESS)
int MmapProt;
if (Flags & MAP_NOACCESS) {
MmapFlags |= MAP_NORESERVE;
MmapProt = PROT_NONE;
} else {
MmapProt = PROT_READ | PROT_WRITE;
}
if (Addr) {
// Currently no scenario for a noaccess mapping with a fixed address.
DCHECK_EQ(Flags & MAP_NOACCESS, 0);
MmapFlags |= MAP_FIXED;
}
const int MmapProt =
(Flags & MAP_NOACCESS) ? PROT_NONE : PROT_READ | PROT_WRITE;
void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0);
if (P == MAP_FAILED) {
if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
@@ -84,22 +85,34 @@ void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
// Calling getenv should be fine (c)(tm) at any time.
const char *getEnv(const char *Name) { return getenv(Name); }

void BlockingMutex::lock() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
if (atomic_exchange(M, MtxLocked, memory_order_acquire) == MtxUnlocked)
namespace {
enum State : u32 { Unlocked = 0, Locked = 1, Sleeping = 2 };
}

bool HybridMutex::tryLock() {
return atomic_compare_exchange(&M, Unlocked, Locked) == Unlocked;
}

// The following is based on https://akkadia.org/drepper/futex.pdf.
void HybridMutex::lockSlow() {
u32 V = atomic_compare_exchange(&M, Unlocked, Locked);
if (V == Unlocked)
return;
while (atomic_exchange(M, MtxSleeping, memory_order_acquire) != MtxUnlocked)
syscall(SYS_futex, reinterpret_cast<uptr>(OpaqueStorage),
FUTEX_WAIT_PRIVATE, MtxSleeping, nullptr, nullptr, 0);
if (V != Sleeping)
V = atomic_exchange(&M, Sleeping, memory_order_acquire);
while (V != Unlocked) {
syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAIT_PRIVATE, Sleeping,
nullptr, nullptr, 0);
V = atomic_exchange(&M, Sleeping, memory_order_acquire);
}
}

void BlockingMutex::unlock() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
const u32 V = atomic_exchange(M, MtxUnlocked, memory_order_release);
DCHECK_NE(V, MtxUnlocked);
if (V == MtxSleeping)
syscall(SYS_futex, reinterpret_cast<uptr>(OpaqueStorage),
FUTEX_WAKE_PRIVATE, 1, nullptr, nullptr, 0);
void HybridMutex::unlock() {
if (atomic_fetch_sub(&M, 1U, memory_order_release) != Locked) {
atomic_store(&M, Unlocked, memory_order_release);
syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAKE_PRIVATE, 1,
nullptr, nullptr, 0);
}
}

u64 getMonotonicTime() {
@@ -141,8 +154,8 @@ bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) {
}

void outputRaw(const char *Buffer) {
static StaticSpinMutex Mutex;
SpinMutexLock L(&Mutex);
static HybridMutex Mutex;
ScopedLock L(Mutex);
write(2, Buffer, strlen(Buffer));
}

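The Linux implementation above follows the three-state futex protocol from
Drepper's paper: 0 = unlocked, 1 = locked with no waiters, 2 = locked with
possible sleepers. unlock() decrements the word and only issues a FUTEX_WAKE
when the old value was 2, so the uncontended path never enters the kernel. A
compact standalone rendition of the same protocol (Linux-only, hypothetical
class name, an illustration rather than scudo's code):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <atomic>
#include <cstdint>

class FutexMutex {
  enum : uint32_t { Unlocked = 0, Locked = 1, Sleeping = 2 };

public:
  void lock() {
    uint32_t C = Unlocked;
    // Fast path: 0 -> 1 entirely in user space.
    if (Word.compare_exchange_strong(C, Locked, std::memory_order_acquire))
      return;
    // Contended path: advertise sleepers (state 2), then wait until this
    // thread observes 0 while installing 2.
    if (C != Sleeping)
      C = Word.exchange(Sleeping, std::memory_order_acquire);
    while (C != Unlocked) {
      futex(FUTEX_WAIT_PRIVATE, Sleeping);
      C = Word.exchange(Sleeping, std::memory_order_acquire);
    }
  }

  void unlock() {
    // fetch_sub returns the old value: 1 means no sleepers, 2 means wake one.
    if (Word.fetch_sub(1, std::memory_order_release) != Locked) {
      Word.store(Unlocked, std::memory_order_release);
      futex(FUTEX_WAKE_PRIVATE, 1);
    }
  }

private:
  long futex(int Op, uint32_t Val) {
    // Assumes std::atomic<uint32_t> shares the layout of uint32_t, which
    // holds on Linux ABIs.
    return syscall(SYS_futex, reinterpret_cast<uint32_t *>(&Word), Op, Val,
                   nullptr, nullptr, 0);
  }

  std::atomic<uint32_t> Word{Unlocked};
};
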
92 changes: 36 additions & 56 deletions compiler-rt/lib/scudo/standalone/mutex.h
@@ -12,82 +12,62 @@
#include "atomic_helpers.h"
#include "common.h"

#include <string.h>

#if SCUDO_FUCHSIA
#include <lib/sync/mutex.h> // for sync_mutex_t
#endif

namespace scudo {

class StaticSpinMutex {
class HybridMutex {
public:
void init() { atomic_store_relaxed(&State, 0); }

void lock() {
void init() { memset(this, 0, sizeof(*this)); }
bool tryLock();
NOINLINE void lock() {
if (tryLock())
return;
lockSlow();
}

bool tryLock() {
return atomic_exchange(&State, 1, memory_order_acquire) == 0;
}

void unlock() { atomic_store(&State, 0, memory_order_release); }

void checkLocked() { CHECK_EQ(atomic_load_relaxed(&State), 1); }

private:
atomic_u8 State;

void NOINLINE lockSlow() {
for (u32 I = 0;; I++) {
if (I < 10)
yieldProcessor(10);
else
yieldPlatform();
if (atomic_load_relaxed(&State) == 0 &&
atomic_exchange(&State, 1, memory_order_acquire) == 0)
// The compiler may try to fully unroll the loop, ending up in a
// NumberOfTries*NumberOfYields block of pauses mixed with tryLocks. This
// is large, ugly and unneeded, a compact loop is better for our purpose
// here. Use a pragma to tell the compiler not to unroll the loop.
#ifdef __clang__
#pragma nounroll
#endif
for (u8 I = 0U; I < NumberOfTries; I++) {
yieldProcessor(NumberOfYields);
if (tryLock())
return;
}
lockSlow();
}
};

class SpinMutex : public StaticSpinMutex {
public:
SpinMutex() { init(); }
void unlock();

private:
SpinMutex(const SpinMutex &) = delete;
void operator=(const SpinMutex &) = delete;
};
static constexpr u8 NumberOfTries = 10U;
static constexpr u8 NumberOfYields = 10U;

class BlockingMutex {
public:
explicit constexpr BlockingMutex(LinkerInitialized) : OpaqueStorage{} {}
BlockingMutex() { memset(this, 0, sizeof(*this)); }
void lock();
void unlock();
void checkLocked() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
CHECK_NE(MtxUnlocked, atomic_load_relaxed(M));
}
#if SCUDO_LINUX
atomic_u32 M;
#elif SCUDO_FUCHSIA
sync_mutex_t M;
#endif

private:
enum MutexState { MtxUnlocked = 0, MtxLocked = 1, MtxSleeping = 2 };
uptr OpaqueStorage[1];
void lockSlow();
};

template <typename MutexType> class GenericScopedLock {
class ScopedLock {
public:
explicit GenericScopedLock(MutexType *M) : Mutex(M) { Mutex->lock(); }
~GenericScopedLock() { Mutex->unlock(); }
explicit ScopedLock(HybridMutex &M) : Mutex(M) { Mutex.lock(); }
~ScopedLock() { Mutex.unlock(); }

private:
MutexType *Mutex;
HybridMutex &Mutex;

GenericScopedLock(const GenericScopedLock &) = delete;
void operator=(const GenericScopedLock &) = delete;
ScopedLock(const ScopedLock &) = delete;
void operator=(const ScopedLock &) = delete;
};

typedef GenericScopedLock<StaticSpinMutex> SpinMutexLock;
typedef GenericScopedLock<BlockingMutex> BlockingMutexLock;

} // namespace scudo

#endif // SCUDO_MUTEX_H_
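
With the mutex template gone, every caller uses the single HybridMutex plus
the ScopedLock RAII helper, regardless of platform. A short usage sketch under
the new interface (the surrounding struct is hypothetical; the types come from
this header):

#include "mutex.h"

namespace scudo {

struct ExampleCache {
  void init() { M.init(); }   // HybridMutex has no constructor, only init().
  void doSerializedWork() {
    ScopedLock L(M);          // Spins up to NumberOfTries x NumberOfYields,
                              // then falls back to the blocking lockSlow().
    // ... critical section ...
  }

private:
  HybridMutex M;
};

} // namespace scudo
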
14 changes: 7 additions & 7 deletions compiler-rt/lib/scudo/standalone/primary32.h
@@ -97,7 +97,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
TransferBatch *B = Sci->FreeList.front();
if (B)
Sci->FreeList.pop_front();
@@ -115,7 +115,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
DCHECK_LT(ClassId, NumClasses);
DCHECK_GT(B->getCount(), 0);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
Sci->FreeList.push_front(B);
Sci->Stats.PushedBlocks += B->getCount();
if (Sci->CanRelease)
@@ -164,7 +164,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
void releaseToOS() {
for (uptr I = 1; I < NumClasses; I++) {
SizeClassInfo *Sci = getSizeClassInfo(I);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
releaseToOSMaybe(Sci, I, /*Force=*/true);
}
}
@@ -192,7 +192,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
};

struct ALIGNED(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
BlockingMutex Mutex;
HybridMutex Mutex;
IntrusiveList<TransferBatch> FreeList;
SizeClassStats Stats;
bool CanRelease;
@@ -217,7 +217,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
const uptr MapEnd = MapBase + MapSize;
uptr Region = MapBase;
if (isAligned(Region, RegionSize)) {
SpinMutexLock L(&RegionsStashMutex);
ScopedLock L(RegionsStashMutex);
if (NumberOfStashedRegions < MaxStashedRegions)
RegionsStash[NumberOfStashedRegions++] = MapBase + RegionSize;
else
@@ -237,7 +237,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
DCHECK_LT(ClassId, NumClasses);
uptr Region = 0;
{
SpinMutexLock L(&RegionsStashMutex);
ScopedLock L(RegionsStashMutex);
if (NumberOfStashedRegions > 0)
Region = RegionsStash[--NumberOfStashedRegions];
}
@@ -389,7 +389,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
// Unless several threads request regions simultaneously from different size
// classes, the stash rarely contains more than 1 entry.
static constexpr uptr MaxStashedRegions = 4;
StaticSpinMutex RegionsStashMutex;
HybridMutex RegionsStashMutex;
uptr NumberOfStashedRegions;
uptr RegionsStash[MaxStashedRegions];
};