[scudo][standalone] Merge Spin & Blocking mutex into a Hybrid one
Summary:
We ran into a problem on Fuchsia where yielding threads would never
be deboosted, ultimately resulting in several threads spinning on the
same TSD with no possibility for another thread to be scheduled,
deadlocking the process.

While this was fixed in Zircon, it led to discussions about whether
spinning without a break condition was a good decision, and we settled
on a new hybrid model that spins for a while and then blocks.

The number of spinning iterations is currently mostly arbitrary (based
on sanitizer_common values), but it can be tuned in the future.
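
For illustration, here is a minimal standalone sketch of the spin-then-block
idea in portable C++ (std::mutex and std::this_thread stand in for scudo's
primitives, and the 10x10 spin budget mirrors the arbitrary values above;
this is a hypothetical sketch, not the scudo implementation):

#include <mutex>
#include <thread>

class SpinThenBlockMutex {
public:
  void lock() {
    // Spin phase: a bounded number of attempts, so a descheduled owner
    // cannot keep waiters burning CPU forever.
    for (int Try = 0; Try < 10; Try++) {
      if (M.try_lock())
        return;
      for (int I = 0; I < 10; I++)
        std::this_thread::yield(); // stands in for yieldProcessor() pauses
    }
    // Block phase: let the scheduler put this thread to sleep until the
    // owner unlocks.
    M.lock();
  }
  void unlock() { M.unlock(); }

private:
  std::mutex M;
};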

Since we are touching `common.h`, we also use this change as a vehicle
for an Android optimization (the page size is fixed in Bionic, so use
a fixed value too).

Reviewers: morehouse, hctim, eugenis, dvyukov, vitalybuka

Reviewed By: hctim

Subscribers: srhines, delcypher, jfb, #sanitizers, llvm-commits

Tags: #llvm, #sanitizers

Differential Revision: https://reviews.llvm.org/D64358

llvm-svn: 365790
Kostya Kortchinsky committed Jul 11, 2019
1 parent 96dff91 commit aeb3826
Showing 17 changed files with 144 additions and 158 deletions.
8 changes: 8 additions & 0 deletions compiler-rt/lib/scudo/standalone/atomic_helpers.h
@@ -126,6 +126,14 @@ INLINE void atomic_store_relaxed(volatile T *A, typename T::Type V) {
atomic_store(A, V, memory_order_relaxed);
}

template <typename T>
INLINE typename T::Type atomic_compare_exchange(volatile T *A,
typename T::Type Cmp,
typename T::Type Xchg) {
atomic_compare_exchange_strong(A, &Cmp, Xchg, memory_order_acquire);
return Cmp;
}

} // namespace scudo

#endif // SCUDO_ATOMIC_H_
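
The new helper returns the value that was actually observed, so a caller
detects success by comparing the return value with the expected one (this is
how HybridMutex::tryLock() uses it in linux.cc below). A rough std::atomic
analogue, for illustration only:

#include <atomic>
#include <cstdint>

// Returns the value observed in A; it equals Cmp iff the exchange happened.
inline uint32_t compareExchange(std::atomic<uint32_t> &A, uint32_t Cmp,
                                uint32_t Xchg) {
  // On failure, compare_exchange_strong writes the observed value into Cmp.
  A.compare_exchange_strong(Cmp, Xchg, std::memory_order_acquire);
  return Cmp;
}

// Example: acquire a 0/1 lock word only if it was free.
inline bool tryAcquire(std::atomic<uint32_t> &State) {
  return compareExchange(State, 0u, 1u) == 0u;
}
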
6 changes: 3 additions & 3 deletions compiler-rt/lib/scudo/standalone/bytemap.h
@@ -45,8 +45,8 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
map(nullptr, sizeof(atomic_uptr) * Level1Size, "scudo:bytemap"));
}
void init() {
initLinkerInitialized();
Mutex.init();
initLinkerInitialized();
}

void reset() {
@@ -92,7 +92,7 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
u8 *getOrCreate(uptr Index) {
u8 *Res = get(Index);
if (!Res) {
SpinMutexLock L(&Mutex);
ScopedLock L(Mutex);
if (!(Res = get(Index))) {
Res = reinterpret_cast<u8 *>(map(nullptr, Level2Size, "scudo:bytemap"));
atomic_store(&Level1Map[Index], reinterpret_cast<uptr>(Res),
@@ -103,7 +103,7 @@ template <uptr Level1Size, uptr Level2Size> class TwoLevelByteMap {
}

atomic_uptr *Level1Map;
StaticSpinMutex Mutex;
HybridMutex Mutex;
};

} // namespace scudo
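
getOrCreate() above is a double-checked creation pattern: the common case reads
the level-2 pointer without taking the lock, and only would-be creators
serialize under the HybridMutex and re-check. A self-contained sketch of the
same pattern in standard C++ (new[] stands in for map(); hypothetical names):

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <mutex>

inline uint8_t *getOrCreate(std::atomic<uint8_t *> &Slot, std::mutex &Mu,
                            size_t Level2Size) {
  uint8_t *Res = Slot.load(std::memory_order_acquire); // lock-free fast path
  if (!Res) {
    std::lock_guard<std::mutex> L(Mu);
    Res = Slot.load(std::memory_order_relaxed); // re-check under the lock
    if (!Res) {
      Res = new uint8_t[Level2Size](); // zero-initialized, like a fresh mapping
      Slot.store(Res, std::memory_order_release);
    }
  }
  return Res;
}
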
5 changes: 3 additions & 2 deletions compiler-rt/lib/scudo/standalone/common.h
@@ -115,11 +115,12 @@ INLINE void yieldProcessor(u8 Count) {

// Platform specific functions.

void yieldPlatform();

extern uptr PageSizeCached;
uptr getPageSizeSlow();
INLINE uptr getPageSizeCached() {
// Bionic uses a hardcoded value.
if (SCUDO_ANDROID)
return 4096U;
if (LIKELY(PageSizeCached))
return PageSizeCached;
return getPageSizeSlow();
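
The Android special case relies on Bionic's page size being fixed, so the
branch folds to a compile-time constant; on other platforms the value is
queried once and cached. A portable sketch of the same idea (hypothetical
helper, not the scudo functions):

#include <unistd.h>

#include <cstdint>

inline uintptr_t pageSizeCached() {
#if defined(__ANDROID__)
  return 4096u; // Bionic hardcodes the page size, so skip the query entirely.
#else
  // Thread-safe one-time initialization; sysconf() runs at most once.
  static const uintptr_t Cached =
      static_cast<uintptr_t>(sysconf(_SC_PAGESIZE));
  return Cached;
#endif
}
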
21 changes: 9 additions & 12 deletions compiler-rt/lib/scudo/standalone/fuchsia.cc
@@ -23,11 +23,6 @@

namespace scudo {

void yieldPlatform() {
const zx_status_t Status = _zx_nanosleep(0);
CHECK_EQ(Status, ZX_OK);
}

uptr getPageSize() { return PAGE_SIZE; }

void NORETURN die() { __builtin_trap(); }
@@ -155,18 +150,20 @@ const char *getEnv(const char *Name) { return getenv(Name); }
// Note: we need to flag these methods with __TA_NO_THREAD_SAFETY_ANALYSIS
// because the Fuchsia implementation of sync_mutex_t has clang thread safety
// annotations. Were we to apply proper capability annotations to the top level
// BlockingMutex class itself, they would not be needed. As it stands, the
// HybridMutex class itself, they would not be needed. As it stands, the
// thread analysis thinks that we are locking the mutex and accidentally leaving
// it locked on the way out.
void BlockingMutex::lock() __TA_NO_THREAD_SAFETY_ANALYSIS {
bool HybridMutex::tryLock() __TA_NO_THREAD_SAFETY_ANALYSIS {
// Size and alignment must be compatible between both types.
COMPILER_CHECK(sizeof(sync_mutex_t) <= sizeof(OpaqueStorage));
COMPILER_CHECK(!(alignof(decltype(OpaqueStorage)) % alignof(sync_mutex_t)));
sync_mutex_lock(reinterpret_cast<sync_mutex_t *>(OpaqueStorage));
return sync_mutex_trylock(&M) == ZX_OK;
}

void HybridMutex::lockSlow() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_lock(&M);
}

void BlockingMutex::unlock() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_unlock(reinterpret_cast<sync_mutex_t *>(OpaqueStorage));
void HybridMutex::unlock() __TA_NO_THREAD_SAFETY_ANALYSIS {
sync_mutex_unlock(&M);
}

u64 getMonotonicTime() { return _zx_clock_get_monotonic(); }
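
On Fuchsia the spin phase still comes from the shared HybridMutex::lock() in
mutex.h; only tryLock(), lockSlow() and unlock() are platform specific and map
directly onto libsync. The same three-function contract could be satisfied by
any blocking primitive; a hypothetical pthread-based adapter, purely for
illustration (not part of this change, since Linux uses futexes directly):

#include <pthread.h>

class PthreadBackedMutex {
public:
  bool tryLock() { return pthread_mutex_trylock(&M) == 0; }
  void lock() {
    if (tryLock())
      return;
    lockSlow(); // a real HybridMutex would spin a bit before this call
  }
  void unlock() { pthread_mutex_unlock(&M); }

private:
  void lockSlow() { pthread_mutex_lock(&M); }

  pthread_mutex_t M = PTHREAD_MUTEX_INITIALIZER;
};
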
53 changes: 33 additions & 20 deletions compiler-rt/lib/scudo/standalone/linux.cc
@@ -37,24 +37,25 @@

namespace scudo {

void yieldPlatform() { sched_yield(); }

uptr getPageSize() { return static_cast<uptr>(sysconf(_SC_PAGESIZE)); }

void NORETURN die() { abort(); }

void *map(void *Addr, uptr Size, UNUSED const char *Name, uptr Flags,
UNUSED MapPlatformData *Data) {
int MmapFlags = MAP_PRIVATE | MAP_ANON;
if (Flags & MAP_NOACCESS)
int MmapProt;
if (Flags & MAP_NOACCESS) {
MmapFlags |= MAP_NORESERVE;
MmapProt = PROT_NONE;
} else {
MmapProt = PROT_READ | PROT_WRITE;
}
if (Addr) {
// Currently no scenario for a noaccess mapping with a fixed address.
DCHECK_EQ(Flags & MAP_NOACCESS, 0);
MmapFlags |= MAP_FIXED;
}
const int MmapProt =
(Flags & MAP_NOACCESS) ? PROT_NONE : PROT_READ | PROT_WRITE;
void *P = mmap(Addr, Size, MmapProt, MmapFlags, -1, 0);
if (P == MAP_FAILED) {
if (!(Flags & MAP_ALLOWNOMEM) || errno != ENOMEM)
@@ -84,22 +85,34 @@ void releasePagesToOS(uptr BaseAddress, uptr Offset, uptr Size,
// Calling getenv should be fine (c)(tm) at any time.
const char *getEnv(const char *Name) { return getenv(Name); }

void BlockingMutex::lock() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
if (atomic_exchange(M, MtxLocked, memory_order_acquire) == MtxUnlocked)
namespace {
enum State : u32 { Unlocked = 0, Locked = 1, Sleeping = 2 };
}

bool HybridMutex::tryLock() {
return atomic_compare_exchange(&M, Unlocked, Locked) == Unlocked;
}

// The following is based on https://akkadia.org/drepper/futex.pdf.
void HybridMutex::lockSlow() {
u32 V = atomic_compare_exchange(&M, Unlocked, Locked);
if (V == Unlocked)
return;
while (atomic_exchange(M, MtxSleeping, memory_order_acquire) != MtxUnlocked)
syscall(SYS_futex, reinterpret_cast<uptr>(OpaqueStorage),
FUTEX_WAIT_PRIVATE, MtxSleeping, nullptr, nullptr, 0);
if (V != Sleeping)
V = atomic_exchange(&M, Sleeping, memory_order_acquire);
while (V != Unlocked) {
syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAIT_PRIVATE, Sleeping,
nullptr, nullptr, 0);
V = atomic_exchange(&M, Sleeping, memory_order_acquire);
}
}

void BlockingMutex::unlock() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
const u32 V = atomic_exchange(M, MtxUnlocked, memory_order_release);
DCHECK_NE(V, MtxUnlocked);
if (V == MtxSleeping)
syscall(SYS_futex, reinterpret_cast<uptr>(OpaqueStorage),
FUTEX_WAKE_PRIVATE, 1, nullptr, nullptr, 0);
void HybridMutex::unlock() {
if (atomic_fetch_sub(&M, 1U, memory_order_release) != Locked) {
atomic_store(&M, Unlocked, memory_order_release);
syscall(SYS_futex, reinterpret_cast<uptr>(&M), FUTEX_WAKE_PRIVATE, 1,
nullptr, nullptr, 0);
}
}

u64 getMonotonicTime() {
@@ -141,8 +154,8 @@ bool getRandom(void *Buffer, uptr Length, UNUSED bool Blocking) {
}

void outputRaw(const char *Buffer) {
static StaticSpinMutex Mutex;
SpinMutexLock L(&Mutex);
static HybridMutex Mutex;
ScopedLock L(Mutex);
write(2, Buffer, strlen(Buffer));
}

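The Linux implementation above follows the three-state futex protocol from
Drepper's paper: 0 = unlocked, 1 = locked with no waiters, 2 = locked with
possible sleepers. unlock() decrements the word and only issues a FUTEX_WAKE
when the old value was 2, so the uncontended path never enters the kernel. A
compact standalone rendition of the same protocol (Linux-only, hypothetical
class name, an illustration rather than scudo's code):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

#include <atomic>
#include <cstdint>

class FutexMutex {
  enum : uint32_t { Unlocked = 0, Locked = 1, Sleeping = 2 };

public:
  void lock() {
    uint32_t C = Unlocked;
    // Fast path: 0 -> 1 entirely in user space.
    if (Word.compare_exchange_strong(C, Locked, std::memory_order_acquire))
      return;
    // Contended path: advertise sleepers (state 2), then wait until this
    // thread observes 0 while installing 2.
    if (C != Sleeping)
      C = Word.exchange(Sleeping, std::memory_order_acquire);
    while (C != Unlocked) {
      futex(FUTEX_WAIT_PRIVATE, Sleeping);
      C = Word.exchange(Sleeping, std::memory_order_acquire);
    }
  }

  void unlock() {
    // fetch_sub returns the old value: 1 means no sleepers, 2 means wake one.
    if (Word.fetch_sub(1, std::memory_order_release) != Locked) {
      Word.store(Unlocked, std::memory_order_release);
      futex(FUTEX_WAKE_PRIVATE, 1);
    }
  }

private:
  long futex(int Op, uint32_t Val) {
    // Assumes std::atomic<uint32_t> shares the layout of uint32_t, which
    // holds on Linux ABIs.
    return syscall(SYS_futex, reinterpret_cast<uint32_t *>(&Word), Op, Val,
                   nullptr, nullptr, 0);
  }

  std::atomic<uint32_t> Word{Unlocked};
};
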
92 changes: 36 additions & 56 deletions compiler-rt/lib/scudo/standalone/mutex.h
@@ -12,82 +12,62 @@
#include "atomic_helpers.h"
#include "common.h"

#include <string.h>

#if SCUDO_FUCHSIA
#include <lib/sync/mutex.h> // for sync_mutex_t
#endif

namespace scudo {

class StaticSpinMutex {
class HybridMutex {
public:
void init() { atomic_store_relaxed(&State, 0); }

void lock() {
void init() { memset(this, 0, sizeof(*this)); }
bool tryLock();
NOINLINE void lock() {
if (tryLock())
return;
lockSlow();
}

bool tryLock() {
return atomic_exchange(&State, 1, memory_order_acquire) == 0;
}

void unlock() { atomic_store(&State, 0, memory_order_release); }

void checkLocked() { CHECK_EQ(atomic_load_relaxed(&State), 1); }

private:
atomic_u8 State;

void NOINLINE lockSlow() {
for (u32 I = 0;; I++) {
if (I < 10)
yieldProcessor(10);
else
yieldPlatform();
if (atomic_load_relaxed(&State) == 0 &&
atomic_exchange(&State, 1, memory_order_acquire) == 0)
// The compiler may try to fully unroll the loop, ending up in a
// NumberOfTries*NumberOfYields block of pauses mixed with tryLocks. This
// is large, ugly and unneeded, a compact loop is better for our purpose
// here. Use a pragma to tell the compiler not to unroll the loop.
#ifdef __clang__
#pragma nounroll
#endif
for (u8 I = 0U; I < NumberOfTries; I++) {
yieldProcessor(NumberOfYields);
if (tryLock())
return;
}
lockSlow();
}
};

class SpinMutex : public StaticSpinMutex {
public:
SpinMutex() { init(); }
void unlock();

private:
SpinMutex(const SpinMutex &) = delete;
void operator=(const SpinMutex &) = delete;
};
static constexpr u8 NumberOfTries = 10U;
static constexpr u8 NumberOfYields = 10U;

class BlockingMutex {
public:
explicit constexpr BlockingMutex(LinkerInitialized) : OpaqueStorage{} {}
BlockingMutex() { memset(this, 0, sizeof(*this)); }
void lock();
void unlock();
void checkLocked() {
atomic_u32 *M = reinterpret_cast<atomic_u32 *>(&OpaqueStorage);
CHECK_NE(MtxUnlocked, atomic_load_relaxed(M));
}
#if SCUDO_LINUX
atomic_u32 M;
#elif SCUDO_FUCHSIA
sync_mutex_t M;
#endif

private:
enum MutexState { MtxUnlocked = 0, MtxLocked = 1, MtxSleeping = 2 };
uptr OpaqueStorage[1];
void lockSlow();
};

template <typename MutexType> class GenericScopedLock {
class ScopedLock {
public:
explicit GenericScopedLock(MutexType *M) : Mutex(M) { Mutex->lock(); }
~GenericScopedLock() { Mutex->unlock(); }
explicit ScopedLock(HybridMutex &M) : Mutex(M) { Mutex.lock(); }
~ScopedLock() { Mutex.unlock(); }

private:
MutexType *Mutex;
HybridMutex &Mutex;

GenericScopedLock(const GenericScopedLock &) = delete;
void operator=(const GenericScopedLock &) = delete;
ScopedLock(const ScopedLock &) = delete;
void operator=(const ScopedLock &) = delete;
};

typedef GenericScopedLock<StaticSpinMutex> SpinMutexLock;
typedef GenericScopedLock<BlockingMutex> BlockingMutexLock;

} // namespace scudo

#endif // SCUDO_MUTEX_H_
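
With the mutex template gone, every caller uses the single HybridMutex plus
the ScopedLock RAII helper, regardless of platform. A short usage sketch under
the new interface (the surrounding struct is hypothetical; the types come from
this header):

#include "mutex.h"

namespace scudo {

struct ExampleCache {
  void init() { M.init(); }   // HybridMutex has no constructor, only init().
  void doSerializedWork() {
    ScopedLock L(M);          // Spins up to NumberOfTries x NumberOfYields,
                              // then falls back to the blocking lockSlow().
    // ... critical section ...
  }

private:
  HybridMutex M;
};

} // namespace scudo
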
14 changes: 7 additions & 7 deletions compiler-rt/lib/scudo/standalone/primary32.h
@@ -97,7 +97,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
TransferBatch *popBatch(CacheT *C, uptr ClassId) {
DCHECK_LT(ClassId, NumClasses);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
TransferBatch *B = Sci->FreeList.front();
if (B)
Sci->FreeList.pop_front();
@@ -115,7 +115,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
DCHECK_LT(ClassId, NumClasses);
DCHECK_GT(B->getCount(), 0);
SizeClassInfo *Sci = getSizeClassInfo(ClassId);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
Sci->FreeList.push_front(B);
Sci->Stats.PushedBlocks += B->getCount();
if (Sci->CanRelease)
@@ -164,7 +164,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
void releaseToOS() {
for (uptr I = 1; I < NumClasses; I++) {
SizeClassInfo *Sci = getSizeClassInfo(I);
BlockingMutexLock L(&Sci->Mutex);
ScopedLock L(Sci->Mutex);
releaseToOSMaybe(Sci, I, /*Force=*/true);
}
}
@@ -192,7 +192,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
};

struct ALIGNED(SCUDO_CACHE_LINE_SIZE) SizeClassInfo {
BlockingMutex Mutex;
HybridMutex Mutex;
IntrusiveList<TransferBatch> FreeList;
SizeClassStats Stats;
bool CanRelease;
@@ -217,7 +217,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
const uptr MapEnd = MapBase + MapSize;
uptr Region = MapBase;
if (isAligned(Region, RegionSize)) {
SpinMutexLock L(&RegionsStashMutex);
ScopedLock L(RegionsStashMutex);
if (NumberOfStashedRegions < MaxStashedRegions)
RegionsStash[NumberOfStashedRegions++] = MapBase + RegionSize;
else
@@ -237,7 +237,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
DCHECK_LT(ClassId, NumClasses);
uptr Region = 0;
{
SpinMutexLock L(&RegionsStashMutex);
ScopedLock L(RegionsStashMutex);
if (NumberOfStashedRegions > 0)
Region = RegionsStash[--NumberOfStashedRegions];
}
@@ -389,7 +389,7 @@ template <class SizeClassMapT, uptr RegionSizeLog> class SizeClassAllocator32 {
// Unless several threads request regions simultaneously from different size
// classes, the stash rarely contains more than 1 entry.
static constexpr uptr MaxStashedRegions = 4;
StaticSpinMutex RegionsStashMutex;
HybridMutex RegionsStashMutex;
uptr NumberOfStashedRegions;
uptr RegionsStash[MaxStashedRegions];
};