
Commit 99eac53

Authored and committed by David Holmes
8225631: Consider replacing muxAcquire/Release with PlatformMonitor
Reviewed-by: coleenp, dcubed, kbarrett
1 parent 646c200 commit 99eac53


6 files changed: +26 −183 lines


src/hotspot/os/posix/os_posix.cpp

Lines changed: 1 addition & 1 deletion
@@ -1520,7 +1520,7 @@ void os::PlatformEvent::unpark() {
   // shake out uses of park() and unpark() without checking state conditions
   // properly. This spurious return doesn't manifest itself in any user code
   // but only in the correctly written condition checking loops of ObjectMonitor,
-  // Mutex/Monitor, Thread::muxAcquire and JavaThread::sleep
+  // Mutex/Monitor, and JavaThread::sleep
 
   if (Atomic::xchg(&_event, 1) >= 0) return;

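Note on the comment touched above: unpark() may produce a spurious return from park(), so every caller must re-check its condition in a loop. A minimal sketch of that pattern, built on os::PlatformEvent as used in this file; the Waiter type and _notified flag are illustrative only (memory ordering elided for brevity), not HotSpot code:

// Sketch only (illustrative names): a waiter that tolerates spurious
// returns from park() by re-checking its condition every time it wakes.
class Waiter {
  volatile int _notified;        // set to 1 by the notifier before unpark()
  os::PlatformEvent* _event;     // the park()/unpark() pair described above
 public:
  Waiter(os::PlatformEvent* e) : _notified(0), _event(e) {}
  void await() {
    while (_notified == 0) {     // a spurious return simply loops and re-checks
      _event->park();
    }
  }
  void notify_and_wake() {
    _notified = 1;               // publish the condition first ...
    _event->unpark();            // ... then wake the (possibly parked) waiter
  }
};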
src/hotspot/share/runtime/park.hpp

Lines changed: 1 addition & 3 deletions
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@@ -125,7 +125,6 @@ class ParkEvent : public os::PlatformEvent {
 public:
   // MCS-CLH list linkage and Native Mutex/Monitor
   ParkEvent * volatile ListNext ;
-  volatile intptr_t OnList ;
   volatile int TState ;
   volatile int Notified ;          // for native monitor construct
 
@@ -146,7 +145,6 @@ class ParkEvent : public os::PlatformEvent {
     AssociatedWith = NULL ;
     FreeNext       = NULL ;
     ListNext       = NULL ;
-    OnList         = 0 ;
     TState         = 0 ;
     Notified       = 0 ;
   }

src/hotspot/share/runtime/synchronizer.cpp

Lines changed: 15 additions & 15 deletions
@@ -227,8 +227,14 @@ int dtrace_waited_probe(ObjectMonitor* monitor, Handle obj, Thread* thr) {
   return 0;
 }
 
-#define NINFLATIONLOCKS 256
-static volatile intptr_t gInflationLocks[NINFLATIONLOCKS];
+static const int NINFLATIONLOCKS = 256;
+static os::PlatformMutex* gInflationLocks[NINFLATIONLOCKS];
+
+void ObjectSynchronizer::initialize() {
+  for (int i = 0; i < NINFLATIONLOCKS; i++) {
+    gInflationLocks[i] = new os::PlatformMutex();
+  }
+}
 
 static MonitorList _in_use_list;
 // The ratio of the current _in_use_list count to the ceiling is used
@@ -749,13 +755,7 @@ static markWord read_stable_mark(oop obj) {
 
     // The object is being inflated by some other thread.
     // The caller of read_stable_mark() must wait for inflation to complete.
-    // Avoid live-lock
-    // TODO: consider calling SafepointSynchronize::do_call_back() while
-    // spinning to see if there's a safepoint pending. If so, immediately
-    // yielding or blocking would be appropriate. Avoid spinning while
-    // there is a safepoint pending.
-    // TODO: add inflation contention performance counters.
-    // TODO: restrict the aggregate number of spinners.
+    // Avoid live-lock.
 
     ++its;
     if (its > 10000 || !os::is_MP()) {
@@ -775,15 +775,15 @@ static markWord read_stable_mark(oop obj) {
         // and calling park(). When inflation was complete the thread that accomplished inflation
         // would detach the list and set the markword to inflated with a single CAS and
         // then for each thread on the list, set the flag and unpark() the thread.
-        // This is conceptually similar to muxAcquire-muxRelease, except that muxRelease
-        // wakes at most one thread whereas we need to wake the entire list.
+
+        // Index into the lock array based on the current object address.
+        static_assert(is_power_of_2(NINFLATIONLOCKS), "must be");
        int ix = (cast_from_oop<intptr_t>(obj) >> 5) & (NINFLATIONLOCKS-1);
        int YieldThenBlock = 0;
        assert(ix >= 0 && ix < NINFLATIONLOCKS, "invariant");
-        assert((NINFLATIONLOCKS & (NINFLATIONLOCKS-1)) == 0, "invariant");
-        Thread::muxAcquire(gInflationLocks + ix, "gInflationLock");
+        gInflationLocks[ix]->lock();
        while (obj->mark() == markWord::INFLATING()) {
-          // Beware: NakedYield() is advisory and has almost no effect on some platforms
+          // Beware: naked_yield() is advisory and has almost no effect on some platforms
          // so we periodically call self->_ParkEvent->park(1).
          // We use a mixed spin/yield/block mechanism.
          if ((YieldThenBlock++) >= 16) {
@@ -792,7 +792,7 @@ static markWord read_stable_mark(oop obj) {
            os::naked_yield();
          }
        }
-        Thread::muxRelease(gInflationLocks + ix);
+        gInflationLocks[ix]->unlock();
      }
    } else {
      SpinPause();       // SMP-polite spinning

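For reference, the new scheme above is plain lock striping: the object address is hashed to one of 256 os::PlatformMutex instances. A hedged usage sketch built only from the calls visible in this hunk; the wrapper and helper names are illustrative and not part of the commit:

// Illustrative only: serialize on the stripe that covers this object.
static int inflation_stripe(oop obj) {
  // >> 5 drops low, always-aligned address bits; the mask is valid because
  // NINFLATIONLOCKS is a power of two (see the static_assert above).
  return (cast_from_oop<intptr_t>(obj) >> 5) & (NINFLATIONLOCKS - 1);
}

static void wait_until_inflated(oop obj) {       // hypothetical wrapper
  os::PlatformMutex* m = gInflationLocks[inflation_stripe(obj)];
  m->lock();
  while (obj->mark() == markWord::INFLATING()) {
    os::naked_yield();                           // real code mixes spin/yield/park
  }
  m->unlock();
}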
src/hotspot/share/runtime/synchronizer.hpp

Lines changed: 4 additions & 0 deletions
@@ -29,6 +29,7 @@
 #include "oops/markWord.hpp"
 #include "runtime/basicLock.hpp"
 #include "runtime/handles.hpp"
+#include "runtime/os.hpp"
 #include "runtime/perfData.hpp"
 
 class LogStream;
@@ -114,6 +115,9 @@ class ObjectSynchronizer : AllStatic {
   static void release_monitors_owned_by_thread(TRAPS);
   static void monitors_iterate(MonitorClosure* m);
 
+  // Initialize the gInflationLocks
+  static void initialize();
+
   // GC: we current use aggressive monitor deflation policy
   // Basically we try to deflate all monitors that are not busy.
   static size_t deflate_idle_monitors();

src/hotspot/share/runtime/thread.cpp

Lines changed: 2 additions & 158 deletions
@@ -291,7 +291,6 @@ Thread::Thread() {
   // The stack would act as a cache to avoid calls to ParkEvent::Allocate()
   // and ::Release()
   _ParkEvent = ParkEvent::Allocate(this);
-  _MuxEvent = ParkEvent::Allocate(this);
 
 #ifdef CHECK_UNHANDLED_OOPS
   if (CheckUnhandledOops) {
@@ -439,7 +438,6 @@ Thread::~Thread() {
   // It's possible we can encounter a null _ParkEvent, etc., in stillborn threads.
   // We NULL out the fields for good hygiene.
   ParkEvent::Release(_ParkEvent); _ParkEvent = NULL;
-  ParkEvent::Release(_MuxEvent); _MuxEvent = NULL;
 
   delete handle_area();
   delete metadata_handles();
@@ -3560,6 +3558,7 @@ jint Threads::create_vm(JavaVMInitArgs* args, bool* canTryAgain) {
 
   // Initialize Java-Level synchronization subsystem
   ObjectMonitor::Initialize();
+  ObjectSynchronizer::initialize();
 
   // Initialize global modules
   jint status = init_globals();
@@ -4582,22 +4581,11 @@ void Threads::print_threads_compiling(outputStream* st, char* buf, int buflen, b
 }
 
 
-// Internal SpinLock and Mutex
-// Based on ParkEvent
-
-// Ad-hoc mutual exclusion primitives: SpinLock and Mux
+// Ad-hoc mutual exclusion primitives: SpinLock
 //
 // We employ SpinLocks _only for low-contention, fixed-length
 // short-duration critical sections where we're concerned
 // about native mutex_t or HotSpot Mutex:: latency.
-// The mux construct provides a spin-then-block mutual exclusion
-// mechanism.
-//
-// Testing has shown that contention on the ListLock guarding gFreeList
-// is common. If we implement ListLock as a simple SpinLock it's common
-// for the JVM to devolve to yielding with little progress. This is true
-// despite the fact that the critical sections protected by ListLock are
-// extremely short.
 //
 // TODO-FIXME: ListLock should be of type SpinLock.
 // We should make this a 1st-class type, integrated into the lock
@@ -4650,150 +4638,6 @@ void Thread::SpinRelease(volatile int * adr) {
   *adr = 0;
 }
 
-// muxAcquire and muxRelease:
-//
-// * muxAcquire and muxRelease support a single-word lock-word construct.
-//   The LSB of the word is set IFF the lock is held.
-//   The remainder of the word points to the head of a singly-linked list
-//   of threads blocked on the lock.
-//
-// * The current implementation of muxAcquire-muxRelease uses its own
-//   dedicated Thread._MuxEvent instance. If we're interested in
-//   minimizing the peak number of extant ParkEvent instances then
-//   we could eliminate _MuxEvent and "borrow" _ParkEvent as long
-//   as certain invariants were satisfied. Specifically, care would need
-//   to be taken with regards to consuming unpark() "permits".
-//   A safe rule of thumb is that a thread would never call muxAcquire()
-//   if it's enqueued (cxq, EntryList, WaitList, etc) and will subsequently
-//   park(). Otherwise the _ParkEvent park() operation in muxAcquire() could
-//   consume an unpark() permit intended for monitorenter, for instance.
-//   One way around this would be to widen the restricted-range semaphore
-//   implemented in park(). Another alternative would be to provide
-//   multiple instances of the PlatformEvent() for each thread. One
-//   instance would be dedicated to muxAcquire-muxRelease, for instance.
-//
-// * Usage:
-//   -- Only as leaf locks
-//   -- for short-term locking only as muxAcquire does not perform
-//      thread state transitions.
-//
-// Alternatives:
-// * We could implement muxAcquire and muxRelease with MCS or CLH locks
-//   but with parking or spin-then-park instead of pure spinning.
-// * Use Taura-Oyama-Yonenzawa locks.
-// * It's possible to construct a 1-0 lock if we encode the lockword as
-//   (List,LockByte). Acquire will CAS the full lockword while Release
-//   will STB 0 into the LockByte. The 1-0 scheme admits stranding, so
-//   acquiring threads use timers (ParkTimed) to detect and recover from
-//   the stranding window. Thread/Node structures must be aligned on 256-byte
-//   boundaries by using placement-new.
-// * Augment MCS with advisory back-link fields maintained with CAS().
-//   Pictorially: LockWord -> T1 <-> T2 <-> T3 <-> ... <-> Tn <-> Owner.
-//   The validity of the backlinks must be ratified before we trust the value.
-//   If the backlinks are invalid the exiting thread must back-track through the
-//   the forward links, which are always trustworthy.
-// * Add a successor indication. The LockWord is currently encoded as
-//   (List, LOCKBIT:1). We could also add a SUCCBIT or an explicit _succ variable
-//   to provide the usual futile-wakeup optimization.
-//   See RTStt for details.
-//
-
-
-const intptr_t LOCKBIT = 1;
-
-void Thread::muxAcquire(volatile intptr_t * Lock, const char * LockName) {
-  intptr_t w = Atomic::cmpxchg(Lock, (intptr_t)0, LOCKBIT);
-  if (w == 0) return;
-  if ((w & LOCKBIT) == 0 && Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
-    return;
-  }
-
-  ParkEvent * const Self = Thread::current()->_MuxEvent;
-  assert((intptr_t(Self) & LOCKBIT) == 0, "invariant");
-  for (;;) {
-    int its = (os::is_MP() ? 100 : 0) + 1;
-
-    // Optional spin phase: spin-then-park strategy
-    while (--its >= 0) {
-      w = *Lock;
-      if ((w & LOCKBIT) == 0 && Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
-        return;
-      }
-    }
-
-    Self->reset();
-    Self->OnList = intptr_t(Lock);
-    // The following fence() isn't _strictly necessary as the subsequent
-    // CAS() both serializes execution and ratifies the fetched *Lock value.
-    OrderAccess::fence();
-    for (;;) {
-      w = *Lock;
-      if ((w & LOCKBIT) == 0) {
-        if (Atomic::cmpxchg(Lock, w, w|LOCKBIT) == w) {
-          Self->OnList = 0;   // hygiene - allows stronger asserts
-          return;
-        }
-        continue;      // Interference -- *Lock changed -- Just retry
-      }
-      assert(w & LOCKBIT, "invariant");
-      Self->ListNext = (ParkEvent *) (w & ~LOCKBIT);
-      if (Atomic::cmpxchg(Lock, w, intptr_t(Self)|LOCKBIT) == w) break;
-    }
-
-    while (Self->OnList != 0) {
-      Self->park();
-    }
-  }
-}
-
-// Release() must extract a successor from the list and then wake that thread.
-// It can "pop" the front of the list or use a detach-modify-reattach (DMR) scheme
-// similar to that used by ParkEvent::Allocate() and ::Release(). DMR-based
-// Release() would :
-// (A) CAS() or swap() null to *Lock, releasing the lock and detaching the list.
-// (B) Extract a successor from the private list "in-hand"
-// (C) attempt to CAS() the residual back into *Lock over null.
-//     If there were any newly arrived threads and the CAS() would fail.
-//     In that case Release() would detach the RATs, re-merge the list in-hand
-//     with the RATs and repeat as needed. Alternately, Release() might
-//     detach and extract a successor, but then pass the residual list to the wakee.
-//     The wakee would be responsible for reattaching and remerging before it
-//     competed for the lock.
-//
-// Both "pop" and DMR are immune from ABA corruption -- there can be
-// multiple concurrent pushers, but only one popper or detacher.
-// This implementation pops from the head of the list. This is unfair,
-// but tends to provide excellent throughput as hot threads remain hot.
-// (We wake recently run threads first).
-//
-// All paths through muxRelease() will execute a CAS.
-// Release consistency -- We depend on the CAS in muxRelease() to provide full
-// bidirectional fence/MEMBAR semantics, ensuring that all prior memory operations
-// executed within the critical section are complete and globally visible before the
-// store (CAS) to the lock-word that releases the lock becomes globally visible.
-void Thread::muxRelease(volatile intptr_t * Lock) {
-  for (;;) {
-    const intptr_t w = Atomic::cmpxchg(Lock, LOCKBIT, (intptr_t)0);
-    assert(w & LOCKBIT, "invariant");
-    if (w == LOCKBIT) return;
-    ParkEvent * const List = (ParkEvent *) (w & ~LOCKBIT);
-    assert(List != NULL, "invariant");
-    assert(List->OnList == intptr_t(Lock), "invariant");
-    ParkEvent * const nxt = List->ListNext;
-    guarantee((intptr_t(nxt) & LOCKBIT) == 0, "invariant");
-
-    // The following CAS() releases the lock and pops the head element.
-    // The CAS() also ratifies the previously fetched lock-word value.
-    if (Atomic::cmpxchg(Lock, w, intptr_t(nxt)) != w) {
-      continue;
-    }
-    List->OnList = 0;
-    OrderAccess::fence();
-    List->unpark();
-    return;
-  }
-}
-
 
 void Threads::verify() {
   ALL_JAVA_THREADS(p) {

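Taken together with the synchronizer.cpp change, the migration pattern for any remaining caller is mechanical: a volatile intptr_t lock word guarded by muxAcquire/muxRelease becomes an os::PlatformMutex allocated once and then locked/unlocked directly. A before/after sketch with illustrative names; neither function exists in this commit:

// Before: ad-hoc spin-then-park lock word (removed above).
static volatile intptr_t _exampleLock = 0;          // illustrative name
static void with_mux() {
  Thread::muxAcquire(&_exampleLock, "exampleLock");
  // ... short, leaf-level critical section ...
  Thread::muxRelease(&_exampleLock);
}

// After: a PlatformMutex created during initialization (as
// ObjectSynchronizer::initialize() now does) and used directly.
static os::PlatformMutex* _exampleMutex = NULL;     // illustrative name
static void with_platform_mutex() {
  _exampleMutex->lock();
  // ... short, leaf-level critical section ...
  _exampleMutex->unlock();
}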
src/hotspot/share/runtime/thread.hpp

Lines changed: 3 additions & 6 deletions
@@ -827,8 +827,8 @@ class Thread: public ThreadShadow {
 public:
   volatile intptr_t _Stalled;
   volatile int _TypeTag;
-  ParkEvent * _ParkEvent;          // for Object monitors and JVMTI raw monitors
-  ParkEvent * _MuxEvent;           // for low-level muxAcquire-muxRelease
+  ParkEvent * _ParkEvent;          // for Object monitors, JVMTI raw monitors,
+                                   // and ObjectSynchronizer::read_stable_mark
   int NativeSyncRecursion;         // diagnostic
 
   volatile int _OnTrap;            // Resume-at IP delta
@@ -837,13 +837,10 @@ class Thread: public ThreadShadow {
   jint _hashStateY;
   jint _hashStateZ;
 
-  // Low-level leaf-lock primitives used to implement synchronization
-  // and native monitor-mutex infrastructure.
+  // Low-level leaf-lock primitives used to implement synchronization.
   // Not for general synchronization use.
   static void SpinAcquire(volatile int * Lock, const char * Name);
   static void SpinRelease(volatile int * Lock);
-  static void muxAcquire(volatile intptr_t * Lock, const char * Name);
-  static void muxRelease(volatile intptr_t * Lock);
 };
 
 // Inline implementation of Thread::current()
