Skip to content

Commit 30462f9

Browse files
committed
8318986: Improve GenericWaitBarrier performance
Reviewed-by: rehn, iwalulya, pchilanomate
1 parent 407cdd4 commit 30462f9

File tree

2 files changed

+274
-61
lines changed

2 files changed

+274
-61
lines changed
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
/*
2-
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2019, 2023, Oracle and/or its affiliates. All rights reserved.
3+
* Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
34
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
45
*
56
* This code is free software; you can redistribute it and/or modify it
@@ -29,66 +30,228 @@
2930
#include "utilities/waitBarrier_generic.hpp"
3031
#include "utilities/spinYield.hpp"
3132

33+
// Implements the striped semaphore wait barrier.
34+
//
35+
// To guarantee progress and safety, we need to make sure that new barrier tag
36+
// starts with the completely empty set of waiters and free semaphore. This
37+
// requires either waiting for all threads to leave wait() for current barrier
38+
// tag on disarm(), or waiting for all threads to leave the previous tag before
39+
// reusing the semaphore in arm().
40+
//
41+
// When there are multiple threads, it is normal for some threads to take
42+
// significant time to leave the barrier. Waiting for these threads introduces
43+
// stalls on barrier reuse.
44+
//
45+
// If we wait on disarm(), this stall is nearly guaranteed to happen if some threads
46+
// are de-scheduled by prior wait(). It would be especially bad if there are more
47+
// waiting threads than CPUs: every thread would need to wake up and register itself
48+
// as leaving, before we can unblock from disarm().
49+
//
50+
// If we wait on arm(), we can get lucky that most threads would be able to catch up,
51+
// exit wait(), and so we arrive to arm() with semaphore ready for reuse. However,
52+
// that is still insufficient in practice.
53+
//
54+
// Therefore, this implementation goes a step further and implements the _striped_
55+
// semaphores. We maintain several semaphores in cells. The barrier tags are assigned
56+
// to cells in some simple manner. Most of the current uses have sequential barrier
57+
// tags, so simple modulo works well. We then operate on a cell like we would operate
58+
// on a single semaphore: we wait at arm() for all threads to catch up before reusing
59+
// the cell. For the cost of maintaining just a few cells, we have enough window for
60+
// threads to catch up.
61+
//
62+
// The correctness is guaranteed by using a single atomic state variable per cell,
63+
// with updates always done with CASes:
64+
//
65+
// [.......... barrier tag ..........][.......... waiters ..........]
66+
// 63 31 0
67+
//
68+
// Cell starts with zero tag and zero waiters. Arming the cell swings barrier tag from
69+
// zero to some tag, while checking that no waiters have appeared. Disarming swings
70+
// the barrier tag back from tag to zero. Every waiter registers itself by incrementing
71+
// the "waiters", while checking that barrier tag is still the same. Every completing waiter
72+
// decrements the "waiters". When all waiters complete, a cell ends up in initial state,
73+
// ready to be armed again. This allows accurate tracking of how many signals
74+
// to issue and does not race with disarm.
75+
//
76+
// The implementation uses the strongest (default) barriers for extra safety, even
77+
// when not strictly required to do so for correctness. Extra barrier overhead is
78+
// dominated by the actual wait/notify latency anyway.
79+
//
80+
3281
void GenericWaitBarrier::arm(int barrier_tag) {
33-
assert(_barrier_tag == 0, "Already armed");
34-
assert(_waiters == 0, "We left a thread hanging");
35-
_barrier_tag = barrier_tag;
36-
_waiters = 0;
82+
assert(barrier_tag != 0, "Pre arm: Should be arming with armed value");
83+
assert(Atomic::load(&_barrier_tag) == 0,
84+
"Pre arm: Should not be already armed. Tag: %d",
85+
Atomic::load(&_barrier_tag));
86+
Atomic::release_store(&_barrier_tag, barrier_tag);
87+
88+
Cell &cell = tag_to_cell(barrier_tag);
89+
cell.arm(barrier_tag);
90+
91+
// API specifies arm() must provide a trailing fence.
3792
OrderAccess::fence();
3893
}
3994

40-
int GenericWaitBarrier::wake_if_needed() {
41-
assert(_barrier_tag == 0, "Not disarmed");
42-
int w = _waiters;
43-
if (w == 0) {
44-
// Load of _barrier_threads in caller must not pass the load of _waiters.
45-
OrderAccess::loadload();
46-
return 0;
47-
}
48-
assert(w > 0, "Bad counting");
49-
// We need an exact count which never goes below zero,
50-
// otherwise the semaphore may be signalled too many times.
51-
if (Atomic::cmpxchg(&_waiters, w, w - 1) == w) {
52-
_sem_barrier.signal();
53-
return w - 1;
54-
}
55-
return w;
95+
void GenericWaitBarrier::disarm() {
96+
int barrier_tag = Atomic::load_acquire(&_barrier_tag);
97+
assert(barrier_tag != 0, "Pre disarm: Should be armed. Tag: %d", barrier_tag);
98+
Atomic::release_store(&_barrier_tag, 0);
99+
100+
Cell &cell = tag_to_cell(barrier_tag);
101+
cell.disarm(barrier_tag);
102+
103+
// API specifies disarm() must provide a trailing fence.
104+
OrderAccess::fence();
56105
}
57106

58-
void GenericWaitBarrier::disarm() {
59-
assert(_barrier_tag != 0, "Not armed");
60-
_barrier_tag = 0;
61-
// Loads of _barrier_threads/_waiters must not float above disarm store and
62-
// disarm store must not sink below.
107+
void GenericWaitBarrier::wait(int barrier_tag) {
108+
assert(barrier_tag != 0, "Pre wait: Should be waiting on armed value");
109+
110+
Cell &cell = tag_to_cell(barrier_tag);
111+
cell.wait(barrier_tag);
112+
113+
// API specifies wait() must provide a trailing fence.
63114
OrderAccess::fence();
64-
int left;
115+
}
116+
117+
void GenericWaitBarrier::Cell::arm(int32_t requested_tag) {
118+
// Before we continue to arm, we need to make sure that all threads
119+
// have left the previous cell.
120+
121+
int64_t state;
122+
65123
SpinYield sp;
66-
do {
67-
left = GenericWaitBarrier::wake_if_needed();
68-
if (left == 0 && _barrier_threads > 0) {
69-
// There is no thread to wake but we still have barrier threads.
124+
while (true) {
125+
state = Atomic::load_acquire(&_state);
126+
assert(decode_tag(state) == 0,
127+
"Pre arm: Should not be armed. "
128+
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
129+
decode_tag(state), decode_waiters(state));
130+
if (decode_waiters(state) == 0) {
131+
break;
132+
}
133+
sp.wait();
134+
}
135+
136+
// Try to swing cell to armed. This should always succeed after the check above.
137+
int64_t new_state = encode(requested_tag, 0);
138+
int64_t prev_state = Atomic::cmpxchg(&_state, state, new_state);
139+
if (prev_state != state) {
140+
fatal("Cannot arm the wait barrier. "
141+
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
142+
decode_tag(prev_state), decode_waiters(prev_state));
143+
}
144+
}
145+
146+
int GenericWaitBarrier::Cell::signal_if_needed(int max) {
147+
int signals = 0;
148+
while (true) {
149+
int cur = Atomic::load_acquire(&_outstanding_wakeups);
150+
if (cur == 0) {
151+
// All done, no more waiters.
152+
return 0;
153+
}
154+
assert(cur > 0, "Sanity");
155+
156+
int prev = Atomic::cmpxchg(&_outstanding_wakeups, cur, cur - 1);
157+
if (prev != cur) {
158+
// Contention, return to caller for early return or backoff.
159+
return prev;
160+
}
161+
162+
// Signal!
163+
_sem.signal();
164+
165+
if (++signals >= max) {
166+
// Signalled requested number of times, break out.
167+
return prev;
168+
}
169+
}
170+
}
171+
172+
void GenericWaitBarrier::Cell::disarm(int32_t expected_tag) {
173+
int32_t waiters;
174+
175+
while (true) {
176+
int64_t state = Atomic::load_acquire(&_state);
177+
int32_t tag = decode_tag(state);
178+
waiters = decode_waiters(state);
179+
180+
assert((tag == expected_tag) && (waiters >= 0),
181+
"Mid disarm: Should be armed with expected tag and have sane waiters. "
182+
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
183+
tag, waiters);
184+
185+
int64_t new_state = encode(0, waiters);
186+
if (Atomic::cmpxchg(&_state, state, new_state) == state) {
187+
// Successfully disarmed.
188+
break;
189+
}
190+
}
191+
192+
// Wake up waiters, if we have at least one.
193+
// Allow other threads to assist with wakeups, if possible.
194+
if (waiters > 0) {
195+
Atomic::release_store(&_outstanding_wakeups, waiters);
196+
SpinYield sp;
197+
while (signal_if_needed(INT_MAX) > 0) {
70198
sp.wait();
71199
}
72-
// We must loop here until there are no waiters or potential waiters.
73-
} while (left > 0 || _barrier_threads > 0);
74-
// API specifies disarm() must provide a trailing fence.
75-
OrderAccess::fence();
200+
}
201+
assert(Atomic::load(&_outstanding_wakeups) == 0, "Post disarm: Should not have outstanding wakeups");
76202
}
77203

78-
void GenericWaitBarrier::wait(int barrier_tag) {
79-
assert(barrier_tag != 0, "Trying to wait on disarmed value");
80-
if (barrier_tag != _barrier_tag) {
81-
// API specifies wait() must provide a trailing fence.
82-
OrderAccess::fence();
83-
return;
204+
void GenericWaitBarrier::Cell::wait(int32_t expected_tag) {
205+
// Try to register ourselves as pending waiter.
206+
while (true) {
207+
int64_t state = Atomic::load_acquire(&_state);
208+
int32_t tag = decode_tag(state);
209+
if (tag != expected_tag) {
210+
// Cell tag had changed while waiting here. This means either the cell had
211+
// been disarmed, or we are late and the cell was armed with a new tag.
212+
// Exit without touching anything else.
213+
return;
214+
}
215+
int32_t waiters = decode_waiters(state);
216+
217+
assert((tag == expected_tag) && (waiters >= 0 && waiters < INT32_MAX),
218+
"Before wait: Should be armed with expected tag and waiters are in range. "
219+
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
220+
tag, waiters);
221+
222+
int64_t new_state = encode(tag, waiters + 1);
223+
if (Atomic::cmpxchg(&_state, state, new_state) == state) {
224+
// Success! Proceed to wait.
225+
break;
226+
}
84227
}
85-
Atomic::add(&_barrier_threads, 1);
86-
if (barrier_tag != 0 && barrier_tag == _barrier_tag) {
87-
Atomic::add(&_waiters, 1);
88-
_sem_barrier.wait();
89-
// We help out with posting, but we need to do so before we decrement the
90-
// _barrier_threads otherwise we might wake threads up in next wait.
91-
GenericWaitBarrier::wake_if_needed();
228+
229+
// Wait for notification.
230+
_sem.wait();
231+
232+
// Unblocked! We help out with waking up two siblings. This allows to avalanche
233+
// the wakeups for many threads, even if some threads are lagging behind.
234+
// Note that we can only do this *before* reporting back as completed waiter,
235+
// otherwise we might prematurely wake up threads for another barrier tag.
236+
// Current arm() sequence protects us from this trouble by waiting until all waiters
237+
// leave.
238+
signal_if_needed(2);
239+
240+
// Register ourselves as completed waiter before leaving.
241+
while (true) {
242+
int64_t state = Atomic::load_acquire(&_state);
243+
int32_t tag = decode_tag(state);
244+
int32_t waiters = decode_waiters(state);
245+
246+
assert((tag == 0) && (waiters > 0),
247+
"After wait: Should be not armed and have non-complete waiters. "
248+
"Tag: " INT32_FORMAT "; Waiters: " INT32_FORMAT,
249+
tag, waiters);
250+
251+
int64_t new_state = encode(tag, waiters - 1);
252+
if (Atomic::cmpxchg(&_state, state, new_state) == state) {
253+
// Success!
254+
break;
255+
}
92256
}
93-
Atomic::add(&_barrier_threads, -1);
94257
}

src/hotspot/share/utilities/waitBarrier_generic.hpp

+62-12
Original file line numberDiff line numberDiff line change
@@ -26,29 +26,79 @@
2626
#define SHARE_UTILITIES_WAITBARRIER_GENERIC_HPP
2727

2828
#include "memory/allocation.hpp"
29+
#include "memory/padded.hpp"
2930
#include "runtime/semaphore.hpp"
3031
#include "utilities/globalDefinitions.hpp"
3132

32-
// In addition to the barrier tag, it uses two counters to keep the semaphore
33-
// count correct and not leave any late thread waiting.
3433
class GenericWaitBarrier : public CHeapObj<mtInternal> {
34+
private:
35+
class Cell : public CHeapObj<mtInternal> {
36+
private:
37+
// Pad out the cells to avoid interference between the cells.
38+
// This would insulate from stalls when adjacent cells have returning
39+
// workers and contend over the cache line for current latency-critical
40+
// cell.
41+
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
42+
43+
Semaphore _sem;
44+
45+
// Cell state, tracks the arming + waiters status
46+
volatile int64_t _state;
47+
48+
// Wakeups to deliver for current waiters
49+
volatile int _outstanding_wakeups;
50+
51+
int signal_if_needed(int max);
52+
53+
static int64_t encode(int32_t barrier_tag, int32_t waiters) {
54+
int64_t val = (((int64_t) barrier_tag) << 32) |
55+
(((int64_t) waiters) & 0xFFFFFFFF);
56+
assert(decode_tag(val) == barrier_tag, "Encoding is reversible");
57+
assert(decode_waiters(val) == waiters, "Encoding is reversible");
58+
return val;
59+
}
60+
61+
static int32_t decode_tag(int64_t value) {
62+
return (int32_t)(value >> 32);
63+
}
64+
65+
static int32_t decode_waiters(int64_t value) {
66+
return (int32_t)(value & 0xFFFFFFFF);
67+
}
68+
69+
public:
70+
Cell() : _sem(0), _state(encode(0, 0)), _outstanding_wakeups(0) {}
71+
NONCOPYABLE(Cell);
72+
73+
void arm(int32_t requested_tag);
74+
void disarm(int32_t expected_tag);
75+
void wait(int32_t expected_tag);
76+
};
77+
78+
// Should be enough for most uses without exploding the footprint.
79+
static constexpr int CELLS_COUNT = 16;
80+
81+
Cell _cells[CELLS_COUNT];
82+
83+
// Trailing padding to protect the last cell.
84+
DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
85+
3586
volatile int _barrier_tag;
36-
// The number of threads waiting on or about to wait on the semaphore.
37-
volatile int _waiters;
38-
// The number of threads in the wait path, before or after the tag check.
39-
// These threads can become waiters.
40-
volatile int _barrier_threads;
41-
Semaphore _sem_barrier;
87+
88+
// Trailing padding to insulate the rest of the barrier from adjacent
89+
// data structures. The leading padding is not needed, as cell padding
90+
// handles this for us.
91+
DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);
4292

4393
NONCOPYABLE(GenericWaitBarrier);
4494

45-
int wake_if_needed();
95+
Cell& tag_to_cell(int tag) { return _cells[tag & (CELLS_COUNT - 1)]; }
4696

47-
public:
48-
GenericWaitBarrier() : _barrier_tag(0), _waiters(0), _barrier_threads(0), _sem_barrier(0) {}
97+
public:
98+
GenericWaitBarrier() : _cells(), _barrier_tag(0) {}
4999
~GenericWaitBarrier() {}
50100

51-
const char* description() { return "semaphore"; }
101+
const char* description() { return "striped semaphore"; }
52102

53103
void arm(int barrier_tag);
54104
void disarm();

0 commit comments

Comments
 (0)