Skip to content

Commit 5e89985

Browse files
committed
bufmgr: Don't lock buffer header in StrategyGetBuffer()
Previously StrategyGetBuffer() acquired the buffer header spinlock for every buffer, whether it was reusable or not. If reusable, it'd be returned, with the lock held, to GetVictimBuffer(), which then would pin the buffer with PinBuffer_Locked(). That's somewhat violating the spirit of the guidelines for holding spinlocks (i.e. that they are only held for a few lines of consecutive code) and necessitates using PinBuffer_Locked(), which scales worse than PinBuffer() due to holding the spinlock. This alone makes it worth changing the code. However, the main reason to change this is that a future commit will make PinBuffer_Locked() slower (due to making UnlockBufHdr() slower), to gain scalability for the much more common case of pinning a pre-existing buffer. By pinning the buffer with a single atomic operation, iff the buffer is reusable, we avoid any potential regression for miss-heavy workloads. There are strictly fewer atomic operations for each potential buffer after this change. The price for this improvement is that freelist.c needs two CAS loops and needs to be able to set up the resource accounting for pinned buffers. The latter is achieved by exposing a new function for that purpose from bufmgr.c; that seems better than exposing the entire private refcount infrastructure. The improvement seems worth the complexity. Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Matthias van de Meent <boekewurm+postgres@gmail.com> Discussion: https://postgr.es/m/fvfmkr5kk4nyex56ejgxj3uzi63isfxovp2biecb4bspbjrze7@az2pljabhnff
1 parent 3baae90 commit 5e89985

File tree

3 files changed

+132
-66
lines changed

3 files changed

+132
-66
lines changed

src/backend/storage/buffer/bufmgr.c

Lines changed: 22 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -518,7 +518,6 @@ static void PinBuffer_Locked(BufferDesc *buf);
518518
static void UnpinBuffer(BufferDesc *buf);
519519
static void UnpinBufferNoOwner(BufferDesc *buf);
520520
static void BufferSync(int flags);
521-
static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
522521
static int SyncOneBuffer(int buf_id, bool skip_recently_used,
523522
WritebackContext *wb_context);
524523
static void WaitIO(BufferDesc *buf);
@@ -2326,8 +2325,8 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
23262325
bool from_ring;
23272326

23282327
/*
2329-
* Ensure, while the spinlock's not yet held, that there's a free refcount
2330-
* entry, and a resource owner slot for the pin.
2328+
* Ensure, before we pin a victim buffer, that there's a free refcount
2329+
* entry and resource owner slot for the pin.
23312330
*/
23322331
ReservePrivateRefCountEntry();
23332332
ResourceOwnerEnlarge(CurrentResourceOwner);
@@ -2336,17 +2335,12 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
23362335
again:
23372336

23382337
/*
2339-
* Select a victim buffer. The buffer is returned with its header
2340-
* spinlock still held!
2338+
* Select a victim buffer. The buffer is returned pinned and owned by
2339+
* this backend.
23412340
*/
23422341
buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
23432342
buf = BufferDescriptorGetBuffer(buf_hdr);
23442343

2345-
Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2346-
2347-
/* Pin the buffer and then release the buffer spinlock */
2348-
PinBuffer_Locked(buf_hdr);
2349-
23502344
/*
23512345
* We shouldn't have any other pins for this buffer.
23522346
*/
@@ -3118,7 +3112,7 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
31183112
{
31193113
result = (buf_state & BM_VALID) != 0;
31203114

3121-
ref = NewPrivateRefCountEntry(b);
3115+
TrackNewBufferPin(b);
31223116

31233117
/*
31243118
* Assume that we acquired a buffer pin for the purposes of
@@ -3150,11 +3144,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
31503144
* cannot meddle with that.
31513145
*/
31523146
result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3147+
3148+
Assert(ref->refcount > 0);
3149+
ref->refcount++;
3150+
ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
31533151
}
31543152

3155-
ref->refcount++;
3156-
Assert(ref->refcount > 0);
3157-
ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
31583153
return result;
31593154
}
31603155

@@ -3183,8 +3178,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
31833178
static void
31843179
PinBuffer_Locked(BufferDesc *buf)
31853180
{
3186-
Buffer b;
3187-
PrivateRefCountEntry *ref;
31883181
uint32 buf_state;
31893182

31903183
/*
@@ -3209,12 +3202,7 @@ PinBuffer_Locked(BufferDesc *buf)
32093202
buf_state += BUF_REFCOUNT_ONE;
32103203
UnlockBufHdr(buf, buf_state);
32113204

3212-
b = BufferDescriptorGetBuffer(buf);
3213-
3214-
ref = NewPrivateRefCountEntry(b);
3215-
ref->refcount++;
3216-
3217-
ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3205+
TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
32183206
}
32193207

32203208
/*
@@ -3332,6 +3320,17 @@ UnpinBufferNoOwner(BufferDesc *buf)
33323320
}
33333321
}
33343322

3323+
inline void
3324+
TrackNewBufferPin(Buffer buf)
3325+
{
3326+
PrivateRefCountEntry *ref;
3327+
3328+
ref = NewPrivateRefCountEntry(buf);
3329+
ref->refcount++;
3330+
3331+
ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
3332+
}
3333+
33353334
#define ST_SORT sort_checkpoint_bufferids
33363335
#define ST_ELEMENT_TYPE CkptSortItem
33373336
#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
@@ -6288,7 +6287,7 @@ LockBufHdr(BufferDesc *desc)
62886287
* Obviously the buffer could be locked by the time the value is returned, so
62896288
* this is primarily useful in CAS style loops.
62906289
*/
6291-
static uint32
6290+
pg_noinline uint32
62926291
WaitBufHdrUnlocked(BufferDesc *buf)
62936292
{
62946293
SpinDelayStatus delayStatus;

src/backend/storage/buffer/freelist.c

Lines changed: 106 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -159,21 +159,23 @@ ClockSweepTick(void)
159159
* StrategyGetBuffer
160160
*
161161
* Called by the bufmgr to get the next candidate buffer to use in
162-
* BufferAlloc(). The only hard requirement BufferAlloc() has is that
162+
* GetVictimBuffer(). The only hard requirement GetVictimBuffer() has is that
163163
* the selected buffer must not currently be pinned by anyone.
164164
*
165165
* strategy is a BufferAccessStrategy object, or NULL for default strategy.
166166
*
167-
* To ensure that no one else can pin the buffer before we do, we must
168-
* return the buffer with the buffer header spinlock still held.
167+
* It is the callers responsibility to ensure the buffer ownership can be
168+
* tracked via TrackNewBufferPin().
169+
*
170+
* The buffer is pinned and marked as owned, using TrackNewBufferPin(),
171+
* before returning.
169172
*/
170173
BufferDesc *
171174
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
172175
{
173176
BufferDesc *buf;
174177
int bgwprocno;
175178
int trycounter;
176-
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
177179

178180
*from_ring = false;
179181

@@ -228,44 +230,79 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
228230
trycounter = NBuffers;
229231
for (;;)
230232
{
233+
uint32 old_buf_state;
234+
uint32 local_buf_state;
235+
231236
buf = GetBufferDescriptor(ClockSweepTick());
232237

233238
/*
234-
* If the buffer is pinned or has a nonzero usage_count, we cannot use
235-
* it; decrement the usage_count (unless pinned) and keep scanning.
239+
* Check whether the buffer can be used and pin it if so. Do this
240+
* using a CAS loop, to avoid having to lock the buffer header.
236241
*/
237-
local_buf_state = LockBufHdr(buf);
238-
239-
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
242+
old_buf_state = pg_atomic_read_u32(&buf->state);
243+
for (;;)
240244
{
245+
local_buf_state = old_buf_state;
246+
247+
/*
248+
* If the buffer is pinned or has a nonzero usage_count, we cannot
249+
* use it; decrement the usage_count (unless pinned) and keep
250+
* scanning.
251+
*/
252+
253+
if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
254+
{
255+
if (--trycounter == 0)
256+
{
257+
/*
258+
* We've scanned all the buffers without making any state
259+
* changes, so all the buffers are pinned (or were when we
260+
* looked at them). We could hope that someone will free
261+
* one eventually, but it's probably better to fail than
262+
* to risk getting stuck in an infinite loop.
263+
*/
264+
elog(ERROR, "no unpinned buffers available");
265+
}
266+
break;
267+
}
268+
269+
if (unlikely(local_buf_state & BM_LOCKED))
270+
{
271+
old_buf_state = WaitBufHdrUnlocked(buf);
272+
continue;
273+
}
274+
241275
if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
242276
{
243277
local_buf_state -= BUF_USAGECOUNT_ONE;
244278

245-
trycounter = NBuffers;
279+
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
280+
local_buf_state))
281+
{
282+
trycounter = NBuffers;
283+
break;
284+
}
246285
}
247286
else
248287
{
249-
/* Found a usable buffer */
250-
if (strategy != NULL)
251-
AddBufferToRing(strategy, buf);
252-
*buf_state = local_buf_state;
253-
return buf;
288+
/* pin the buffer if the CAS succeeds */
289+
local_buf_state += BUF_REFCOUNT_ONE;
290+
291+
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
292+
local_buf_state))
293+
{
294+
/* Found a usable buffer */
295+
if (strategy != NULL)
296+
AddBufferToRing(strategy, buf);
297+
*buf_state = local_buf_state;
298+
299+
TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
300+
301+
return buf;
302+
}
254303
}
304+
255305
}
256-
else if (--trycounter == 0)
257-
{
258-
/*
259-
* We've scanned all the buffers without making any state changes,
260-
* so all the buffers are pinned (or were when we looked at them).
261-
* We could hope that someone will free one eventually, but it's
262-
* probably better to fail than to risk getting stuck in an
263-
* infinite loop.
264-
*/
265-
UnlockBufHdr(buf, local_buf_state);
266-
elog(ERROR, "no unpinned buffers available");
267-
}
268-
UnlockBufHdr(buf, local_buf_state);
269306
}
270307
}
271308

@@ -614,13 +651,15 @@ FreeAccessStrategy(BufferAccessStrategy strategy)
614651
* GetBufferFromRing -- returns a buffer from the ring, or NULL if the
615652
* ring is empty / not usable.
616653
*
617-
* The bufhdr spin lock is held on the returned buffer.
654+
* The buffer is pinned and marked as owned, using TrackNewBufferPin(), before
655+
* returning.
618656
*/
619657
static BufferDesc *
620658
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
621659
{
622660
BufferDesc *buf;
623661
Buffer bufnum;
662+
uint32 old_buf_state;
624663
uint32 local_buf_state; /* to avoid repeated (de-)referencing */
625664

626665

@@ -637,24 +676,48 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
637676
if (bufnum == InvalidBuffer)
638677
return NULL;
639678

679+
buf = GetBufferDescriptor(bufnum - 1);
680+
640681
/*
641-
* If the buffer is pinned we cannot use it under any circumstances.
642-
*
643-
* If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
644-
* since our own previous usage of the ring element would have left it
645-
* there, but it might've been decremented by clock-sweep since then). A
646-
* higher usage_count indicates someone else has touched the buffer, so we
647-
* shouldn't re-use it.
682+
* Check whether the buffer can be used and pin it if so. Do this using a
683+
* CAS loop, to avoid having to lock the buffer header.
648684
*/
649-
buf = GetBufferDescriptor(bufnum - 1);
650-
local_buf_state = LockBufHdr(buf);
651-
if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
652-
&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
685+
old_buf_state = pg_atomic_read_u32(&buf->state);
686+
for (;;)
653687
{
654-
*buf_state = local_buf_state;
655-
return buf;
688+
local_buf_state = old_buf_state;
689+
690+
/*
691+
* If the buffer is pinned we cannot use it under any circumstances.
692+
*
693+
* If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
694+
* since our own previous usage of the ring element would have left it
695+
* there, but it might've been decremented by clock-sweep since then).
696+
* A higher usage_count indicates someone else has touched the buffer,
697+
* so we shouldn't re-use it.
698+
*/
699+
if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
700+
|| BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
701+
break;
702+
703+
if (unlikely(local_buf_state & BM_LOCKED))
704+
{
705+
old_buf_state = WaitBufHdrUnlocked(buf);
706+
continue;
707+
}
708+
709+
/* pin the buffer if the CAS succeeds */
710+
local_buf_state += BUF_REFCOUNT_ONE;
711+
712+
if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
713+
local_buf_state))
714+
{
715+
*buf_state = local_buf_state;
716+
717+
TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
718+
return buf;
719+
}
656720
}
657-
UnlockBufHdr(buf, local_buf_state);
658721

659722
/*
660723
* Tell caller to allocate a new buffer with the normal allocation

src/include/storage/buf_internals.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -371,6 +371,8 @@ UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
371371
pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
372372
}
373373

374+
extern uint32 WaitBufHdrUnlocked(BufferDesc *buf);
375+
374376
/* in bufmgr.c */
375377

376378
/*
@@ -425,6 +427,8 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co
425427
extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
426428
IOContext io_context, BufferTag *tag);
427429

430+
extern void TrackNewBufferPin(Buffer buf);
431+
428432
/* solely to make it easier to write tests */
429433
extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
430434
extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,

0 commit comments

Comments (0)