This repository has been archived by the owner on Feb 21, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 231
/
snapshotqueue.go
508 lines (477 loc) · 16.7 KB
/
snapshotqueue.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
// Copyright 2019 Pilosa Corp.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pilosa
import (
"context"
"fmt"
"io"
"math/bits"
"os"
"sync"
"sync/atomic"
"time"
"github.com/pilosa/pilosa/v2/logger"
"github.com/pilosa/pilosa/v2/testhook"
"github.com/pkg/errors"
)
// snapshotQueue is a thing which can handle enqueuing snapshots. A snapshot
// queue distinguishes between high-priority requests, which get satisfied
// by the next available worker, and regular requests, which get enqueued
// if there's space in the queue, and otherwise dropped. There's also a
// separate background task to scan a holder for fragments which may need
// snapshots, but which is processed only when the queue is empty, and only
// slowly. "Await" awaits an existing snapshot if one is already enqueued.
// "Immediate" tries to do one right away. (If one's already enqueued, this
// can leave it in the queue, which will ignore anything that shows up with
// the request flag cleared.)
//
// Await, Enqueue, and Immediate should be called only with the fragment lock
// held.
//
// If you create a queue, it should get stopped at some point. The
// atomicSnapshotQueue implementation used as defaultSnapshotQueue has
// a Start function which will tell you whether it actually started a
// queue. This logic exists because in a normal server case, you probably
// want the queue to be shut down as part of server shutdown, but if you're
// running cluster tests, you probably want to start and shop the queue as
// part of the test, not stop it when any server terminates.
//
// It's less likely to be desireable to start/stop individual queues,
// because fragments use the defaultSnapshotQueue anyway. This design
// needs revisiting.
type SnapshotQueue interface {
Immediate(*fragment) error
Enqueue(*fragment)
Await(*fragment) error
ScanHolder(*Holder, chan struct{})
Stop()
}
// queuelessSnapshotQueue isn't a snapshot queue, but it satisfies the
// interface.
type queuelessSnapshotQueue struct{}
func (q *queuelessSnapshotQueue) Enqueue(f *fragment) {
// We don't actually try to enqueue the snapshot; it breaks things
// if a snapshot gets caused during a transaction.
}
func (q *queuelessSnapshotQueue) Await(f *fragment) error {
return nil
}
func (q *queuelessSnapshotQueue) Immediate(f *fragment) error {
return f.snapshot()
}
func (q *queuelessSnapshotQueue) ScanHolder(h *Holder, done chan struct{}) {
}
func (q *queuelessSnapshotQueue) Stop() {
}
var defaultSnapshotQueue = &queuelessSnapshotQueue{}
// newSnapshotQueue makes a new snapshot queue, of depth N, with
// w worker threads.
func newSnapshotQueue(n int, w int, l logger.Logger) SnapshotQueue {
ctx, cancel := context.WithCancel(context.Background())
sq := &prioritySnapshotQueue{
normal: make(chan snapshotRequest, n),
urgent: make(chan snapshotRequest),
background: make(chan snapshotRequest),
ctx: ctx,
cancel: cancel,
maxOpN: 10000,
logger: l,
}
if sq.logger == nil {
sq.logger = logger.NewStandardLogger(os.Stderr)
}
_ = testhook.Opened(NewAuditor(), sq, nil)
sq.spawnWorkers(w)
return sq
}
type snapshotRequest struct {
frag *fragment
when time.Time
}
// prioritySnapshotQueue gives preference to "immediate" requests, and
// dispreference to "background" requests from ScanHolder. It timestamps
// requests, so it can discard a request if the most recent snapshot is
// newer than the request. The snapshotPending flag in the fragment is
// used to track that a given fragment thinks it has been successfully
// enqueued. Background requests are not considered enqueued, since
// they'll never get processed if there's anything else. In normal workloads,
// immediate/urgent snapshots should be rare, but we'll happily drop
// most requests on the floor; the scanner should pick them up once things
// are quiet.
type prioritySnapshotQueue struct {
logger logger.Logger
urgent chan snapshotRequest
normal chan snapshotRequest
background chan snapshotRequest
ctx context.Context
cancel context.CancelFunc
mu sync.RWMutex
scanWG, workerWG sync.WaitGroup
maxOpN int
observedOpN [16]uint32
stats struct {
enqueued uint32
skipped uint32
}
stopped bool
}
func (sq *prioritySnapshotQueue) spawnWorkers(w int) {
sq.mu.Lock()
defer sq.mu.Unlock()
if sq.ctx.Err() != nil {
sq.logger.Printf("prioritySnapshotQueue worker: already done")
return
}
sq.workerWG.Add(w)
for i := 0; i < w; i++ {
go sq.worker(sq.ctx, sq.urgent, sq.normal, sq.background)
}
}
func (sq *prioritySnapshotQueue) worker(ctx context.Context, urgent, normal, background chan snapshotRequest) {
defer sq.workerWG.Done()
done := ctx.Done()
ok := true
var req snapshotRequest
for ok {
req.frag = nil
select {
case _, ok = <-done:
case req, ok = <-urgent:
default:
select {
case _, ok = <-done:
case req, ok = <-urgent:
case req, ok = <-normal:
default:
select {
case _, ok = <-done:
case req, ok = <-urgent:
case req, ok = <-normal:
case req, ok = <-background:
}
}
}
if req.frag != nil {
sq.process(req)
}
}
}
// process actually runs a fragment. it will do this if either the fragment
// has a pending snapshot, or the force flag is set.
func (sq *prioritySnapshotQueue) process(req snapshotRequest) {
f := req.frag
f.mu.Lock()
defer f.mu.Unlock()
if f.snapshotStamp.Before(req.when) {
f.snapshotErr = f.snapshot()
if f.snapshotErr != nil {
fmt.Printf("snapshot error: %v\n", f.snapshotErr)
sq.logger.Printf("snapshot error: %v", f.snapshotErr)
}
f.snapshotPending = false
f.snapshotCond.Broadcast()
}
}
// Stop shuts down the snapshot queue. It first marks it as done, causing
// the background scanner(s), if any, to shut down, then waits for them, then
// closes and nils the queues. The background scanner has to get stopped
// because otherwise it might try to write to those closed queues.
func (sq *prioritySnapshotQueue) Stop() {
sq.mu.Lock()
defer sq.mu.Unlock()
if sq.stopped {
return
}
sq.stopped = true
sq.cancel()
// scanners need to be done before we close the other channels.
sq.scanWG.Wait()
close(sq.normal)
sq.normal = nil
close(sq.urgent)
sq.urgent = nil
close(sq.background)
sq.background = nil
_ = testhook.Closed(NewAuditor(), sq, nil)
enqueued := atomic.LoadUint32(&sq.stats.enqueued)
skipped := atomic.LoadUint32(&sq.stats.skipped)
if skipped > 0 || enqueued > 1 {
sq.logger.Printf("snapshot queue: enqueued %d, skipped %d\n", sq.stats.enqueued, sq.stats.skipped)
}
}
// Enqueue tries to add a fragment to the queue, if the fragment is not already
// enqueued. You should hold a lock on the fragment when calling this.
func (sq *prioritySnapshotQueue) Enqueue(f *fragment) {
if f.snapshotPending {
return
}
sq.observeOpN(uint32(f.opN))
sq.mu.RLock()
defer sq.mu.RUnlock()
if sq.normal == nil {
sq.logger.Printf("requested snapshot after snapshot queue was closed")
return
}
// we have to set this before enqueing, because it's
// otherwise possible that we're at the head of the queue,
// and the recipient gets the fragment before we execute the
// line after the send.
f.snapshotPending = true
// try to enqueue snapshot
select {
case sq.normal <- snapshotRequest{frag: f, when: time.Now()}:
atomic.AddUint32(&sq.stats.enqueued, 1)
return
default:
atomic.AddUint32(&sq.stats.skipped, 1)
f.snapshotPending = false
return
}
}
// Await returns when f is not pending a snapshot. Call with the fragment lock
// held. Await waits on a condition variable inside f, associated with the
// fragment's lock, so this does not conflict with the lock being used for
// snapshots.
//
// Note that workers don't stop just because the queue's been stopped; only
// the background scanner is stopped. So an Await shouldn't block forever
// even if the queue gets shut down. If you're reading this, possibly that
// analysis is incorrect.
func (sq *prioritySnapshotQueue) Await(f *fragment) (err error) {
for f.snapshotPending {
f.snapshotCond.Wait()
}
err, f.snapshotErr = f.snapshotErr, nil
return err
}
// Immediate forces an immediate snapshot of the given fragment. Call with
// the fragment locked. If the queue is already closing, the fragment does
// not get snapshotted.
func (sq *prioritySnapshotQueue) Immediate(f *fragment) error {
sq.mu.RLock()
// no deferred unlock, because we want to unlock this before calling Await.
// Not because that needs this lock, but because once we're that far, we
// *don't* need this lock anymore so someone else should have it.
if sq.urgent == nil {
sq.mu.RUnlock()
sq.logger.Printf("requested immediate snapshot after snapshot queue was closed")
return errors.New("requested immediate snapshot after snapshot queue was closed")
}
f.snapshotPending = true
sq.observeOpN(uint32(f.opN))
req := snapshotRequest{frag: f, when: time.Now()}
// if the fragment was already in the work queue, it's *possible*
// that the only available worker just picked it off the queue, and
// is now waiting on getting the fragment's lock, so it can run
// a snapshot. So we let go of the lock on the fragment, send the
// request, then request the fragment lock again, because Await will
// be sleeping on the condition variable associated with the lock,
// which means it needs to hold the lock so it can let it go during
// the wait... No, really, this made sense.
f.mu.Unlock()
sq.urgent <- req
sq.mu.RUnlock()
f.mu.Lock()
return sq.Await(f)
}
// ScanHolder spawns a goroutine which iterates through the holder's
// indexes/fields/views/fragments, looking for fragments which have OpN
// high enough to justify a snapshot but don't seem to have one pending.
// It then dumps these in the low priority background queue.
func (sq *prioritySnapshotQueue) ScanHolder(h *Holder, done chan struct{}) {
sq.mu.Lock()
sq.scanWG.Add(1)
go sq.scanHolderWorker(h, sq.background, done)
sq.mu.Unlock()
}
// observeOpN reports that a given value of opN was "observed", meaning,
// we encountered a fragment which had that value. This happens for every
// enqueue/immediate, including enqueue attempts which fail to actually
// enter the queue, and it also happens for fragments noticed by the background
// scan but which don't have high enough opN to trigger a snapshot.
func (sq *prioritySnapshotQueue) observeOpN(n uint32) {
// aka "log2(n) + 1", or 0 for n==0
pow2 := 32 - bits.LeadingZeros32(n)
// 15 == 16384. Our usual fragment maxOpN is 10k, so most fragments
// should end up in the 8k-16k bucket, rather than the 16k+ bucket,
// unless we've got a lot of ingests with large batches going on,
// in which case the 16k bucket will win.
if pow2 > 15 {
pow2 = 15
}
// store in inverse order so the lowest slot in the array is the
// highest cardinality
atomic.AddUint32(&sq.observedOpN[15-pow2], 1)
}
// computeMaxOpN tries to pick a reasonable new maxOpN for the background
// scan to use. On a quiet system, we want to gradually lower opN, picking
// the fragments with the highest opN values first, because those offer the
// largest benefit. So, whenever we check a fragment in the background, if we
// *don't* snapshot it, we'll "observe" its OpN value, and then we pick a
// value which picks up at least 1/4 of them.
//
// If there's ingest activity, the Immediate and Enqueue operations will
// "observe" the OpN of fragments submitted to them. This can drive OpN back
// up, if those fragments frequently have very high opN values, which reflects
// the fact that we have enough of that activity that we don't need the
// background scanner adding more.
//
// If we have enough ingest activity that the background scanner never actually
// gets to submit work, we'll rarely get here, because the background scanner
// will block until there's no snapshots pending for the normal workload.
// When we do, we'll probably pick a MaxOpN which is dominated by the ingest
// workload's opN values. So for instance, if everything coming in from the
// ingest workload has 10k or more items, because that's the default fragment
// maxOpN, that will probably set the background snapshot queue value to 8k.
func (sq *prioritySnapshotQueue) computeMaxOpN() {
sq.logger.Debugf("observedOpN by power of 2: %d\n", sq.observedOpN[:])
total := uint32(0)
for i := range sq.observedOpN {
total += atomic.LoadUint32(&sq.observedOpN[i])
}
target := (total / 4) + 1
subTotal := uint32(0)
for i := range sq.observedOpN {
v := atomic.LoadUint32(&sq.observedOpN[i])
subTotal += v
if subTotal >= target {
prevMaxOpN := sq.maxOpN
sq.maxOpN = (1 << (15 - uint(i))) / 2
if sq.maxOpN > 0 {
sq.maxOpN--
}
if prevMaxOpN != sq.maxOpN {
sq.logger.Printf("background scan: %d/%d fragments considered have opN %d or higher\n",
subTotal, total, sq.maxOpN)
}
break
}
}
// It's conceptually possible that we'll miss a couple of observations
// here but that's not really important. This is all pretty approximate.
for i := range sq.observedOpN {
atomic.StoreUint32(&sq.observedOpN[i], 0)
}
}
// prioritySnapshotQueueScanner is the data type that implements HolderOperator
// and represents a single scan of a holder, with a given maxOpN.
type prioritySnapshotQueueScanner struct {
HolderFilterAll
HolderProcessNone
sq *prioritySnapshotQueue
holder *Holder
queue chan snapshotRequest
ctx context.Context
maxOpN int
seen, hits, counter int
}
func (s *prioritySnapshotQueueScanner) ProcessFragment(f *fragment) error {
if f == nil {
return nil
}
s.seen++
// we can't defer this reasonably, because otherwise we'll keep
// the fragment locked forever if we end up trying to send it
// to the queue, but the workers are busy on other fragments.
f.mu.Lock()
open := f.open
snapshotPending, opN := f.snapshotPending, f.opN
f.mu.Unlock()
// a pending snapshot is one that is either in the normal or
// immediate queue, or is trying to get into the normal queue
// and about to fail, but either way, it already got observed
// there, so we don't need to observe it here. A closed fragment
// doesn't matter to us -- it should be a transient state that
// happens during a shutdown, or shouldn't happen, but we don't
// care about it.
if snapshotPending || !open {
return nil
}
if opN <= s.maxOpN {
// observe the value but don't do a snapshot
s.sq.observeOpN(uint32(opN))
s.counter++
if s.counter == 1000 {
select {
case <-time.After(1 * time.Second):
case <-s.ctx.Done():
return io.EOF
}
s.counter = 0
}
return nil
}
// we don't observe values when we decide to trigger a snapshot,
// because those values will be changing anyway. we could also
// observe them as zero, but that's also sort of wrong.
s.hits++
select {
case s.queue <- snapshotRequest{frag: f, when: time.Now()}:
s.sq.logger.Debugf("found fragment needing snapshot: %s\n", f.path)
case <-s.ctx.Done():
return io.EOF
}
return nil
}
func contextMergedWithStructChan(ctx context.Context, ch chan struct{}) (context.Context, context.CancelFunc) {
canCancel, cancel := context.WithCancel(ctx)
go func() {
select {
case <-ctx.Done():
cancel()
case <-ch:
cancel()
case <-canCancel.Done():
// don't need to cancel, but do need to exit this
// function
}
}()
return canCancel, cancel
}
// scanHolderWorker is a background task that scans a holder looking for
// fragments which need snapshots taken. It's the cleanup task for snapshots
// that would have been requested by Enqueue, but the queue was full.
func (sq *prioritySnapshotQueue) scanHolderWorker(h *Holder, background chan snapshotRequest, done chan struct{}) {
defer sq.scanWG.Done()
ctx, cancel := contextMergedWithStructChan(sq.ctx, done)
defer cancel()
scanner := &prioritySnapshotQueueScanner{
sq: sq,
holder: h,
queue: background,
ctx: sq.ctx,
maxOpN: sq.maxOpN,
}
for {
err := h.Process(ctx, scanner)
if err != nil {
return
}
if scanner.hits > 0 {
sq.logger.Printf("background scan: %d/%d fragments needed snapshots\n", scanner.hits, scanner.seen)
scanner.hits = 0
} else {
sq.logger.Debugf("background scan: no fragments needed snapshots, waiting\n")
// No reason to be active if we're not finding anything.
select {
case <-time.After(60 * time.Second):
case <-ctx.Done():
return
}
}
scanner.seen = 0
sq.computeMaxOpN()
scanner.maxOpN = sq.maxOpN
}
}