-
Notifications
You must be signed in to change notification settings - Fork 5.5k
/
bt_slvg.c
2521 lines (2250 loc) · 73.7 KB
/
bt_slvg.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*-
* Copyright (c) 2014-2018 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
#include "wt_internal.h"
struct __wt_stuff; typedef struct __wt_stuff WT_STUFF;
struct __wt_track; typedef struct __wt_track WT_TRACK;
struct __wt_track_shared; typedef struct __wt_track_shared WT_TRACK_SHARED;
/*
* There's a bunch of stuff we pass around during salvage, group it together
* to make the code prettier.
*/
struct __wt_stuff {
WT_SESSION_IMPL *session; /* Salvage session */
WT_TRACK **pages; /* Pages */
uint32_t pages_next; /* Next empty slot */
size_t pages_allocated; /* Bytes allocated */
WT_TRACK **ovfl; /* Overflow pages */
uint32_t ovfl_next; /* Next empty slot */
size_t ovfl_allocated; /* Bytes allocated */
WT_REF root_ref; /* Created root page */
uint8_t page_type; /* Page type */
/* If need to free blocks backing merged page ranges. */
bool merge_free;
WT_ITEM *tmp1; /* Verbose print buffer */
WT_ITEM *tmp2; /* Verbose print buffer */
uint64_t fcnt; /* Progress counter */
};
/*
* WT_TRACK_SHARED --
* Information shared between pages being merged.
*/
struct __wt_track_shared {
uint32_t ref; /* Reference count */
/*
* Physical information about the file block.
*/
WT_ADDR addr; /* Page address */
uint32_t size; /* Page size */
uint64_t gen; /* Page generation */
/*
* Pages that reference overflow pages contain a list of the overflow
* pages they reference. We start out with a list of addresses, and
* convert to overflow array slots during the reconciliation of page
* references to overflow records.
*/
WT_ADDR *ovfl_addr; /* Overflow pages by address */
uint32_t *ovfl_slot; /* Overflow pages by slot */
uint32_t ovfl_cnt; /* Overflow reference count */
};
/*
* WT_TRACK --
* Structure to track chunks, one per chunk; we start out with a chunk per
* page (either leaf or overflow), but when we find overlapping key ranges, we
* split the leaf page chunks up, one chunk for each unique key range.
*/
struct __wt_track {
#define trk_addr shared->addr.addr
#define trk_addr_size shared->addr.size
#define trk_gen shared->gen
#define trk_ovfl_addr shared->ovfl_addr
#define trk_ovfl_cnt shared->ovfl_cnt
#define trk_ovfl_slot shared->ovfl_slot
#define trk_size shared->size
WT_TRACK_SHARED *shared; /* Shared information */
WT_STUFF *ss; /* Enclosing stuff */
union {
struct {
#undef row_start
#define row_start u.row._row_start
WT_ITEM _row_start; /* Row-store start range */
#undef row_stop
#define row_stop u.row._row_stop
WT_ITEM _row_stop; /* Row-store stop range */
} row;
struct {
#undef col_start
#define col_start u.col._col_start
uint64_t _col_start; /* Col-store start range */
#undef col_stop
#define col_stop u.col._col_stop
uint64_t _col_stop; /* Col-store stop range */
#undef col_missing
#define col_missing u.col._col_missing
uint64_t _col_missing; /* Col-store missing range */
} col;
} u;
/* AUTOMATIC FLAG VALUE GENERATION START */
#define WT_TRACK_CHECK_START 0x1u /* Row: initial key updated */
#define WT_TRACK_CHECK_STOP 0x2u /* Row: last key updated */
#define WT_TRACK_MERGE 0x4u /* Page requires merging */
#define WT_TRACK_OVFL_REFD 0x8u /* Overflow page referenced */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
u_int flags;
};
static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *);
static int __slvg_col_ovfl(WT_SESSION_IMPL *,
WT_TRACK *, WT_PAGE *, uint64_t, uint64_t, uint64_t);
static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *);
static void __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_col_range_overlap(
WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
static void __slvg_col_trk_update_start(uint32_t, WT_STUFF *);
static int __slvg_merge_block_free(WT_SESSION_IMPL *, WT_STUFF *);
static int WT_CDECL __slvg_ovfl_compare(const void *, const void *);
static int __slvg_ovfl_discard(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_ovfl_reconcile(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_ovfl_ref(WT_SESSION_IMPL *, WT_TRACK *, bool);
static int __slvg_ovfl_ref_all(WT_SESSION_IMPL *, WT_TRACK *);
static int __slvg_read(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_row_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *);
static int __slvg_row_build_leaf(
WT_SESSION_IMPL *, WT_TRACK *, WT_REF *, WT_STUFF *);
static int __slvg_row_ovfl(
WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint32_t, uint32_t);
static int __slvg_row_range(WT_SESSION_IMPL *, WT_STUFF *);
static int __slvg_row_range_overlap(
WT_SESSION_IMPL *, uint32_t, uint32_t, WT_STUFF *);
static int __slvg_row_trk_update_start(
WT_SESSION_IMPL *, WT_ITEM *, uint32_t, WT_STUFF *);
static int WT_CDECL __slvg_trk_compare_addr(const void *, const void *);
static int WT_CDECL __slvg_trk_compare_gen(const void *, const void *);
static int WT_CDECL __slvg_trk_compare_key(const void *, const void *);
static int __slvg_trk_free(WT_SESSION_IMPL *, WT_TRACK **, bool);
static void __slvg_trk_free_addr(WT_SESSION_IMPL *, WT_TRACK *);
static int __slvg_trk_init(WT_SESSION_IMPL *, uint8_t *,
size_t, uint32_t, uint64_t, WT_STUFF *, WT_TRACK **);
static int __slvg_trk_leaf(WT_SESSION_IMPL *,
const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
static int __slvg_trk_leaf_ovfl(
WT_SESSION_IMPL *, const WT_PAGE_HEADER *, WT_TRACK *);
static int __slvg_trk_ovfl(WT_SESSION_IMPL *,
const WT_PAGE_HEADER *, uint8_t *, size_t, WT_STUFF *);
/*
* __wt_bt_salvage --
* Salvage a Btree.
*/
int
__wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[])
{
WT_BM *bm;
WT_BTREE *btree;
WT_DECL_RET;
WT_STUFF *ss, stuff;
uint32_t i, leaf_cnt;
WT_UNUSED(cfg);
btree = S2BT(session);
bm = btree->bm;
WT_CLEAR(stuff);
ss = &stuff;
ss->session = session;
ss->page_type = WT_PAGE_INVALID;
/* Allocate temporary buffers. */
WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp1));
WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2));
/*
* Step 1:
* Inform the underlying block manager that we're salvaging the file.
*/
WT_ERR(bm->salvage_start(bm, session));
/*
* Step 2:
* Read the file and build in-memory structures that reference any leaf
* or overflow page. Any pages other than leaf or overflow pages are
* added to the free list.
*
* Turn off read checksum and verification error messages while we're
* reading the file, we expect to see corrupted blocks.
*/
F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
ret = __slvg_read(session, ss);
F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);
WT_ERR(ret);
/*
* Step 3:
* Discard any page referencing a non-existent overflow page. We do
* this before checking overlapping key ranges on the grounds that a
* bad key range we can use is better than a terrific key range that
* references pages we don't have. On the other hand, we subsequently
* discard key ranges where there are better overlapping ranges, and
* it would be better if we let the availability of an overflow value
* inform our choices as to the key ranges we select, ideally on a
* per-key basis.
*
* A complicating problem is found in variable-length column-store
* objects, where we potentially split key ranges within RLE units.
* For example, if there's a page with rows 15-20 and we later find
* row 17 with a larger LSN, the range splits into 3 chunks, 15-16,
* 17, and 18-20. If rows 15-20 were originally a single value (an
* RLE of 6), and that record is an overflow record, we end up with
* two chunks, both of which want to reference the same overflow value.
*
* Instead of the approach just described, we're first discarding any
* pages referencing non-existent overflow pages, then we're reviewing
* our key ranges and discarding any that overlap. We're doing it that
* way for a few reasons: absent corruption, missing overflow items are
* strong arguments the page was replaced (on the other hand, some kind
* of file corruption is probably why we're here); it's a significant
* amount of additional complexity to simultaneously juggle overlapping
* ranges and missing overflow items; finally, real-world applications
* usually don't have a lot of overflow items, as WiredTiger supports
* very large page sizes, overflow items shouldn't be common.
*
* Step 4:
* Add unreferenced overflow page blocks to the free list so they are
* reused immediately.
*/
WT_ERR(__slvg_ovfl_reconcile(session, ss));
WT_ERR(__slvg_ovfl_discard(session, ss));
/*
* Step 5:
* Walk the list of pages looking for overlapping ranges to resolve.
* If we find a range that needs to be resolved, set a global flag
* and a per WT_TRACK flag on the pages requiring modification.
*
* This requires sorting the page list by key, and secondarily by LSN.
*
* !!!
* It's vanishingly unlikely and probably impossible for fixed-length
* column-store files to have overlapping key ranges. It's possible
* for an entire key range to go missing (if a page is corrupted and
* lost), but because pages can't split, it shouldn't be possible to
* find pages where the key ranges overlap. That said, we check for
* it and clean up after it in reconciliation because it doesn't cost
* much and future column-store formats or operations might allow for
* fixed-length format ranges to overlap during salvage, and I don't
* want to have to retrofit the code later.
*/
__wt_qsort(ss->pages,
(size_t)ss->pages_next, sizeof(WT_TRACK *), __slvg_trk_compare_key);
if (ss->page_type == WT_PAGE_ROW_LEAF)
WT_ERR(__slvg_row_range(session, ss));
else
WT_ERR(__slvg_col_range(session, ss));
/*
* Step 6:
* We may have lost key ranges in column-store databases, that is, some
* part of the record number space is gone; look for missing ranges.
*/
switch (ss->page_type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
__slvg_col_range_missing(session, ss);
break;
case WT_PAGE_ROW_LEAF:
break;
}
/*
* Step 7:
* Build an internal page that references all of the leaf pages,
* and write it, as well as any merged pages, to the file.
*
* Count how many leaf pages we have (we could track this during the
* array shuffling/splitting, but that's a lot harder).
*/
for (leaf_cnt = i = 0; i < ss->pages_next; ++i)
if (ss->pages[i] != NULL)
++leaf_cnt;
if (leaf_cnt != 0)
switch (ss->page_type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
WT_WITH_PAGE_INDEX(session,
ret = __slvg_col_build_internal(
session, leaf_cnt, ss));
WT_ERR(ret);
break;
case WT_PAGE_ROW_LEAF:
WT_WITH_PAGE_INDEX(session,
ret = __slvg_row_build_internal(
session, leaf_cnt, ss));
WT_ERR(ret);
break;
}
/*
* Step 8:
* If we had to merge key ranges, we have to do a final pass through
* the leaf page array and discard file pages used during key merges.
* We can't do it earlier: if we free'd the leaf pages we're merging as
* we merged them, the write of subsequent leaf pages or the internal
* page might allocate those free'd file blocks, and if the salvage run
* subsequently fails, we'd have overwritten pages used to construct the
* final key range. In other words, if the salvage run fails, we don't
* want to overwrite data the next salvage run might need.
*/
if (ss->merge_free)
WT_ERR(__slvg_merge_block_free(session, ss));
/*
* Step 9:
* Evict the newly created root page, creating a checkpoint.
*/
if (ss->root_ref.page != NULL) {
btree->ckpt = ckptbase;
ret = __wt_evict(session, &ss->root_ref, true, WT_REF_MEM);
ss->root_ref.page = NULL;
btree->ckpt = NULL;
}
/*
* Step 10:
* Inform the underlying block manager that we're done.
*/
err: WT_TRET(bm->salvage_end(bm, session));
/* Discard any root page we created. */
if (ss->root_ref.page != NULL)
__wt_ref_out(session, &ss->root_ref);
/* Discard the leaf and overflow page memory. */
WT_TRET(__slvg_cleanup(session, ss));
/* Discard temporary buffers. */
__wt_scr_free(session, &ss->tmp1);
__wt_scr_free(session, &ss->tmp2);
return (ret);
}
/*
* __slvg_read --
* Read the file and build a table of the pages we can use.
*/
static int
__slvg_read(WT_SESSION_IMPL *session, WT_STUFF *ss)
{
WT_BM *bm;
WT_DECL_ITEM(as);
WT_DECL_ITEM(buf);
WT_DECL_RET;
const WT_PAGE_HEADER *dsk;
size_t addr_size;
uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];
bool eof, valid;
bm = S2BT(session)->bm;
WT_ERR(__wt_scr_alloc(session, 0, &as));
WT_ERR(__wt_scr_alloc(session, 0, &buf));
for (;;) {
/* Get the next block address from the block manager. */
WT_ERR(bm->salvage_next(bm, session, addr, &addr_size, &eof));
if (eof)
break;
/* Report progress occasionally. */
#define WT_SALVAGE_PROGRESS_INTERVAL 100
if (++ss->fcnt % WT_SALVAGE_PROGRESS_INTERVAL == 0)
WT_ERR(__wt_progress(session, NULL, ss->fcnt));
/*
* Read (and potentially decompress) the block; the underlying
* block manager might return only good blocks if checksums are
* configured, or both good and bad blocks if we're relying on
* compression.
*
* Report the block's status to the block manager.
*/
if ((ret = __wt_bt_read(session, buf, addr, addr_size)) == 0)
valid = true;
else {
valid = false;
if (ret == WT_ERROR)
ret = 0;
WT_ERR(ret);
}
WT_ERR(bm->salvage_valid(bm, session, addr, addr_size, valid));
if (!valid)
continue;
/* Create a printable version of the address. */
WT_ERR(bm->addr_string(bm, session, as, addr, addr_size));
/*
* Make sure it's an expected page type for the file.
*
* We only care about leaf and overflow pages from here on out;
* discard all of the others. We put them on the free list now,
* because we might as well overwrite them, we want the file to
* grow as little as possible, or shrink, and future salvage
* calls don't need them either.
*/
dsk = buf->data;
switch (dsk->type) {
case WT_PAGE_BLOCK_MANAGER:
case WT_PAGE_COL_INT:
case WT_PAGE_ROW_INT:
__wt_verbose(session, WT_VERB_SALVAGE,
"%s page ignored %s",
__wt_page_type_string(dsk->type),
(const char *)as->data);
WT_ERR(bm->free(bm, session, addr, addr_size));
continue;
}
/*
* Verify the page. It's unlikely a page could have a valid
* checksum and still be broken, but paranoia is healthy in
* salvage. Regardless, verify does return failure because
* it detects failures we'd expect to see in a corrupted file,
* like overflow references past the end of the file or
* overflow references to non-existent pages, might as well
* discard these pages now.
*/
if (__wt_verify_dsk(session, as->data, buf) != 0) {
__wt_verbose(session, WT_VERB_SALVAGE,
"%s page failed verify %s",
__wt_page_type_string(dsk->type),
(const char *)as->data);
WT_ERR(bm->free(bm, session, addr, addr_size));
continue;
}
__wt_verbose(session, WT_VERB_SALVAGE,
"tracking %s page, generation %" PRIu64 " %s",
__wt_page_type_string(dsk->type), dsk->write_gen,
(const char *)as->data);
switch (dsk->type) {
case WT_PAGE_COL_FIX:
case WT_PAGE_COL_VAR:
case WT_PAGE_ROW_LEAF:
if (ss->page_type == WT_PAGE_INVALID)
ss->page_type = dsk->type;
if (ss->page_type != dsk->type)
WT_ERR_MSG(session, WT_ERROR,
"file contains multiple file formats (both "
"%s and %s), and cannot be salvaged",
__wt_page_type_string(ss->page_type),
__wt_page_type_string(dsk->type));
WT_ERR(__slvg_trk_leaf(
session, dsk, addr, addr_size, ss));
break;
case WT_PAGE_OVFL:
WT_ERR(__slvg_trk_ovfl(
session, dsk, addr, addr_size, ss));
break;
}
}
err: __wt_scr_free(session, &as);
__wt_scr_free(session, &buf);
return (ret);
}
/*
* __slvg_trk_init --
* Initialize tracking information for a page.
*/
static int
__slvg_trk_init(WT_SESSION_IMPL *session,
uint8_t *addr, size_t addr_size,
uint32_t size, uint64_t gen, WT_STUFF *ss, WT_TRACK **retp)
{
WT_DECL_RET;
WT_TRACK *trk;
WT_RET(__wt_calloc_one(session, &trk));
WT_ERR(__wt_calloc_one(session, &trk->shared));
trk->shared->ref = 1;
trk->ss = ss;
WT_ERR(__wt_memdup(session, addr, addr_size, &trk->trk_addr));
trk->trk_addr_size = (uint8_t)addr_size;
trk->trk_size = size;
trk->trk_gen = gen;
*retp = trk;
return (0);
err: __wt_free(session, trk->trk_addr);
__wt_free(session, trk->shared);
__wt_free(session, trk);
return (ret);
}
/*
* __slvg_trk_leaf --
* Track a leaf page.
*/
static int
__slvg_trk_leaf(WT_SESSION_IMPL *session,
const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
{
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
WT_DECL_RET;
WT_PAGE *page;
WT_TRACK *trk;
uint64_t stop_recno;
uint32_t i;
btree = S2BT(session);
unpack = &_unpack;
page = NULL;
trk = NULL;
/* Re-allocate the array of pages, as necessary. */
WT_RET(__wt_realloc_def(
session, &ss->pages_allocated, ss->pages_next + 1, &ss->pages));
/* Allocate a WT_TRACK entry for this new page and fill it in. */
WT_RET(__slvg_trk_init(
session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
switch (dsk->type) {
case WT_PAGE_COL_FIX:
/*
* Column-store fixed-sized format: start and stop keys can be
* taken from the block's header, and doesn't contain overflow
* items.
*/
trk->col_start = dsk->recno;
trk->col_stop = dsk->recno + (dsk->u.entries - 1);
__wt_verbose(session, WT_VERB_SALVAGE,
"%s records %" PRIu64 "-%" PRIu64,
__wt_addr_string(
session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
trk->col_start, trk->col_stop);
break;
case WT_PAGE_COL_VAR:
/*
* Column-store variable-length format: the start key can be
* taken from the block's header, stop key requires walking
* the page.
*/
stop_recno = dsk->recno;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
stop_recno += __wt_cell_rle(unpack);
}
trk->col_start = dsk->recno;
trk->col_stop = stop_recno - 1;
__wt_verbose(session, WT_VERB_SALVAGE,
"%s records %" PRIu64 "-%" PRIu64,
__wt_addr_string(
session, trk->trk_addr, trk->trk_addr_size, ss->tmp1),
trk->col_start, trk->col_stop);
/* Column-store pages can contain overflow items. */
WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
break;
case WT_PAGE_ROW_LEAF:
/*
* Row-store format: copy the first and last keys on the page.
* Keys are prefix-compressed, the simplest and slowest thing
* to do is instantiate the in-memory page, then instantiate
* and copy the full keys, then free the page. We do this on
* every leaf page, and if you need to speed up the salvage,
* it's probably a great place to start.
*
* Page flags are 0 because we aren't releasing the memory used
* to read the page into memory and we don't want page discard
* to free it.
*/
WT_ERR(__wt_page_inmem(session, NULL, dsk, 0, &page));
WT_ERR(__wt_row_leaf_key_copy(session,
page, &page->pg_row[0], &trk->row_start));
WT_ERR(__wt_row_leaf_key_copy(session,
page, &page->pg_row[page->entries - 1], &trk->row_stop));
__wt_verbose(session, WT_VERB_SALVAGE,
"%s start key %s",
__wt_addr_string(session,
trk->trk_addr, trk->trk_addr_size, ss->tmp1),
__wt_buf_set_printable(session,
trk->row_start.data, trk->row_start.size, ss->tmp2));
__wt_verbose(session, WT_VERB_SALVAGE,
"%s stop key %s",
__wt_addr_string(session,
trk->trk_addr, trk->trk_addr_size, ss->tmp1),
__wt_buf_set_printable(session,
trk->row_stop.data, trk->row_stop.size, ss->tmp2));
/* Row-store pages can contain overflow items. */
WT_ERR(__slvg_trk_leaf_ovfl(session, dsk, trk));
break;
}
ss->pages[ss->pages_next++] = trk;
if (0) {
err: __wt_free(session, trk);
}
if (page != NULL)
__wt_page_out(session, &page);
return (ret);
}
/*
* __slvg_trk_ovfl --
* Track an overflow page.
*/
static int
__slvg_trk_ovfl(WT_SESSION_IMPL *session,
const WT_PAGE_HEADER *dsk, uint8_t *addr, size_t addr_size, WT_STUFF *ss)
{
WT_TRACK *trk;
/*
* Reallocate the overflow page array as necessary, then save the
* page's location information.
*/
WT_RET(__wt_realloc_def(
session, &ss->ovfl_allocated, ss->ovfl_next + 1, &ss->ovfl));
WT_RET(__slvg_trk_init(
session, addr, addr_size, dsk->mem_size, dsk->write_gen, ss, &trk));
ss->ovfl[ss->ovfl_next++] = trk;
return (0);
}
/*
* __slvg_trk_leaf_ovfl --
* Search a leaf page for overflow items.
*/
static int
__slvg_trk_leaf_ovfl(
WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, WT_TRACK *trk)
{
WT_BTREE *btree;
WT_CELL *cell;
WT_CELL_UNPACK *unpack, _unpack;
uint32_t i, ovfl_cnt;
btree = S2BT(session);
unpack = &_unpack;
/*
* Two passes: count the overflow items, then copy them into an
* allocated array.
*/
ovfl_cnt = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
if (unpack->ovfl)
++ovfl_cnt;
}
if (ovfl_cnt == 0)
return (0);
/* Allocate room for the array of overflow addresses and fill it in. */
WT_RET(__wt_calloc_def(session, ovfl_cnt, &trk->trk_ovfl_addr));
trk->trk_ovfl_cnt = ovfl_cnt;
ovfl_cnt = 0;
WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
__wt_cell_unpack(cell, unpack);
if (unpack->ovfl) {
WT_RET(__wt_memdup(session, unpack->data,
unpack->size, &trk->trk_ovfl_addr[ovfl_cnt].addr));
trk->trk_ovfl_addr[ovfl_cnt].size =
(uint8_t)unpack->size;
__wt_verbose(session, WT_VERB_SALVAGE,
"%s overflow reference %s",
__wt_addr_string(session,
trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1),
__wt_addr_string(session,
unpack->data, unpack->size, trk->ss->tmp2));
if (++ovfl_cnt == trk->trk_ovfl_cnt)
break;
}
}
return (0);
}
/*
* __slvg_col_range --
* Figure out the leaf pages we need and free the leaf pages we don't.
*
* When pages split, the key range is split across multiple pages. If not all
* of the old versions of the page are overwritten, or not all of the new pages
* are written, or some of the pages are corrupted, salvage will read different
* pages with overlapping key ranges, at different LSNs.
*
* We salvage all of the key ranges we find, at the latest LSN value: this means
* we may resurrect pages of deleted items, as page deletion doesn't write leaf
* pages and salvage will read and instantiate the contents of an old version of
* the deleted page.
*
* The leaf page array is sorted in key order, and secondarily on LSN: what this
* means is that for each new key range, the first page we find is the best page
* for that key. The process is to walk forward from each page until we reach a
* page with a starting key after the current page's stopping key.
*
* For each of page, check to see if they overlap the current page's key range.
* If they do, resolve the overlap. Because WiredTiger rarely splits pages,
* overlap resolution usually means discarding a page because the key ranges
* are the same, and one of the pages is simply an old version of the other.
*
* However, it's possible more complex resolution is necessary. For example,
* here's an improbably complex list of page ranges and LSNs:
*
* Page Range LSN
* 30 A-G 3
* 31 C-D 4
* 32 B-C 5
* 33 C-F 6
* 34 C-D 7
* 35 F-M 8
* 36 H-O 9
*
* We walk forward from each page reviewing all other pages in the array that
* overlap the range. For each overlap, the current or the overlapping
* page is updated so the page with the most recent information for any range
* "owns" that range. Here's an example for page 30.
*
* Review page 31: because page 31 has the range C-D and a higher LSN than page
* 30, page 30 would "split" into two ranges, A-C and E-G, conceding the C-D
* range to page 31. The new track element would be inserted into array with
* the following result:
*
* Page Range LSN
* 30 A-C 3 << Changed WT_TRACK element
* 31 C-D 4
* 32 B-C 5
* 33 C-F 6
* 34 C-D 7
* 30 E-G 3 << New WT_TRACK element
* 35 F-M 8
* 36 H-O 9
*
* Continue the review of the first element, using its new values.
*
* Review page 32: because page 31 has the range B-C and a higher LSN than page
* 30, page 30's A-C range would be truncated, conceding the B-C range to page
* 32.
* 30 A-B 3
* E-G 3
* 31 C-D 4
* 32 B-C 5
* 33 C-F 6
* 34 C-D 7
*
* Review page 33: because page 33 has a starting key (C) past page 30's ending
* key (B), we stop evaluating page 30's A-B range, as there can be no further
* overlaps.
*
* This process is repeated for each page in the array.
*
* When page 33 is processed, we'd discover that page 33's C-F range overlaps
* page 30's E-G range, and page 30's E-G range would be updated, conceding the
* E-F range to page 33.
*
* This is not computationally expensive because we don't walk far forward in
* the leaf array because it's sorted by starting key, and because WiredTiger
* splits are rare, the chance of finding the kind of range overlap requiring
* re-sorting the array is small.
*/
static int
__slvg_col_range(WT_SESSION_IMPL *session, WT_STUFF *ss)
{
WT_TRACK *jtrk;
uint32_t i, j;
/*
* DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
* COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
* BEING HANDLED.
*
* Walk the page array looking for overlapping key ranges, adjusting
* the ranges based on the LSN until there are no overlaps.
*
* DO NOT USE POINTERS INTO THE ARRAY: THE ARRAY IS RE-SORTED IN PLACE
* AS ENTRIES ARE SPLIT, SO ARRAY REFERENCES MUST ALWAYS BE ARRAY BASE
* PLUS OFFSET.
*/
for (i = 0; i < ss->pages_next; ++i) {
if (ss->pages[i] == NULL)
continue;
/* Check for pages that overlap our page. */
for (j = i + 1; j < ss->pages_next; ++j) {
if (ss->pages[j] == NULL)
continue;
/*
* We're done if this page starts after our stop, no
* subsequent pages can overlap our page.
*/
if (ss->pages[j]->col_start >
ss->pages[i]->col_stop)
break;
/* There's an overlap, fix it up. */
jtrk = ss->pages[j];
WT_RET(__slvg_col_range_overlap(session, i, j, ss));
/*
* If the overlap resolution changed the entry's start
* key, the entry might have moved and the page array
* re-sorted, and pages[j] would reference a different
* page. We don't move forward if that happened, we
* re-process the slot again (by decrementing j before
* the loop's increment).
*/
if (ss->pages[j] != NULL && jtrk != ss->pages[j])
--j;
}
}
return (0);
}
/*
* __slvg_col_range_overlap --
* Two column-store key ranges overlap, deal with it.
*/
static int
__slvg_col_range_overlap(
WT_SESSION_IMPL *session, uint32_t a_slot, uint32_t b_slot, WT_STUFF *ss)
{
WT_DECL_RET;
WT_TRACK *a_trk, *b_trk, *new;
uint32_t i;
/*
* DO NOT MODIFY THIS CODE WITHOUT REVIEWING THE CORRESPONDING ROW- OR
* COLUMN-STORE CODE: THEY ARE IDENTICAL OTHER THAN THE PAGES THAT ARE
* BEING HANDLED.
*/
a_trk = ss->pages[a_slot];
b_trk = ss->pages[b_slot];
__wt_verbose(session, WT_VERB_SALVAGE,
"%s and %s range overlap",
__wt_addr_string(
session, a_trk->trk_addr, a_trk->trk_addr_size, ss->tmp1),
__wt_addr_string(
session, b_trk->trk_addr, b_trk->trk_addr_size, ss->tmp2));
/*
* The key ranges of two WT_TRACK pages in the array overlap -- choose
* the ranges we're going to take from each.
*
* We can think of the overlap possibilities as 11 different cases:
*
* AAAAAAAAAAAAAAAAAA
* #1 BBBBBBBBBBBBBBBBBB pages are the same
* #2 BBBBBBBBBBBBB overlaps the beginning
* #3 BBBBBBBBBBBBBBBB overlaps the end
* #4 BBBBB B is a prefix of A
* #5 BBBBBB B is middle of A
* #6 BBBBBBBBBB B is a suffix of A
*
* and:
*
* BBBBBBBBBBBBBBBBBB
* #7 AAAAAAAAAAAAA same as #3
* #8 AAAAAAAAAAAAAAAA same as #2
* #9 AAAAA A is a prefix of B
* #10 AAAAAA A is middle of B
* #11 AAAAAAAAAA A is a suffix of B
*
* Note the leaf page array was sorted by key and a_trk appears earlier
* in the array than b_trk, so cases #2/8, #10 and #11 are impossible.
*
* Finally, there's one additional complicating factor -- final ranges
* are assigned based on the page's LSN.
*/
/* Case #2/8, #10, #11 */
if (a_trk->col_start > b_trk->col_start)
WT_PANIC_RET(
session, EINVAL, "unexpected merge array sort order");
if (a_trk->col_start == b_trk->col_start) { /* Case #1, #4 and #9 */
/*
* The secondary sort of the leaf page array was the page's LSN,
* in high-to-low order, which means a_trk has a higher LSN, and
* is more desirable, than b_trk. In cases #1 and #4 and #9,
* where the start of the range is the same for the two pages,
* this simplifies things, it guarantees a_trk has a higher LSN
* than b_trk.
*/
if (a_trk->col_stop >= b_trk->col_stop)
/*
* Case #1, #4: a_trk is a superset of b_trk, and a_trk
* is more desirable -- discard b_trk.
*/
goto delete_b;
/*
* Case #9: b_trk is a superset of a_trk, but a_trk is more
* desirable: keep both but delete a_trk's key range from
* b_trk.
*/
b_trk->col_start = a_trk->col_stop + 1;
__slvg_col_trk_update_start(b_slot, ss);
F_SET(b_trk, WT_TRACK_MERGE);
goto merge;
}
if (a_trk->col_stop == b_trk->col_stop) { /* Case #6 */
if (a_trk->trk_gen > b_trk->trk_gen)
/*
* Case #6: a_trk is a superset of b_trk and a_trk is
* more desirable -- discard b_trk.
*/
goto delete_b;
/*
* Case #6: a_trk is a superset of b_trk, but b_trk is more
* desirable: keep both but delete b_trk's key range from a_trk.
*/
a_trk->col_stop = b_trk->col_start - 1;
F_SET(a_trk, WT_TRACK_MERGE);
goto merge;
}
if (a_trk->col_stop < b_trk->col_stop) { /* Case #3/7 */
if (a_trk->trk_gen > b_trk->trk_gen) {
/*
* Case #3/7: a_trk is more desirable, delete a_trk's
* key range from b_trk;
*/
b_trk->col_start = a_trk->col_stop + 1;
__slvg_col_trk_update_start(b_slot, ss);
F_SET(b_trk, WT_TRACK_MERGE);
} else {
/*
* Case #3/7: b_trk is more desirable, delete b_trk's
* key range from a_trk;
*/
a_trk->col_stop = b_trk->col_start - 1;
F_SET(a_trk, WT_TRACK_MERGE);
}
goto merge;
}
/*
* Case #5: a_trk is a superset of b_trk and a_trk is more desirable --
* discard b_trk.
*/
if (a_trk->trk_gen > b_trk->trk_gen) {
delete_b: /*
* After page and overflow reconciliation, one (and only one)
* page can reference an overflow record. But, if we split a
* page into multiple chunks, any of the chunks might own any
* of the backing overflow records, so overflow records won't
* normally be discarded until after the merge phase completes.
* (The merge phase is where the final pages are written, and
* we figure out which overflow records are actually used.)
* If freeing a chunk and there are no other references to the
* underlying shared information, the overflow records must be
* useless, discard them to keep the final file size small.
*/
if (b_trk->shared->ref == 1)
for (i = 0; i < b_trk->trk_ovfl_cnt; ++i)
WT_RET(__slvg_trk_free(session,
&ss->ovfl[b_trk->trk_ovfl_slot[i]], true));
return (__slvg_trk_free(session, &ss->pages[b_slot], true));
}
/*
* Case #5: b_trk is more desirable and is a middle chunk of a_trk.
* Split a_trk into two parts, the key range before b_trk and the
* key range after b_trk.