From 7c1cbcb20e8a7c283757b87184cad2b3cb639fc8 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 24 Nov 2024 20:44:46 +0200 Subject: [PATCH 1/5] Check for invalid down link while prefetching B-Tree leave pages for index-only scan --- src/backend/access/nbtree/nbtsearch.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index bcf5dd368a6..ba747147945 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -884,9 +884,10 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti { ItemId itemid = PageGetItemId(page, offnum + i); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); - if (i == next_parent_prefetch_index) + if (j == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + if (BlockNumberIsValid(BTreeTupleGetDownLink(itup))) + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } else @@ -898,9 +899,10 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti { ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); - if (i == next_parent_prefetch_index) + if (j == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + if (BlockNumberIsValid(BTreeTupleGetDownLink(itup))) + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } so->n_prefetch_blocks = j; From 13920abbf448ec4c0ceedba3bf44831d7e6bdb2c Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 25 Nov 2024 08:56:19 +0200 Subject: [PATCH 2/5] Replace assert which doesn't take in account presence of invalid downlinks --- src/backend/access/nbtree/nbtsearch.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index ba747147945..735e61ab7d7 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -1478,12 +1478,15 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int skip = ScanDirectionIsForward(dir) ? stack->bts_offset - first_offset : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; - Assert(so->n_prefetch_blocks >= skip); - so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; - so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); - so->last_prefetch_index = skip + so->n_prefetch_requests; - for (int i = skip; i < so->last_prefetch_index; i++) - PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + /* n_prefetch_blocks can be smaller than skip because of skipped invalid downlinks */ + if (so->n_prefetch_blocks >= skip) + { + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + for (int i = skip; i < so->last_prefetch_index; i++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); + } } /* don't need to keep the stack around... */ From 69eb618761295800d011db6002234f0314b45cdb Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Nov 2024 10:25:50 +0200 Subject: [PATCH 3/5] Add extra checks for IOS prefetch --- src/backend/access/nbtree/nbtsearch.c | 41 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 735e61ab7d7..bd91c46d819 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -880,14 +880,17 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti so->next_parent = opaque->btpo_next; if (so->next_parent == P_NONE) next_parent_prefetch_index = -1; + else if (!BlockNumberIsValid(so->next_parent)) + elog(FATAL, "btpo_next is invalid"); for (i = 0, j = 0; i < n_child; i++) { ItemId itemid = PageGetItemId(page, offnum + i); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); - if (j == next_parent_prefetch_index) + if (i == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - if (BlockNumberIsValid(BTreeTupleGetDownLink(itup))) - so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + if (!BlockNumberIsValid(BTreeTupleGetDownLink(itup))) + elog(FATAL, "Downlink %d is invalid for forward scan n_child=%d", i, n_child); + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } else @@ -895,14 +898,17 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti so->next_parent = opaque->btpo_prev; if (so->next_parent == P_NONE) next_parent_prefetch_index = -1; - for (i = 0, j = 0; i < n_child; i++) + else if (!BlockNumberIsValid(so->next_parent)) + elog(FATAL, "btpo_prev is invalid"); + for (i = 0, j = 0; i < n_child; i++) { ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); - if (j == next_parent_prefetch_index) + if (i == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - if (BlockNumberIsValid(BTreeTupleGetDownLink(itup))) - so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); + if (!BlockNumberIsValid(BTreeTupleGetDownLink(itup))) + elog(FATAL, "Downlink %d is invalid for backward scan n_child=%d", i, n_child); + so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } so->n_prefetch_blocks = j; @@ -1478,15 +1484,18 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int skip = ScanDirectionIsForward(dir) ? stack->bts_offset - first_offset : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; - /* n_prefetch_blocks can be smaller than skip because of skipped invalid downlinks */ - if (so->n_prefetch_blocks >= skip) - { - so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; - so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); - so->last_prefetch_index = skip + so->n_prefetch_requests; - for (int i = skip; i < so->last_prefetch_index; i++) - PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[i]); - } + if (skip < 0) + elog(FATAL, "Forward=%d, bts_oiffset-%d, first_offset=%d, n_prefetch_blocks=%d", ScanDirectionIsForward(dir), stack->bts_offset, first_offset, so->n_prefetch_blocks); + Assert(so->n_prefetch_blocks >= skip); + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); + so->last_prefetch_index = skip + so->n_prefetch_requests; + if (so->last_prefetch_index > so->n_prefetch_blocks) + elog(FATAL, "last_prefetch_index-%d, n_prefetch_blocks=%d", + so->last_prefetch_index, so->n_prefetch_blocks); + Assert(so->last_prefetch_index <= so->n_prefetch_blocks); + for (int j = skip; j < so->last_prefetch_index; j++) + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); } /* don't need to keep the stack around... */ From f27dbe181c3446ffcba29eff7a833a078800861b Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 26 Nov 2024 19:23:06 +0200 Subject: [PATCH 4/5] Do not use stack->bts_offset because content of parent page can be changed --- src/backend/access/nbtree/nbtsearch.c | 50 ++++++++++++++------------- 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index bd91c46d819..c1f075ef390 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -848,9 +848,8 @@ _bt_compare(Relation rel, /* * _bt_read_parent_for_prefetch - read parent page and extract references to children for prefetch. - * This functions returns offset of first item. */ -static int +static void _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirection dir) { Relation rel = scan->indexRelation; @@ -888,8 +887,6 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); if (i == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - if (!BlockNumberIsValid(BTreeTupleGetDownLink(itup))) - elog(FATAL, "Downlink %d is invalid for forward scan n_child=%d", i, n_child); so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } @@ -906,15 +903,12 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); if (i == next_parent_prefetch_index) so->prefetch_blocks[j++] = so->next_parent; /* time to prefetch next parent page */ - if (!BlockNumberIsValid(BTreeTupleGetDownLink(itup))) - elog(FATAL, "Downlink %d is invalid for backward scan n_child=%d", i, n_child); so->prefetch_blocks[j++] = BTreeTupleGetDownLink(itup); } } so->n_prefetch_blocks = j; so->last_prefetch_index = 0; _bt_relbuf(rel, buf); - return offnum; } /* @@ -1478,24 +1472,32 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); /* Start prefetching for index only scan */ - if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && BufferIsValid(buf) && scan->xs_want_itup) /* index only scan */ { - int first_offset = _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); - int skip = ScanDirectionIsForward(dir) - ? stack->bts_offset - first_offset - : first_offset + so->n_prefetch_blocks - 1 - stack->bts_offset; - if (skip < 0) - elog(FATAL, "Forward=%d, bts_oiffset-%d, first_offset=%d, n_prefetch_blocks=%d", ScanDirectionIsForward(dir), stack->bts_offset, first_offset, so->n_prefetch_blocks); - Assert(so->n_prefetch_blocks >= skip); - so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; - so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - skip); - so->last_prefetch_index = skip + so->n_prefetch_requests; - if (so->last_prefetch_index > so->n_prefetch_blocks) - elog(FATAL, "last_prefetch_index-%d, n_prefetch_blocks=%d", - so->last_prefetch_index, so->n_prefetch_blocks); - Assert(so->last_prefetch_index <= so->n_prefetch_blocks); - for (int j = skip; j < so->last_prefetch_index; j++) - PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); + BlockNumber leaf = BufferGetBlockNumber(buf); + + _bt_read_parent_for_prefetch(scan, stack->bts_blkno, dir); + + /* + * We can not use stack->bts_offset because once parent page is unlocked, it can be updated and state captured by + * by _bt_read_parent_for_prefetch may not match with _bt_search + */ + + for (int j = 0; j < so->n_prefetch_blocks; j++) + { + if (so->prefetch_blocks[j] == leaf) + { + so->current_prefetch_distance = INCREASE_PREFETCH_DISTANCE_STEP; + so->n_prefetch_requests = Min(so->current_prefetch_distance, so->n_prefetch_blocks - j); + so->last_prefetch_index = j + so->n_prefetch_requests; + + do { + PrefetchBuffer(rel, MAIN_FORKNUM, so->prefetch_blocks[j]); + } while (++j < so->last_prefetch_index); + + break; + } + } } /* don't need to keep the stack around... */ From c2f65b3201591e02ce45b66731392f98d3388e73 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 12 Dec 2024 07:58:34 +0200 Subject: [PATCH 5/5] Remove unintended changes --- src/backend/access/nbtree/nbtsearch.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index c1f075ef390..a300ab856eb 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -879,8 +879,7 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti so->next_parent = opaque->btpo_next; if (so->next_parent == P_NONE) next_parent_prefetch_index = -1; - else if (!BlockNumberIsValid(so->next_parent)) - elog(FATAL, "btpo_next is invalid"); + for (i = 0, j = 0; i < n_child; i++) { ItemId itemid = PageGetItemId(page, offnum + i); @@ -895,9 +894,8 @@ _bt_read_parent_for_prefetch(IndexScanDesc scan, BlockNumber parent, ScanDirecti so->next_parent = opaque->btpo_prev; if (so->next_parent == P_NONE) next_parent_prefetch_index = -1; - else if (!BlockNumberIsValid(so->next_parent)) - elog(FATAL, "btpo_prev is invalid"); - for (i = 0, j = 0; i < n_child; i++) + + for (i = 0, j = 0; i < n_child; i++) { ItemId itemid = PageGetItemId(page, offnum + n_child - i - 1); IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); @@ -1472,7 +1470,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) stack = _bt_search(rel, &inskey, &buf, BT_READ, scan->xs_snapshot); /* Start prefetching for index only scan */ - if (so->prefetch_maximum > 0 && stack != NULL && BufferIsValid(buf) && scan->xs_want_itup) /* index only scan */ + if (so->prefetch_maximum > 0 && stack != NULL && scan->xs_want_itup) /* index only scan */ { BlockNumber leaf = BufferGetBlockNumber(buf);