diff --git a/community/index/src/main/java/org/neo4j/index/gbptree/SeekCursor.java b/community/index/src/main/java/org/neo4j/index/gbptree/SeekCursor.java index 5b83069dc476..a0a1c861a5b3 100644 --- a/community/index/src/main/java/org/neo4j/index/gbptree/SeekCursor.java +++ b/community/index/src/main/java/org/neo4j/index/gbptree/SeekCursor.java @@ -100,6 +100,12 @@ class SeekCursor implements RawCursor,IOException>, Hi */ private final Supplier rootCatchup; + /** + * Max of leaf/internal key count that a tree node can have at most. This is used to sanity check + * key counts read from {@link TreeNode#keyCount(PageCursor)}. + */ + private final int maxKeyCount; + /** * Whether or not some result has been found, i.e. if {@code true} if there have been no call to * {@link #next()} returning {@code true}, otherwise {@code false}. If {@code false} then value in @@ -134,7 +140,7 @@ class SeekCursor implements RawCursor,IOException>, Hi /** * Set if the position of the last returned key need to be found again. */ - private boolean rediscoverKeyPosition; + private boolean concurrentWriteHappened; /** * {@link TreeNode#gen(PageCursor) generation} of the current leaf node, read every call to {@link #next()}. @@ -155,11 +161,12 @@ class SeekCursor implements RawCursor,IOException>, Hi */ private long expectedCurrentNodeGen; - /** - * Max of leaf/internal key count that a tree node can have at most. This is used to sanity check - * key counts read from {@link TreeNode#keyCount(PageCursor)}. - */ - private final int maxKeyCount; + private byte nodeType; + private long newGen; + private long newGenGen; + private boolean isInternal; + private long pointerId; + private long pointerGen; SeekCursor( PageCursor leafCursor, KEY mutableKey, VALUE mutableValue, TreeNode bTreeNode, KEY fromInclusive, KEY toExclusive, Layout layout, @@ -186,63 +193,29 @@ class SeekCursor implements RawCursor,IOException>, Hi private void traverseDownToFirstLeaf() throws IOException { - // some variables below are initialized to satisfy the compiler - byte nodeType; - long newGen = -1; - boolean isInternal = false; - int keyCount = -1; - long childId = -1; - int pos = -1; - long newGenGen = -1; - long childIdGen = -1; do { + // Read do { - nodeType = TreeNode.nodeType( cursor ); - if ( nodeType != TreeNode.NODE_TYPE_TREE_NODE ) - { - // If this node doesn't even look like a tree node then anything we read from it - // will be just random data when looking at it as if it were a tree node. - continue; - } - - currentNodeGen = bTreeNode.gen( cursor ); - - newGen = bTreeNode.newGen( cursor, stableGeneration, unstableGeneration ); - if ( GenSafePointerPair.isSuccess( newGen ) ) - { - newGenGen = bTreeNode.pointerGen( cursor, newGen ); - } - isInternal = bTreeNode.isInternal( cursor ); - // Find the left-most key within from-range - keyCount = bTreeNode.keyCount( cursor ); - if ( !keyCountIsSane() ) + // Where we are + if ( !readHeader() ) { continue; } - int search = KeySearch.search( cursor, bTreeNode, fromInclusive, mutableKey, keyCount ); - pos = KeySearch.positionOf( search ); - - // Assuming unique keys - if ( isInternal && KeySearch.isHit( search ) ) - { - pos++; - } - + // Where we're going + pos = searchKey( fromInclusive ); if ( isInternal ) { - childId = bTreeNode.childAt( cursor, pos, stableGeneration, unstableGeneration ); - if ( GenSafePointerPair.isSuccess( childId ) ) - { - childIdGen = bTreeNode.pointerGen( cursor, childId ); - } + pointerId = bTreeNode.childAt( cursor, pos, stableGeneration, unstableGeneration ); + readPointerGenOnSuccess(); } } while ( cursor.shouldRetry() ); checkOutOfBounds( cursor ); + // Act if ( nodeType != NODE_TYPE_TREE_NODE || !saneKeyCountRead( keyCount ) || !verifyNodeGenInvariants() ) { // This node has been reused. Restart seek from root. @@ -255,53 +228,54 @@ private void traverseDownToFirstLeaf() throws IOException continue; } - if ( pointerCheckingWithGenerationCatchup( newGen, true ) ) - { - continue; - } - else if ( TreeNode.isNode( newGen ) ) + if ( goToNewGen() ) { - bTreeNode.goTo( cursor, "new gen", newGen ); - lastFollowedPointerGen = newGenGen; continue; } if ( isInternal ) { - if ( pointerCheckingWithGenerationCatchup( childId, false ) ) - { - continue; - } - - bTreeNode.goTo( cursor, "child", childId ); - lastFollowedPointerGen = childIdGen; + goToNext( "child" ); } } while ( isInternal && keyCount > 0 ); // We've now come to the first relevant leaf, initialize the state for the coming leaf scan - this.pos = pos - 1; - this.keyCount = keyCount; + pos--; } - /** - * {@link TreeNode#keyCount(PageCursor) keyCount} is the only value read inside a do-shouldRetry loop - * which is used as data fed into another read. Because of that extra assertions are made around - * keyCount, both inside do-shouldRetry (requesting one more round in the loop) and outside - * (calling this method, which may throw exception). - * - * @param keyCount key count read from {@link TreeNode#keyCount(PageCursor)} and has "survived" - * a do-shouldRetry loop. - */ - private boolean saneKeyCountRead( int keyCount ) + private void goToNext( String type ) throws IOException { - return keyCount <= maxKeyCount; + if ( !pointerCheckingWithGenerationCatchup( pointerId, false ) ) + { + bTreeNode.goTo( cursor, type, pointerId ); + lastFollowedPointerGen = pointerGen; + } } - @Override - public Hit get() + private boolean goToNewGen() throws IOException { - return this; + if ( pointerCheckingWithGenerationCatchup( newGen, true ) ) + { + concurrentWriteHappened = true; + return true; + } + else if ( TreeNode.isNode( newGen ) ) + { + bTreeNode.goTo( cursor, "new gen", newGen ); + lastFollowedPointerGen = newGenGen; + concurrentWriteHappened = true; + return true; + } + return false; + } + + private void readPointerGenOnSuccess() + { + if ( GenSafePointerPair.isSuccess( pointerId ) ) + { + pointerGen = bTreeNode.pointerGen( cursor, pointerId ); + } } @Override @@ -310,50 +284,27 @@ public boolean next() throws IOException while ( true ) { pos++; - // some variables below are initialized to satisfy the compiler - byte nodeType; - long newGen = -1; - long rightSibling = -1; - long newGenGen = -1; - long rightSiblingGen = -1; + // Read do { - nodeType = TreeNode.nodeType( cursor ); - if ( nodeType != TreeNode.NODE_TYPE_TREE_NODE ) + // Where we are + if ( !readHeader() ) { - // If this node doesn't even look like a tree node then anything we read from it - // will be just random data when looking at it as if it were a tree node. continue; } - - currentNodeGen = bTreeNode.gen( cursor ); - newGen = bTreeNode.newGen( cursor, stableGeneration, unstableGeneration ); - keyCount = bTreeNode.keyCount( cursor ); - if ( !keyCountIsSane() ) + if ( concurrentWriteHappened ) { - continue; + // Keys could have been moved to the left so we need to make sure we are not missing any keys by + // moving position back until we find previously returned key + pos = searchKey( first ? fromInclusive : prevKey ); } - if ( GenSafePointerPair.isSuccess( newGen ) ) - { - newGenGen = bTreeNode.pointerGen( cursor, newGen ); - } - - if ( rediscoverKeyPosition ) - { - rediscoverKeyPosition(); - } - // There's a condition in here, choosing between go to next sibling or value, - // this condition is mirrored outside the shouldRetry loop to act upon the data - // which has by then been consistently read. No decision can be made in here directly. + // Next result if ( pos >= keyCount ) { // Read right sibling - rightSibling = bTreeNode.rightSibling( cursor, stableGeneration, unstableGeneration ); - if ( GenSafePointerPair.isSuccess( rightSibling ) ) - { - rightSiblingGen = bTreeNode.pointerGen( cursor, rightSibling ); - } + pointerId = bTreeNode.rightSibling( cursor, stableGeneration, unstableGeneration ); + readPointerGenOnSuccess(); } else { @@ -362,9 +313,10 @@ public boolean next() throws IOException bTreeNode.valueAt( cursor, mutableValue, pos ); } } - while ( rediscoverKeyPosition = cursor.shouldRetry() ); + while ( concurrentWriteHappened = cursor.shouldRetry() ); checkOutOfBounds( cursor ); + // Act if ( nodeType != TreeNode.NODE_TYPE_TREE_NODE || !saneKeyCountRead( keyCount ) ) { // This node has been reused for something else than a tree node. Restart seek from root. @@ -380,90 +332,165 @@ public boolean next() throws IOException continue; } - // Go to newGen if read successfully and - if ( pointerCheckingWithGenerationCatchup( newGen, true ) ) - { - // Reading newGen pointer resulted in a bad read, but generation had changed (a checkpoint has - // occurred since we started this cursor) so the generation fields in this cursor are now updated - // with the latest, so let's try that read again. - rediscoverKeyPosition = true; - continue; - } - else if ( TreeNode.isNode( newGen ) ) + if ( goToNewGen() ) { - // We ended up on a node which has a newGen set, let's go to it and read from that one instead. - bTreeNode.goTo( cursor, "new gen", newGen ); - rediscoverKeyPosition = true; - lastFollowedPointerGen = newGenGen; continue; } if ( pos >= keyCount ) { - // We've exhausted this node, it's time to see if there's a right sibling to go to. - - if ( pointerCheckingWithGenerationCatchup( rightSibling, true ) ) + if ( goToSibling() ) { - // Reading rightSibling pointer resulted in a bad read, but generation had changed - // (a checkpoint has occurred since we started this cursor) so the generation fields in this - // cursor are now updated with the latest, so let's try that read again. - rediscoverKeyPosition = true; - continue; - } - else if ( TreeNode.isNode( rightSibling ) ) - { - // TODO: Check if rightSibling is within expected range before calling next. - // TODO: Possibly by getting highest expected from IdProvider - bTreeNode.goTo( cursor, "right sibling", rightSibling ); - lastFollowedPointerGen = rightSiblingGen; - if ( first ) - { - // Have not yet found first hit among leaves. - // First hit can be several leaves to the right. - // Continue to use binary search in right leaf - rediscoverKeyPosition = true; - } - else - { - // It is likely that first key in right sibling is a next hit. - // Continue using scan - pos = -1; - } - continue; // in the outer loop, with the position reset to the beginning of the right sibling + continue; // in the read loop above so that we can continue reading from that sibling } } else if ( layout.compare( mutableKey, toExclusive ) < 0 ) { - if ( layout.compare( mutableKey, fromInclusive ) < 0 ) + if ( isResultKey() ) { - // too far to the left possibly because page reuse - rediscoverKeyPosition = true; - continue; - } - else if ( !first && layout.compare( prevKey, mutableKey ) >= 0 ) - { - // We've come across a bad read in the middle of a split - // This is outlined in InternalTreeLogic, skip this value (it's fine) - continue; + layout.copyKey( mutableKey, prevKey ); + return true; // which marks this read a hit that user can see } - - // A hit, it's within the range we search for - if ( first ) - { - // Setting first to false include an additional check for coming potential - // hits so that we cannot go backwards in our result. Going backwards can - // happen when reading through concurrent splits or similar and is a benign - // temporary observed state. - first = false; - } - layout.copyKey( mutableKey, prevKey ); - return true; + continue; } + // We've come too far and so this means the end of the result set return false; } } + private int searchKey( KEY key ) + { + int search = KeySearch.search( cursor, bTreeNode, key, mutableKey, keyCount ); + int pos = KeySearch.positionOf( search ); + + // Assuming unique keys + if ( isInternal && KeySearch.isHit( search ) ) + { + pos++; + } + return pos; + } + + /** + * @return {@code true} if header was read and looks sane, otherwise {@code false} meaning that node doesn't look + * like a tree node or we can expect a shouldRetry to take place. + */ + private boolean readHeader() + { + nodeType = TreeNode.nodeType( cursor ); + if ( nodeType != TreeNode.NODE_TYPE_TREE_NODE ) + { + // If this node doesn't even look like a tree node then anything we read from it + // will be just random data when looking at it as if it were a tree node. + return false; + } + + currentNodeGen = bTreeNode.gen( cursor ); + + newGen = bTreeNode.newGen( cursor, stableGeneration, unstableGeneration ); + if ( GenSafePointerPair.isSuccess( newGen ) ) + { + newGenGen = bTreeNode.pointerGen( cursor, newGen ); + } + isInternal = bTreeNode.isInternal( cursor ); + // Find the left-most key within from-range + keyCount = bTreeNode.keyCount( cursor ); + if ( !keyCountIsSane() ) + { + return false; + } + + return true; + } + + /** + * {@link TreeNode#keyCount(PageCursor) keyCount} is the only value read inside a do-shouldRetry loop + * which is used as data fed into another read. Because of that extra assertions are made around + * keyCount, both inside do-shouldRetry (requesting one more round in the loop) and outside + * (calling this method, which may throw exception). + * + * @param keyCount key count read from {@link TreeNode#keyCount(PageCursor)} and has "survived" + * a do-shouldRetry loop. + */ + private boolean saneKeyCountRead( int keyCount ) + { + return keyCount <= maxKeyCount; + } + + @Override + public Hit get() + { + return this; + } + + /** + * @return {@code true} if we should read more after this call, otherwise {@code false} to mark the end. + * @throws IOException on {@link PageCursor} error. + */ + private boolean goToSibling() throws IOException + { + if ( pointerCheckingWithGenerationCatchup( pointerId, true ) ) + { + // Reading rightSibling pointer resulted in a bad read, but generation had changed + // (a checkpoint has occurred since we started this cursor) so the generation fields in this + // cursor are now updated with the latest, so let's try that read again. + concurrentWriteHappened = true; + return true; + } + else if ( TreeNode.isNode( pointerId ) ) + { + // TODO: Check if rightSibling is within expected range before calling next. + // TODO: Possibly by getting highest expected from IdProvider + bTreeNode.goTo( cursor, "right sibling", pointerId ); + lastFollowedPointerGen = pointerGen; + if ( first ) + { + // Have not yet found first hit among leaves. + // First hit can be several leaves to the right. + // Continue to use binary search in right leaf + concurrentWriteHappened = true; + } + else + { + // It is likely that first key in right sibling is a next hit. + // Continue using scan + pos = -1; + } + return true; + } + + // The current node is exhausted and it had no sibling to read more from. + return false; + } + + private boolean isResultKey() + { + if ( layout.compare( mutableKey, fromInclusive ) < 0 ) + { + // too far to the left possibly because page reuse + concurrentWriteHappened = true; + return false; + } + else if ( !first && layout.compare( prevKey, mutableKey ) >= 0 ) + { + // We've come across a bad read in the middle of a split + // This is outlined in InternalTreeLogic, skip this value (it's fine) + return false; + } + + // A hit, it's within the range we search for + if ( first ) + { + // Setting first to false include an additional check for coming potential + // hits so that we cannot go backwards in our result. Going backwards can + // happen when reading through concurrent splits or similar and is a benign + // temporary observed state. + first = false; + } + return true; + } + private boolean keyCountIsSane() { // if keyCount is out of bounds of what a tree node can hold, it must be that we're @@ -506,15 +533,6 @@ else if ( currentNodeGen != expectedCurrentNodeGen ) return true; } - private void rediscoverKeyPosition() - { - // Keys could have been moved to the left so we need to make sure we are not missing any keys by - // moving position back until we find previously returned key - KEY keyToRediscover = first ? fromInclusive : prevKey; - int searchResult = KeySearch.search( cursor, bTreeNode, keyToRediscover, mutableKey, keyCount ); - pos = KeySearch.positionOf( searchResult ); - } - private boolean pointerCheckingWithGenerationCatchup( long pointer, boolean allowNoNode ) { if ( !GenSafePointerPair.isSuccess( pointer ) )