Faster page cache translation tables
Change the page cache translation table implementation from stripe-locked PrimitiveLongObjectMaps to a dense, lock-free, concurrent array-trie.

The new implementation uses an array-of-arrays approach, and uses Unsafe to perform atomic operations on the slots of the inner arrays.
Latch objects (similar to a CountDownLatch(1)) are CAS'd into slots prior to page faulting, while eviction uses volatile writes of null values, so the translation tables are updated in a wait-free manner.
This way, there is no lock-ordering problem in eviction, and the window for reading a stale value from the translation table has been significantly narrowed.
The implementation also goes through fewer indirections on look-up, and thus suffers fewer cache misses. This makes it noticeably faster.
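
To make the slot protocol concrete, here is a minimal sketch of the CAS-a-latch-then-fault idea, in the spirit of the description above. It is illustrative only: it uses AtomicReferenceArray and CountDownLatch in place of the raw Unsafe slot operations and the specialized latch, and the TranslationChunk, lookupOrFault, fault and evict names are hypothetical stand-ins, not the actual Muninn classes.

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicReferenceArray;

class TranslationChunk
{
    // Hypothetical chunk size; the real chunk size is an implementation detail.
    private final AtomicReferenceArray<Object> slots = new AtomicReferenceArray<>( 1 << 12 );

    Object lookupOrFault( int slot ) throws InterruptedException
    {
        for ( ;; )
        {
            Object item = slots.get( slot ); // volatile read of the slot
            if ( item == null )
            {
                CountDownLatch latch = new CountDownLatch( 1 );
                if ( slots.compareAndSet( slot, null, latch ) )
                {
                    Object page = fault( slot ); // we won the CAS, so we do the page fault
                    slots.set( slot, page );     // volatile write publishes the page
                    latch.countDown();           // unblock threads that found our latch
                    return page;
                }
                // Lost the CAS race; loop around and re-read the slot.
            }
            else if ( item instanceof CountDownLatch )
            {
                // Someone else is page faulting this slot; wait for them, then retry the look-up.
                ((CountDownLatch) item).await();
            }
            else
            {
                // Found a page. The real code additionally locks the page and verifies its
                // binding here, since an eviction may have raced with our read of the slot.
                return item;
            }
        }
    }

    void evict( int slot )
    {
        slots.set( slot, null ); // eviction: a wait-free volatile write of null
    }

    private Object fault( int slot )
    {
        return new Object(); // placeholder for actually reading the file page into memory
    }
}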
chrisvest committed Feb 26, 2015
1 parent a6eb9b1 commit fea9059
Showing 10 changed files with 559 additions and 342 deletions.
@@ -21,5 +21,5 @@

 public interface PageEvictionCallback
 {
-    public void onEvict( long pageId, Page page );
+    public void onEvict( long filePageId, Page page );
 }
@@ -0,0 +1,164 @@
/**
* Copyright (c) 2002-2015 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.io.pagecache.impl.muninn;

import java.util.concurrent.locks.LockSupport;

import static org.neo4j.io.pagecache.impl.muninn.UnsafeUtil.getAndSetObject;

/**
* This class is similar in many ways to a CountDownLatch(1).
*
* The main difference is that instances of this specialized Latch implementation are much quicker to allocate and
* construct. Each instance also takes up less memory on the heap, and enqueueing wait nodes on the latch is faster.
*
* There are two reasons why this class is faster to construct: 1. it performs no volatile write during its
* construction, and 2. it does not need to allocate an internal Sync object, like CountDownLatch does.
*/
final class Latch
{
    private static class Node
    {
        volatile Node next;
    }

    private static final class Waiter extends Node
    {
        final Thread waitingThread = Thread.currentThread();
    }

    private static final long stackOffset =
            UnsafeUtil.getFieldOffset( Latch.class, "stack" );
    private static final Node end = new Node();
    private static final Node released = new Node();

    @SuppressWarnings( "unused" )
    private volatile Node stack; // written to via Unsafe

    /**
     * Release the latch, thereby unblocking all current and future calls to {@link #await()}.
     */
    public void release()
    {
        // Once the release sentinel is on the stack, it can never (observably) leave.
        // Waiters might accidentally remove the released sentinel from the stack for brief periods of time,
        // but then they are required to fix the situation and put it back.
        // Atomically swapping the release sentinel onto the stack will give us back all the waiters, if any.
        Node waiters = (Node) getAndSetObject( this, stackOffset, released );
        if ( waiters == null )
        {
            // There are no waiters to unpark, so don't bother.
            return;
        }
        unparkAll( waiters );
    }

    /**
     * Wait for the latch to be released, blocking the current thread if necessary.
     *
     * This method returns immediately if the latch has already been released.
     */
    public void await()
    {
        // Put in a local variable to avoid volatile reads we don't need.
        Node state = stack;
        if ( state != released )
        {
            // The latch hasn't obviously already been released, so we want to add a waiter to the stack.
            // Trouble is, we might race with release here, so we need to re-check for release after we've
            // modified the stack.
            Waiter waiter = new Waiter();
            state = (Node) getAndSetObject( this, stackOffset, waiter );
            if ( state == released )
            {
                // If we get 'released' back from the swap, then we raced with release, and it is our job to
                // put the released sentinel back. Doing so can, however, return more waiters that have added
                // themselves in the meantime. If we find such waiters, then we must make sure to unpark them.
                // Note that we will never get a null back from this swap, because we at least added our own
                // waiter earlier.
                Node others = (Node) getAndSetObject( this, stackOffset, released );
                // Set our next pointer to 'released' as a signal to other threads that might be going through
                // the stack in the isReleased check.
                waiter.next = released;
                unparkAll( others );
            }
            else
            {
                // It looks like the latch hasn't yet been released, so we are going to park. Before that, we
                // must assign a non-null value to our next pointer, so other threads will know that we have
                // been properly enqueued. We use the 'end' sentinel as a marker when there's otherwise no
                // other next node.
                waiter.next = state == null ? end : state;
                do
                {
                    // Park may wake up spuriously, so we have to loop on it until we observe, from the state
                    // of the stack, that the latch has been released.
                    LockSupport.park( this );
                }
                while ( !isReleased() );
            }
        }
    }

    private boolean isReleased()
    {
        // We have to go through the entire stack and look for the 'released' sentinel, since we might be
        // racing with the 'state == released' branch in await.
        Node state = stack;
        do
        {
            if ( state == released )
            {
                // We've been released!
                return true;
            }

            Node next;
            do
            {
                // We loop on reading the next pointer because we might observe an enqueued node before its
                // next pointer has been properly assigned. This is a benign race because we know that the
                // next pointer of a properly enqueued node is never null.
                next = state.next;
            }
            while ( next == null );
            state = next;
        }
        while ( state != end );
        // Reaching the end of the stack without seeing 'released' means we're not released.
        return false;
    }

    private void unparkAll( Node waiters )
    {
        // If we find a node that is not a waiter, then it is either 'end' or 'released'. Looking at the type
        // pointer is the cheapest way to make this check.
        while ( waiters.getClass() == Waiter.class )
        {
            Waiter waiter = (Waiter) waiters;
            LockSupport.unpark( waiter.waitingThread );
            Node next;
            do
            {
                // Just like in isReleased, loop if the next pointer is null.
                next = waiters.next;
            }
            while ( next == null );
            waiters = next;
        }
    }
}
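
As a usage sketch (hypothetical calling code from within the same package, not part of this commit), the latch behaves like a one-shot CountDownLatch(1):

final Latch latch = new Latch();

Thread waiter = new Thread( new Runnable()
{
    @Override
    public void run()
    {
        latch.await(); // blocks until release() is called
    }
} );
waiter.start();

latch.release(); // wakes the waiter; later await() calls return immediately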
@@ -20,15 +20,16 @@
 package org.neo4j.io.pagecache.impl.muninn;

 import java.io.IOException;
-import java.lang.Override;

-import org.neo4j.collection.primitive.PrimitiveLongObjectMap;
 import org.neo4j.io.pagecache.PageCursor;
 import org.neo4j.io.pagecache.PageSwapper;
-import org.neo4j.jsr166e.StampedLock;
 import org.neo4j.io.pagecache.tracing.PageFaultEvent;
 import org.neo4j.io.pagecache.tracing.PinEvent;

+import static org.neo4j.io.pagecache.impl.muninn.UnsafeUtil.compareAndSwapObject;
+import static org.neo4j.io.pagecache.impl.muninn.UnsafeUtil.getObjectVolatile;
+import static org.neo4j.io.pagecache.impl.muninn.UnsafeUtil.putObjectVolatile;
+
 abstract class MuninnPageCursor implements PageCursor
 {
     protected MuninnPagedFile pagedFile;
@@ -103,18 +104,84 @@ public final long getCurrentPageId()
     }

     /**
-     * NOTE: Must be called while holding the right translationTableLock.writeLock
-     * for the given translationTable!!!
-     * This method will release that write lock on the translation table as part
-     * of the page faulting!
+     * Pin the desired file page to this cursor, page faulting it into memory if it isn't there already.
+     * @param filePageId The file page id we want to pin this cursor to.
+     * @param exclusive 'true' if we will be taking an exclusive lock on the page as part of the pin.
+     * @throws IOException if anything goes wrong with the pin, most likely during a page fault.
      */
-    protected void pageFault(
-            long filePageId,
-            PrimitiveLongObjectMap<MuninnPage> translationTable,
-            StampedLock translationTableLock,
-            long ttlStamp,
-            PageSwapper swapper ) throws IOException
+    protected void pin( long filePageId, boolean exclusive ) throws IOException
+    {
+        PageSwapper swapper = pagedFile.swapper;
+        pinEvent = pagedFile.tracer.beginPin( exclusive, filePageId, swapper );
+        int chunkId = pagedFile.computeChunkId( filePageId );
+        // The chunkOffset is the addressing offset into the chunk array object for the relevant array slot.
+        // Using this, we can access the array slot with Unsafe.
+        long chunkOffset = pagedFile.computeChunkOffset( filePageId );
+        Object[][] tt = pagedFile.translationTable;
+        if ( tt.length <= chunkId )
+        {
+            tt = pagedFile.expandCapacity( chunkId );
+        }
+        Object[] chunk = tt[chunkId];
+
+        // Now, if the reference in the chunk slot is a latch, we wait on it and look up again (in a loop,
+        // since the page might get evicted right after the page fault completes). If we find a page, we lock
+        // it and check its binding (since it might get evicted and faulted into something else in the time
+        // between our look-up and our locking of the page). If the reference is null, or it referred to a
+        // page with the wrong binding, we CAS in a latch. If that CAS succeeds, we page fault, set the slot
+        // to the faulted-in page, and open the latch. If the CAS failed, we retry the look-up and start over
+        // from the top.
+        Object item;
+        do
+        {
+            item = getObjectVolatile( chunk, chunkOffset );
+            if ( item == null )
+            {
+                // Looks like there's no mapping, so we'd like to do a page fault.
+                Latch latch = new Latch();
+                if ( compareAndSwapObject( chunk, chunkOffset, null, latch ) )
+                {
+                    // We managed to inject our latch, so we now own the right to perform the page fault. We
+                    // also have a duty to eventually release and remove the latch, no matter what happens now.
+                    item = pageFault( filePageId, swapper, chunkOffset, chunk, latch );
+                }
+            }
+            else if ( item.getClass() == Latch.class )
+            {
+                // We found a latch, so someone else is already doing a page fault for this page. So we'll
+                // just wait for them to finish, and grab the page then.
+                Latch latch = (Latch) item;
+                latch.await();
+                item = null;
+            }
+            else
+            {
+                // We got *a* page, but we might be racing with eviction. To cope with that, we have to take
+                // some kind of lock on the page, and check that it is indeed bound to what we expect. If not,
+                // then it has been evicted, and possibly even page faulted into something else. In this case,
+                // we discard the item and try again, as the eviction thread would have set the chunk array
+                // slot to null.
+                MuninnPage page = (MuninnPage) item;
+                lockPage( page );
+                if ( !page.isBoundTo( swapper, filePageId ) )
+                {
+                    unlockPage( page );
+                    item = null;
+                }
+            }
+        }
+        while ( item == null );
+        pinCursorToPage( (MuninnPage) item, filePageId, swapper );
+    }
+
+    private MuninnPage pageFault(
+            long filePageId, PageSwapper swapper, long chunkOffset, Object[] chunk, Latch latch )
+            throws IOException
     {
+        // We are page faulting. This is a critical time, because we currently have the given latch in the
+        // chunk array slot that we are faulting into. We MUST make sure to release that latch, and remove it
+        // from the chunk, no matter what happens. Otherwise other threads will get stuck waiting forever for
+        // our page fault to finish. If we manage to get a free page to fault into, then we will also be
+        // taking a write lock on that page, to protect it against concurrent eviction while we assign a
+        // binding to the page. If anything goes wrong, then we must make sure to release that write lock
+        // as well.
         PageFaultEvent faultEvent = pinEvent.beginPageFault();
         MuninnPage page;
         long stamp;
@@ -123,24 +190,23 @@ protected void pageFault(
             // The grabFreePage method might throw.
             page = pagedFile.grabFreePage( faultEvent );

-            // We got a free page, and we know that we have race-free access to it.
-            // Well, it's not entirely race free, because other paged files might have
-            // it in their translation tables, and try to pin it.
-            // However, they will all fail because when they try to pin, the page will
-            // either be 1) free, 2) bound to our file, or 3) the page is write locked.
+            // We got a free page, and we know that we have race-free access to it. Well, it's not entirely
+            // race free, because other paged files might have it in their translation tables (or rather,
+            // their reads of their translation tables might race with eviction) and try to pin it.
+            // However, they will all fail because when they try to pin, the page will be either 1) free,
+            // 2) bound to our file, or 3) write locked.
             stamp = page.writeLock();
-            translationTable.put( filePageId, page );
         }
         catch ( Throwable throwable )
         {
+            // Make sure to unstick the page fault latch.
+            putObjectVolatile( chunk, chunkOffset, null );
+            latch.release();
             faultEvent.done( throwable );
+            // We don't need to worry about the 'stamp' here, because the writeLock call is uninterruptible,
+            // so it can't really fail.
             throw throwable;
         }
-        finally
-        {
-            translationTableLock.unlockWrite( ttlStamp );
-        }

         try
         {
             // Check if we're racing with unmapping. We have the page lock
@@ -153,13 +219,19 @@
         }
         catch ( Throwable throwable )
        {
+            // Make sure to unlock the page, so the eviction thread can pick up our trash.
             page.unlockWrite( stamp );
+            // Make sure to unstick the page fault latch.
+            putObjectVolatile( chunk, chunkOffset, null );
+            latch.release();
             faultEvent.done( throwable );
             throw throwable;
         }
         convertPageFaultLock( page, stamp );
-        pinCursorToPage( page, filePageId, swapper );
+        putObjectVolatile( chunk, chunkOffset, page );
+        latch.release();
         faultEvent.done();
+        return page;
     }

     protected void assertPagedFileStillMapped()
@@ -176,6 +248,10 @@ protected void assertPagedFileStillMapped()

     protected abstract void pinCursorToPage( MuninnPage page, long filePageId, PageSwapper swapper );

+    protected abstract void lockPage( MuninnPage page );
+
+    protected abstract void unlockPage( MuninnPage page );

     // --- IO methods:

     @Override