Uniqueness check with different duplicate check strategies.
Introduce duplicate checking strategies:
use a simple map-based strategy when the expected number of elements is
small enough, and a dynamic bucket-based array strategy when the expected
number of elements is high.
The proposed changes speed things up significantly and, for large numbers
of duplicates, allow unique indexes to be created in seconds instead of minutes.
MishaDemianenko committed Aug 15, 2017
1 parent d619578 commit 7c31a9e
Showing 7 changed files with 593 additions and 149 deletions.
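The commit message describes choosing a strategy from the expected number of entries. A minimal sketch of that selection logic, assuming a hypothetical factory method placed inside DuplicateCheckStrategy (only the threshold constant and the two strategy classes, shown in the new file below, come from the commit):

    // Hypothetical factory, not part of the commit; shown for orientation only.
    static DuplicateCheckStrategy chooseStrategy( int expectedNumberOfEntries )
    {
        return expectedNumberOfEntries <= BucketsDuplicateCheckStrategy.BUCKET_STRATEGY_ENTRIES_THRESHOLD
               ? new MapDuplicateCheckStrategy( expectedNumberOfEntries )
               : new BucketsDuplicateCheckStrategy( expectedNumberOfEntries );
    }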
@@ -28,7 +28,6 @@
import org.neo4j.values.storable.ValueTuple;

import static java.lang.String.format;
-import static java.lang.String.valueOf;
import static org.neo4j.kernel.api.StatementConstants.NO_SUCH_NODE;

/**
@@ -40,7 +39,7 @@ public class IndexEntryConflictException extends Exception
    private final long addedNodeId;
    private final long existingNodeId;

-    public IndexEntryConflictException( long existingNodeId, long addedNodeId, Value propertyValue )
+    public IndexEntryConflictException( long existingNodeId, long addedNodeId, Value... propertyValue )
    {
        this( existingNodeId, addedNodeId, ValueTuple.of( propertyValue ) );
    }
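With the varargs constructor, composite-index callers can report a conflict over several property values directly. A hedged sketch (the call site, node ids, and values are invented; Values.of is assumed from org.neo4j.values.storable):

    // Hypothetical call site for a two-property composite index:
    static void reportConflict( long existingNodeId, long addedNodeId ) throws IndexEntryConflictException
    {
        throw new IndexEntryConflictException( existingNodeId, addedNodeId,
                Values.of( "Alice" ), Values.of( "Smith" ) );
    }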
@@ -22,114 +22,34 @@
import org.apache.lucene.document.Document;

import java.io.IOException;
-import java.util.Arrays;

import org.neo4j.kernel.api.StatementConstants;
import org.neo4j.kernel.api.exceptions.KernelException;
import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
import org.neo4j.kernel.api.impl.schema.LuceneDocumentStructure;
import org.neo4j.kernel.api.index.PropertyAccessor;
import org.neo4j.values.storable.Value;
-import org.neo4j.values.storable.ValueTuple;

public class CompositeDuplicateCheckingCollector extends DuplicateCheckingCollector
{
    private final int[] propertyKeyIds;
-    private CompositeEntrySet actualValues;

-    public CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
+    CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
    {
-        super( accessor, -1 );
+        super( accessor, StatementConstants.NO_SUCH_PROPERTY_KEY );
        this.propertyKeyIds = propertyKeyIds;
-        actualValues = new CompositeEntrySet();
    }

    @Override
    protected void doCollect( int doc ) throws IOException, KernelException, IndexEntryConflictException
    {
        Document document = reader.document( doc );
        long nodeId = LuceneDocumentStructure.getNodeId( document );
-        Value[] reference = new Value[propertyKeyIds.length];
-        for ( int i = 0; i < reference.length; i++ )
+        Value[] values = new Value[propertyKeyIds.length];
+        for ( int i = 0; i < values.length; i++ )
        {
-            reference[i] = accessor.getPropertyValue( nodeId, propertyKeyIds[i] );
+            values[i] = accessor.getPropertyValue( nodeId, propertyKeyIds[i] );
        }
-
-        // We either have to find the first conflicting entry set element,
-        // or append one for the property we just fetched:
-        CompositeEntrySet currentEntrySet = actualValues;
-        scan:
-        do
-        {
-            for ( int i = 0; i < CompositeEntrySet.INCREMENT; i++ )
-            {
-                Value[] currentValues = currentEntrySet.values[i];
-
-                if ( currentEntrySet.nodeId[i] == StatementConstants.NO_SUCH_NODE )
-                {
-                    currentEntrySet.values[i] = reference;
-                    currentEntrySet.nodeId[i] = nodeId;
-                    if ( i == CompositeEntrySet.INCREMENT - 1 )
-                    {
-                        currentEntrySet.next = new CompositeEntrySet();
-                    }
-                    break scan;
-                }
-                else if ( propertyValuesEqual( reference, currentValues ) )
-                {
-                    throw new IndexEntryConflictException(
-                            currentEntrySet.nodeId[i], nodeId, ValueTuple.of( currentValues ) );
-                }
-            }
-            currentEntrySet = currentEntrySet.next;
-        }
-        while ( currentEntrySet != null );
-    }
-
-    private boolean propertyValuesEqual( Value[] properties, Value[] values )
-    {
-        if ( properties.length != values.length )
-        {
-            return false;
-        }
-        for ( int i = 0; i < properties.length; i++ )
-        {
-            if ( !properties[i].equals( values[i] ) )
-            {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    @Override
-    public boolean needsScores()
-    {
-        return false;
-    }
-
-    public void reset()
-    {
-        actualValues = new CompositeEntrySet();
-    }
-
-    /**
-     * A small struct of arrays of nodeId + array of property values, with a next pointer.
-     * Should exhibit fairly fast linear iteration, small memory overhead and dynamic growth.
-     * <p>
-     * NOTE: Must always call reset() before use!
-     */
-    private static class CompositeEntrySet
-    {
-        static final int INCREMENT = 10000;
-
-        Value[][] values = new Value[INCREMENT][];
-        long[] nodeId = new long[INCREMENT];
-        CompositeEntrySet next;
-
-        CompositeEntrySet()
-        {
-            Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
-        }
-    }
+        duplicateCheckStrategy.checkForDuplicate( values, nodeId );
+    }
}
@@ -0,0 +1,238 @@
/*
* Copyright (c) 2002-2017 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.kernel.api.impl.schema.verification;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.neo4j.kernel.api.StatementConstants;
import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
import org.neo4j.values.storable.Value;
import org.neo4j.values.storable.ValueTuple;

import static java.lang.Math.max;
import static java.lang.Math.min;

/**
 * Base class for strategies used to check for duplicates while verifying value uniqueness during
 * constraint creation.
 *
 * Each strategy determines how the uniqueness check is performed, and how values are accumulated
 * and stored, to keep check time and resource consumption optimal.
 */
abstract class DuplicateCheckStrategy
{
    /**
     * Check uniqueness of multiple properties that belong to the node with the provided node id.
     * @param values property values
     * @param nodeId id of the checked node
     * @throws IndexEntryConflictException if a duplicate is found
     */
    abstract void checkForDuplicate( Value[] values, long nodeId )
            throws IndexEntryConflictException;

    /**
     * Check uniqueness of a single property that belongs to the node with the provided node id.
     * @param value property value
     * @param nodeId id of the checked node
     * @throws IndexEntryConflictException if a duplicate is found
     */
    abstract void checkForDuplicate( Value value, long nodeId ) throws IndexEntryConflictException;

    private static boolean propertyValuesEqual( Value[] properties, Value[] values )
    {
        if ( properties.length != values.length )
        {
            return false;
        }
        for ( int i = 0; i < properties.length; i++ )
        {
            if ( !properties[i].equals( values[i] ) )
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Duplicate check strategy that uses a plain hash map. Should be optimal for a small number of entries.
     */
    static class MapDuplicateCheckStrategy extends DuplicateCheckStrategy
    {
        private Map<Object,Long> valueNodeIdMap;

        MapDuplicateCheckStrategy( int expectedNumberOfEntries )
        {
            this.valueNodeIdMap = new HashMap<>( expectedNumberOfEntries );
        }

        @Override
        public void checkForDuplicate( Value[] values, long nodeId )
                throws IndexEntryConflictException
        {
            Long previousNodeId = valueNodeIdMap.put( ValueTuple.of( values ), nodeId );
            if ( previousNodeId != null )
            {
                throw new IndexEntryConflictException( previousNodeId, nodeId, ValueTuple.of( values ) );
            }
        }

        @Override
        void checkForDuplicate( Value value, long nodeId ) throws IndexEntryConflictException
        {
            Long previousNodeId = valueNodeIdMap.put( value, nodeId );
            if ( previousNodeId != null )
            {
                throw new IndexEntryConflictException( previousNodeId, nodeId, value );
            }
        }
    }
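    // Design note (editorial, not part of the commit): Map.put returns the previous
    // mapping for the key, so each checked node costs a single hash-map operation and
    // the first conflicting node id is immediately available for the exception.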

    /**
     * Strategy that uses arrays to store entries and hash codes to spread those entries over different buckets.
     * The number of buckets and the size of each entry block are dynamic, computed from the expected
     * number of duplicates.
     */
    static class BucketsDuplicateCheckStrategy extends DuplicateCheckStrategy
    {
        private static final int BASE_ENTRY_SIZE = 1000;
        private static final int DEFAULT_BUCKETS = 10;
        static final int BUCKET_STRATEGY_ENTRIES_THRESHOLD = BASE_ENTRY_SIZE * DEFAULT_BUCKETS;

        private static final int MAX_NUMBER_OF_BUCKETS = 100;
        private final int numberOfBuckets;
        private BucketEntry[] buckets;
        private final int bucketSetSize;

        BucketsDuplicateCheckStrategy()
        {
            this( BUCKET_STRATEGY_ENTRIES_THRESHOLD );
        }

        BucketsDuplicateCheckStrategy( int expectedNumberOfEntries )
        {
            numberOfBuckets = min( MAX_NUMBER_OF_BUCKETS, (expectedNumberOfEntries / BASE_ENTRY_SIZE) + 1 );
            buckets = new BucketEntry[numberOfBuckets];
            bucketSetSize = max( 100, BUCKET_STRATEGY_ENTRIES_THRESHOLD / numberOfBuckets );
        }
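        // Worked example (editorial, not part of the commit): expectedNumberOfEntries = 25_000
        // gives numberOfBuckets = min( 100, 25_000 / 1_000 + 1 ) = 26 and
        // bucketSetSize = max( 100, 10_000 / 26 ) = 384.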

        @Override
        public void checkForDuplicate( Value[] values, long nodeId )
                throws IndexEntryConflictException
        {
            BucketEntry current = bucketEntrySet( Arrays.hashCode( values ), bucketSetSize );

            // We either have to find the first conflicting entry set element,
            // or append one for the property we just fetched:
            scan:
            do
            {
                for ( int i = 0; i < bucketSetSize; i++ )
                {
                    Value[] currentValues = (Value[]) current.value[i];

                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
                    {
                        current.value[i] = values;
                        current.nodeId[i] = nodeId;
                        if ( i == bucketSetSize - 1 )
                        {
                            current.next = new BucketEntry( bucketSetSize );
                        }
                        break scan;
                    }
                    else if ( propertyValuesEqual( values, currentValues ) )
                    {
                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, currentValues );
                    }
                }
                current = current.next;
            }
            while ( current != null );
        }

        @Override
        void checkForDuplicate( Value propertyValue, long nodeId ) throws IndexEntryConflictException
        {
            BucketEntry current = bucketEntrySet( propertyValue.hashCode(), bucketSetSize );

            // We either have to find the first conflicting entry set element,
            // or append one for the property we just fetched:
            scan:
            do
            {
                for ( int i = 0; i < bucketSetSize; i++ )
                {
                    Value value = (Value) current.value[i];

                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
                    {
                        current.value[i] = propertyValue;
                        current.nodeId[i] = nodeId;
                        if ( i == bucketSetSize - 1 )
                        {
                            current.next = new BucketEntry( bucketSetSize );
                        }
                        break scan;
                    }
                    else if ( propertyValue.equals( value ) )
                    {
                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, value );
                    }
                }
                current = current.next;
            }
            while ( current != null );
        }

        private BucketEntry bucketEntrySet( int hashCode, int entrySetSize )
        {
            int bucket = Math.abs( hashCode ) % numberOfBuckets;
            BucketEntry current = buckets[bucket];
            if ( current == null )
            {
                current = new BucketEntry( entrySetSize );
                buckets[bucket] = current;
            }
            return current;
        }

        /**
         * Each bucket entry contains arrays of node ids and their corresponding values, plus a link to the
         * next BucketEntry in the chain for cases where there is more data than fits in one entry. Bucket
         * entries thus form a chain that represents the values in a particular bucket.
         */
        private static class BucketEntry
        {
            final Object[] value;
            final long[] nodeId;
            BucketEntry next;

            BucketEntry( int entrySize )
            {
                value = new Object[entrySize];
                nodeId = new long[entrySize];
                Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
            }
        }
    }
}
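A hedged usage sketch of the strategies above (editorial, not part of the commit; the types are package-private, so this would have to live in the same package, and Values.of is assumed from org.neo4j.values.storable):

    // Two nodes carrying an equal value tuple: the second check throws.
    static void duplicateIsReported() throws IndexEntryConflictException
    {
        DuplicateCheckStrategy strategy = new DuplicateCheckStrategy.BucketsDuplicateCheckStrategy( 50000 );
        Value[] tuple = { Values.of( "Alice" ), Values.of( "Smith" ) };
        strategy.checkForDuplicate( tuple, 1 );         // first occurrence is recorded
        strategy.checkForDuplicate( tuple.clone(), 2 ); // equal values: IndexEntryConflictException
    }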
