Uniqueness check with different duplicate check strategies.

Introduce duplicate checking strategies: use a simple map-based strategy when the expected number of elements is small enough, and a dynamic bucket-based array strategy when the expected number of elements is quite high. The proposed changes speed things up significantly and, for quite large numbers of duplicates, allow unique indexes to be created in seconds instead of minutes.
neo4j · Jul 27, 2017 · 6d000c3 · 6d000c3
1 parent 3b1b524
commit 6d000c3
Show file tree

Hide file tree

Showing 7 changed files with 628 additions and 146 deletions.
diff --git a/...rnel/src/main/java/org/neo4j/kernel/api/exceptions/index/IndexEntryConflictException.java b/...rnel/src/main/java/org/neo4j/kernel/api/exceptions/index/IndexEntryConflictException.java
@@ -38,7 +38,7 @@ public class IndexEntryConflictException extends Exception
     private final long addedNodeId;
     private final long existingNodeId;
 
-    public IndexEntryConflictException( long existingNodeId, long addedNodeId, Object propertyValue )
+    public IndexEntryConflictException( long existingNodeId, long addedNodeId, Object... propertyValue )
     {
         this( existingNodeId, addedNodeId, OrderedPropertyValues.ofUndefined( propertyValue ) );
     }

diff --git a/...va/org/neo4j/kernel/api/impl/schema/verification/CompositeDuplicateCheckingCollector.java b/...va/org/neo4j/kernel/api/impl/schema/verification/CompositeDuplicateCheckingCollector.java
@@ -22,26 +22,22 @@
 import org.apache.lucene.document.Document;
 
 import java.io.IOException;
-import java.util.Arrays;
 
 import org.neo4j.kernel.api.StatementConstants;
 import org.neo4j.kernel.api.exceptions.KernelException;
 import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
 import org.neo4j.kernel.api.impl.schema.LuceneDocumentStructure;
 import org.neo4j.kernel.api.index.PropertyAccessor;
 import org.neo4j.kernel.api.properties.Property;
-import org.neo4j.kernel.api.schema.OrderedPropertyValues;
 
 public class CompositeDuplicateCheckingCollector extends DuplicateCheckingCollector
 {
     private final int[] propertyKeyIds;
-    private CompositeEntrySet actualValues;
 
-    public CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
+    CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
     {
-        super(accessor, -1);
+        super( accessor, StatementConstants.NO_SUCH_PROPERTY_KEY);
         this.propertyKeyIds = propertyKeyIds;
-        actualValues = new CompositeEntrySet();
     }
 
     @Override
@@ -56,82 +52,6 @@ protected void doCollect( int doc ) throws IOException, KernelException, IndexEn
             properties[i] = accessor.getProperty( nodeId, propertyKeyIds[i] );
             values[i] = properties[i].value();
         }
-
-        // We either have to find the first conflicting entry set element,
-        // or append one for the property we just fetched:
-        CompositeEntrySet current = actualValues;
-        scan:
-        do
-        {
-            for ( int i = 0; i < CompositeEntrySet.INCREMENT; i++ )
-            {
-                Object[] currentValues = current.values[i];
-
-                if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
-                {
-                    current.values[i] = values;
-                    current.nodeId[i] = nodeId;
-                    if ( i == CompositeEntrySet.INCREMENT - 1 )
-                    {
-                        current.next = new CompositeEntrySet();
-                    }
-                    break scan;
-                }
-                else if ( propertyValuesEqual( properties, currentValues ) )
-                {
-                    throw new IndexEntryConflictException( current.nodeId[i], nodeId,
-                            OrderedPropertyValues.ofUndefined( currentValues ) );
-                }
-            }
-            current = current.next;
-        }
-        while ( current != null );
-    }
-
-    private boolean propertyValuesEqual( Property[] properties, Object[] values )
-    {
-        if ( properties.length != values.length )
-        {
-            return false;
-        }
-        for ( int i = 0; i < properties.length; i++ )
-        {
-            if ( !properties[i].valueEquals( values[i] ) )
-            {
-                return false;
-            }
-        }
-        return true;
-    }
-
-    @Override
-    public boolean needsScores()
-    {
-        return false;
-    }
-
-    public void reset()
-    {
-        actualValues = new CompositeEntrySet();
-    }
-
-    /**
-     * A small struct of arrays of nodeId + array of property values, with a next pointer.
-     * Should exhibit fairly fast linear iteration, small memory overhead and dynamic growth.
-     * <p>
-     * NOTE: Must always call reset() before use!
-     */
-    private static class CompositeEntrySet
-    {
-        static final int INCREMENT = 10000;
-
-        Object[][] values = new Object[INCREMENT][];
-        long[] nodeId = new long[INCREMENT];
-        CompositeEntrySet next;
-
-        CompositeEntrySet()
-        {
-            Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
-        }
+        duplicateCheckStrategy.checkForDuplicate( properties, values, nodeId );
     }
 }
diff --git a/...x/src/main/java/org/neo4j/kernel/api/impl/schema/verification/DuplicateCheckStrategy.java b/...x/src/main/java/org/neo4j/kernel/api/impl/schema/verification/DuplicateCheckStrategy.java
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2002-2017 "Neo Technology,"
+ * Network Engine for Objects in Lund AB [http://neotechnology.com]
+ *
+ * This file is part of Neo4j.
+ *
+ * Neo4j is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+package org.neo4j.kernel.api.impl.schema.verification;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.neo4j.kernel.api.StatementConstants;
+import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
+import org.neo4j.kernel.api.properties.Property;
+import org.neo4j.kernel.api.schema.OrderedPropertyValues;
+
+import static java.lang.Math.max;
+import static java.lang.Math.min;
+
+/**
+ * Base class for strategy used for duplicate check during verification of value uniqueness during
+ * constraint creation.
+ *
+ * Each particular strategy determines how the uniqueness check is done and how the checked values are accumulated
+ * and stored, in order to keep check time and resource consumption optimal.
+ */
+abstract class DuplicateCheckStrategy
+{
+    /**
+     * Check uniqueness of a combination of property values that belong to the node with the provided node id.
+     * The strategies in this file record the values together with the node id when no equal combination has
+     * been seen before, so that later calls can be compared against them.
+     *
+     * @param properties node properties, used to compare against previously seen values
+     * @param values property values; {@code values[i]} is expected to be the value of {@code properties[i]}
+     * @param nodeId id of the node being checked
+     * @throws IndexEntryConflictException if an equal combination of values was already seen for another entry
+     */
+    abstract void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
+            throws IndexEntryConflictException;
+
+    /**
+     * Check uniqueness of a single property value that belongs to the node with the provided node id.
+     * The strategies in this file record the entry together with the node id when no equal value has been
+     * seen before, so that later calls can be compared against it.
+     *
+     * @param property node property, used to compare against previously seen values
+     * @param value property value; presumably the value of {@code property} - verify against callers
+     * @param nodeId id of the node being checked
+     * @throws IndexEntryConflictException if an equal value was already seen for another entry
+     */
+    abstract void checkForDuplicate( Property property, Object value, long nodeId ) throws IndexEntryConflictException;
+
+    private static boolean propertyValuesEqual( Property[] properties, Object[] values )
+    {
+        if ( properties.length != values.length )
+        {
+            return false;
+        }
+        for ( int i = 0; i < properties.length; i++ )
+        {
+            if ( !properties[i].valueEquals( values[i] ) )
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Duplicate check strategy backed by a plain {@link HashMap} that maps each checked entry to the id of the
+     * node that provided it. Should be optimal for a small amount of entries.
+     */
+    static class MapDuplicateCheckStrategy extends DuplicateCheckStrategy
+    {
+        // Upper bound used to clamp the computed initial capacity.
+        private static final int MAX_MAP_CAPACITY = 1 << 30;
+
+        private final Map<Object,Long> valueNodeIdMap;
+
+        /**
+         * @param expectedNumberOfEntries number of entries the map is sized for
+         * @throws IllegalArgumentException if {@code expectedNumberOfEntries} is negative
+         */
+        MapDuplicateCheckStrategy( int expectedNumberOfEntries )
+        {
+            this.valueNodeIdMap = new HashMap<>( initialCapacity( expectedNumberOfEntries ) );
+        }
+
+        /**
+         * Translate an expected entry count into a HashMap initial capacity. HashMap resizes once its size
+         * exceeds capacity * 0.75 (default load factor), so the raw entry count would be too small and force
+         * a rehash before the expected number of entries is reached.
+         */
+        private static int initialCapacity( int expectedNumberOfEntries )
+        {
+            if ( expectedNumberOfEntries < 0 )
+            {
+                throw new IllegalArgumentException(
+                        "Illegal expected number of entries: " + expectedNumberOfEntries );
+            }
+            // long arithmetic avoids int overflow for very large expectations
+            return (int) Math.min( MAX_MAP_CAPACITY, expectedNumberOfEntries * 4L / 3 + 1 );
+        }
+
+        /**
+         * Register the combination of values for the node and fail if an equal combination is already known.
+         *
+         * @throws IndexEntryConflictException carrying the previously registered node id, the checked node id
+         * and the checked values
+         */
+        @Override
+        public void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
+                throws IndexEntryConflictException
+        {
+            // put() returns the node id stored under an equal key, or null on first occurrence
+            Long previousNodeId = valueNodeIdMap.put( new PropertyValues( properties, values ), nodeId );
+            if ( previousNodeId != null )
+            {
+                throw new IndexEntryConflictException( previousNodeId, nodeId,
+                        OrderedPropertyValues.ofUndefined( values ) );
+            }
+        }
+
+        /**
+         * Register the property for the node and fail if an equal property is already known. The property
+         * itself is the map key, so equality is whatever {@code Property.equals} defines.
+         *
+         * @throws IndexEntryConflictException carrying the previously registered node id, the checked node id
+         * and the checked value
+         */
+        @Override
+        void checkForDuplicate( Property property, Object value, long nodeId ) throws IndexEntryConflictException
+        {
+            Long previousNodeId = valueNodeIdMap.put( property, nodeId );
+            if ( previousNodeId != null )
+            {
+                throw new IndexEntryConflictException( previousNodeId, nodeId, value );
+            }
+        }
+
+        /**
+         * Map key for a combination of properties: hashed through the properties, compared through
+         * {@code propertyValuesEqual} against the other key's raw values.
+         */
+        private static class PropertyValues
+        {
+            private final Property[] properties;
+            private final Object[] values;
+
+            PropertyValues( Property[] properties, Object[] values )
+            {
+                this.properties = properties;
+                this.values = values;
+            }
+
+            @Override
+            public boolean equals( Object o )
+            {
+                if ( this == o )
+                {
+                    return true;
+                }
+                if ( o == null || getClass() != o.getClass() )
+                {
+                    return false;
+                }
+
+                PropertyValues that = (PropertyValues) o;
+                return propertyValuesEqual( properties, that.values );
+            }
+
+            // NOTE(review): equality is decided by Property.valueEquals while the hash comes from
+            // Property.hashCode; presumably the two are consistent - confirm in Property implementations.
+            @Override
+            public int hashCode()
+            {
+                int result = 0;
+                for ( Property property : properties )
+                {
+                    result = 31 * (result + property.hashCode());
+                }
+                return result;
+            }
+        }
+    }
+
+    /**
+     * Strategy that uses arrays to store entries and uses hash codes to split those entries over different buckets.
+     * Number of buckets and size of entries block are dynamic and evaluated based on expected number of duplicates.
+     */
+    static class BucketsDuplicateCheckStrategy extends DuplicateCheckStrategy
+    {
+        private static final int BASE_ENTRY_SIZE = 1000;
+        private static final int DEFAULT_BUCKETS = 10;
+        static final int BUCKET_STRATEGY_ENTRIES_THRESHOLD = BASE_ENTRY_SIZE * DEFAULT_BUCKETS;
+
+        private static final int MAX_NUMBER_OF_BUCKETS = 100;
+        private final int numberOfBuckets;
+        private EntrySet[] actualValues;
+        private final int entrySetSize;
+
+        BucketsDuplicateCheckStrategy()
+        {
+            this( BUCKET_STRATEGY_ENTRIES_THRESHOLD );
+        }
+
+        BucketsDuplicateCheckStrategy( int expectedNumberOfEntries )
+        {
+            numberOfBuckets = min( MAX_NUMBER_OF_BUCKETS, (expectedNumberOfEntries / BASE_ENTRY_SIZE) + 1 );
+            actualValues = new EntrySet[numberOfBuckets];
+            entrySetSize = max( 100, BUCKET_STRATEGY_ENTRIES_THRESHOLD / numberOfBuckets );
+        }
+
+        @Override
+        public void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
+                throws IndexEntryConflictException
+        {
+            EntrySet current = bucketEntrySet( Arrays.hashCode( values ), entrySetSize );
+
+            // We either have to find the first conflicting entry set element,
+            // or append one for the property we just fetched:
+            scan:
+            do
+            {
+                for ( int i = 0; i < entrySetSize; i++ )
+                {
+                    Object[] currentValues = (Object[])current.value[i];
+
+                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
+                    {
+                        current.value[i] = values;
+                        current.nodeId[i] = nodeId;
+                        if ( i == entrySetSize - 1 )
+                        {
+                            current.next = new EntrySet( entrySetSize );
+                        }
+                        break scan;
+                    }
+                    else if ( propertyValuesEqual( properties, currentValues ) )
+                    {
+                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, currentValues );
+                    }
+                }
+                current = current.next;
+            }
+            while ( current != null );
+        }
+
+        @Override
+        void checkForDuplicate( Property property, Object propertyValue, long nodeId ) throws IndexEntryConflictException
+        {
+            EntrySet current = bucketEntrySet( propertyValue.hashCode(), entrySetSize );
+
+            // We either have to find the first conflicting entry set element,
+            // or append one for the property we just fetched:
+            scan:
+            do
+            {
+                for ( int i = 0; i < entrySetSize; i++ )
+                {
+                    Object value = current.value[i];
+
+                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
+                    {
+                        current.value[i] = propertyValue;
+                        current.nodeId[i] = nodeId;
+                        if ( i == entrySetSize - 1 )
+                        {
+                            current.next = new EntrySet( entrySetSize );
+                        }
+                        break scan;
+                    }
+                    else if ( property.valueEquals( value ) )
+                    {
+                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, value );
+                    }
+                }
+                current = current.next;
+            }
+            while ( current != null );
+        }
+
+        private EntrySet bucketEntrySet( int hashCode, int entrySetSize )
+        {
+            int bucket = Math.abs( hashCode ) % numberOfBuckets;
+            EntrySet current = actualValues[bucket];
+            if ( current == null )
+            {
+                current = new EntrySet( entrySetSize );
+                actualValues[bucket] = current;
+            }
+            return current;
+        }
+
+        private static class EntrySet
+        {
+            final Object[] value;
+            final long[] nodeId;
+            EntrySet next;
+
+            EntrySet( int entrySize )
+            {
+                value = new Object[entrySize];
+                nodeId = new long[entrySize];
+                Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
+            }
+        }
+    }
+}