From 00774813cc01101247829e05ba31d3eed108ce55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mattias=20Finn=C3=A9?= Date: Wed, 27 Mar 2019 18:06:50 +0100 Subject: [PATCH] More javadoc, lookup optimizations and multi-entity-tokens fixes --- .../api/index/SchemaDescriptorLookupSet.java | 215 ++++++++++++++---- .../index/SchemaDescriptorLookupSetTest.java | 12 +- 2 files changed, 174 insertions(+), 53 deletions(-) diff --git a/community/kernel/src/main/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSet.java b/community/kernel/src/main/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSet.java index f1d5275a7b4b0..c6a50c0c7f960 100644 --- a/community/kernel/src/main/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSet.java +++ b/community/kernel/src/main/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSet.java @@ -33,15 +33,15 @@ import static java.lang.Math.toIntExact; /** - * Collects and provides efficient access to {@link SchemaDescriptor}, based on entity token ids and partial or full list of properties. - * The descriptors are first grouped by entity token id and then for a specific entity token id they are further grouped by property id. - * The property grouping works on sorted property key id lists as to minimize deduplication when selecting for composite indexes. + * Collects and provides efficient access to {@link SchemaDescriptor}, based on complete list of entity tokens and partial or complete list of property keys. + * The descriptors are first grouped by entity token and then further grouped by property key. The grouping works on sorted token lists + * to minimize deduplication when doing lookups, especially for composite and multi-entity descriptors. *

- * The selection works efficiently when providing complete list of properties, but will have to resort to some amount of over-selection and union - * when caller can only provide partial list of properties. The most efficient linking starts from entity token ids and traverses through the property key ids - * of the schema descriptor (single or multiple for composite index) in sorted order where each property key id (for this entity token) links to the next - * property key. From all leaves along the way the descriptors can be collected. This way only the actually matching indexes will be collected - * instead of collecting all indexes matching any property key and doing a union of those. Example: + * The selection works best when providing a complete list of property keys and will have to resort to some amount of over-selection + * when caller can only provide partial list of property keys. Selection starts by traversing through the entity tokens, and for each + * found leaf continues traversing through the property keys. When adding descriptors the token lists are sorted and token lists passed + * into lookup methods also have to be sorted. This way the internal data structure can be represented as chains of tokens and traversal only + * needs to go through the token chains in one order (the sorted order). A visualization of the internal data structure of added descriptors: * *

  *     Legend: ids inside [] are entity tokens, ids inside () are properties
@@ -52,10 +52,14 @@
  *     D: [0](3, 4, 7)
  *     E: [1](7)
  *     F: [1](5, 6)
- *     TODO add multi-entity descriptors
+ *     G: [0, 1](3, 4)
+ *     H: [1, 0](3)
  *
  *     Will result in a data structure (for the optimized path):
  *     [0]
+ *        -> [1]
+ *           -> (3): H
+ *              -> (4): G
  *        -> (3)
  *           -> (4): C
  *              -> (7): A, D
@@ -72,18 +76,30 @@ public class SchemaDescriptorLookupSet
     private final MutableIntObjectMap byFirstEntityToken = IntObjectMaps.mutable.empty();
     private final MutableIntObjectMap byAnyEntityToken = IntObjectMaps.mutable.empty();
 
+    /**
+     * @return whether or not this set is empty, i.e. {@code true} if no descriptors have been added.
+     */
     boolean isEmpty()
     {
         return byFirstEntityToken.isEmpty();
     }
 
+    /**
+     * Cheap way of finding out whether or not there are any descriptors matching the set of entity token ids and the property key id.
+     *
+     * @param entityTokenIds complete list of entity token ids for the entity to check.
+     * @param propertyKey a property key id to check.
+     * @return {@code true} if there are one or more descriptors matching the given tokens.
+     */
     boolean has( long[] entityTokenIds, int propertyKey )
     {
-        if ( byFirstEntityToken.isEmpty() )
+        // Abort right away if there are no descriptors at all
+        if ( isEmpty() )
         {
             return false;
         }
 
+        // Check if there are any descriptors that matches any of the first (or only) entity token
         for ( int i = 0; i < entityTokenIds.length; i++ )
         {
             EntityMultiSet first = byFirstEntityToken.get( toIntExact( entityTokenIds[i] ) );
@@ -95,14 +111,25 @@ boolean has( long[] entityTokenIds, int propertyKey )
         return false;
     }
 
+    /**
+     * Cheap way of finding out whether or not there are any descriptors matching the given entity token id.
+     *
+     * @param entityTokenId entity token id to check.
+     * @return {@code true} if there are one or more descriptors matching the given entity token.
+     */
     boolean has( int entityTokenId )
     {
-        return !byAnyEntityToken.isEmpty() && byAnyEntityToken.containsKey( entityTokenId );
+        return byAnyEntityToken.containsKey( entityTokenId );
     }
 
+    /**
+     * Adds the given descriptor to this set so that it can be looked up from any of the lookup methods.
+     *
+     * @param schemaDescriptor the descriptor to add.
+     */
     public void add( T schemaDescriptor )
     {
-        int[] entityTokenIds = schemaDescriptor.schema().getEntityTokenIds();
+        int[] entityTokenIds = sortedEntityTokenIds( schemaDescriptor.schema() );
         int firstEntityTokenId = entityTokenIds[0];
         byFirstEntityToken.getIfAbsentPut( firstEntityTokenId, EntityMultiSet::new ).add( schemaDescriptor, entityTokenIds, 0 );
 
@@ -112,9 +139,15 @@ public void add( T schemaDescriptor )
         }
     }
 
+    /**
+     * Removes the given descriptor from this set so that it can no longer be looked up from the lookup methods.
+     * This operation is idempotent.
+     *
+     * @param schemaDescriptor the descriptor to remove.
+     */
     public void remove( T schemaDescriptor )
     {
-        int[] entityTokenIds = schemaDescriptor.schema().getEntityTokenIds();
+        int[] entityTokenIds = sortedEntityTokenIds( schemaDescriptor.schema() );
         int firstEntityTokenId = entityTokenIds[0];
         EntityMultiSet first = byFirstEntityToken.get( firstEntityTokenId );
         if ( first != null && first.remove( schemaDescriptor, entityTokenIds, 0 ) )
@@ -132,6 +165,14 @@ public void remove( T schemaDescriptor )
         }
     }
 
+    /**
+     * Collects descriptors matching the given complete list of entity tokens and property key tokens.
+     * I.e. all tokens of the matching descriptors can be found in the given lists of tokens.
+     *
+     * @param into {@link Collection} to add matching descriptors into.
+     * @param entityTokenIds complete and sorted array of entity token ids for the entity.
+     * @param sortedProperties complete and sorted array of property key token ids for the entity.
+     */
     void matchingDescriptorsForCompleteListOfProperties( Collection into, long[] entityTokenIds, int[] sortedProperties )
     {
         for ( int i = 0; i < entityTokenIds.length; i++ )
@@ -145,6 +186,17 @@ void matchingDescriptorsForCompleteListOfProperties( Collection into, long[]
         }
     }
 
+    /**
+     * Collects descriptors matching the given complete list of entity tokens, but only partial list of property key tokens.
+     * I.e. all entity tokens of the matching descriptors can be found in the given lists of tokens,
+     * but some matching descriptors may have other property key tokens in addition to those found in the given properties of the entity.
+     * This is for a scenario where the complete list of property key tokens isn't known when calling this method and may
+     * collect additional descriptors that in the end isn't relevant for the specific entity.
+     *
+     * @param into {@link Collection} to add matching descriptors into.
+     * @param entityTokenIds complete and sorted array of entity token ids for the entity.
+     * @param sortedProperties complete and sorted array of property key token ids for the entity.
+     */
     void matchingDescriptorsForPartialListOfProperties( Collection into, long[] entityTokenIds, int[] sortedProperties )
     {
         for ( int i = 0; i < entityTokenIds.length; i++ )
@@ -158,6 +210,12 @@ void matchingDescriptorsForPartialListOfProperties( Collection into, long[] e
         }
     }
 
+    /**
+     * Collects descriptors matching the given complete list of entity tokens.
+     *
+     * @param into {@link Collection} to add matching descriptors into.
+     * @param entityTokenIds complete and sorted array of entity token ids for the entity.
+     */
     void matchingDescriptors( Collection into, long[] entityTokenIds )
     {
         for ( int i = 0; i < entityTokenIds.length; i++ )
@@ -210,14 +268,17 @@ boolean remove( T schemaDescriptor, int[] entityTokenIds, int cursor )
         void collectForCompleteListOfProperties( Collection into, long[] entityTokenIds, int entityTokenCursor, int[] sortedProperties )
         {
             propertyMultiSet.collectForCompleteListOfProperties( into, sortedProperties );
-            // TODO potentially optimize be checking if there even are any next at all before looping? Same thing could be done in PropertyMultiSet
-            for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
+
+            if ( !next.isEmpty() )
             {
-                int nextEntityTokenId = toIntExact( entityTokenIds[i] );
-                EntityMultiSet nextSet = next.get( nextEntityTokenId );
-                if ( nextSet != null )
+                for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
                 {
-                    nextSet.collectForCompleteListOfProperties( into, entityTokenIds, i, sortedProperties );
+                    int nextEntityTokenId = toIntExact( entityTokenIds[i] );
+                    EntityMultiSet nextSet = next.get( nextEntityTokenId );
+                    if ( nextSet != null )
+                    {
+                        nextSet.collectForCompleteListOfProperties( into, entityTokenIds, i, sortedProperties );
+                    }
                 }
             }
         }
@@ -225,14 +286,17 @@ void collectForCompleteListOfProperties( Collection into, long[] entityTokenI
         void collectForPartialListOfProperties( Collection into, long[] entityTokenIds, int entityTokenCursor, int[] sortedProperties )
         {
             propertyMultiSet.collectForPartialListOfProperties( into, sortedProperties );
-            // TODO potentially optimize be checking if there even are any next at all before looping? Same thing could be done in PropertyMultiSet
-            for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
+
+            if ( !next.isEmpty() )
             {
-                int nextEntityTokenId = toIntExact( entityTokenIds[i] );
-                EntityMultiSet nextSet = next.get( nextEntityTokenId );
-                if ( nextSet != null )
+                for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
                 {
-                    nextSet.collectForPartialListOfProperties( into, entityTokenIds, i, sortedProperties );
+                    int nextEntityTokenId = toIntExact( entityTokenIds[i] );
+                    EntityMultiSet nextSet = next.get( nextEntityTokenId );
+                    if ( nextSet != null )
+                    {
+                        nextSet.collectForPartialListOfProperties( into, entityTokenIds, i, sortedProperties );
+                    }
                 }
             }
         }
@@ -240,13 +304,17 @@ void collectForPartialListOfProperties( Collection into, long[] entityTokenId
         void collectAll( Collection into, long[] entityTokenIds, int entityTokenCursor )
         {
             propertyMultiSet.collectAll( into );
-            for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
+
+            if ( !next.isEmpty() )
             {
-                int nextEntityTokenId = toIntExact( entityTokenIds[i] );
-                EntityMultiSet nextSet = next.get( nextEntityTokenId );
-                if ( nextSet != null )
+                for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
                 {
-                    nextSet.collectAll( into, entityTokenIds, i );
+                    int nextEntityTokenId = toIntExact( entityTokenIds[i] );
+                    EntityMultiSet nextSet = next.get( nextEntityTokenId );
+                    if ( nextSet != null )
+                    {
+                        nextSet.collectAll( into, entityTokenIds, i );
+                    }
                 }
             }
         }
@@ -257,19 +325,47 @@ boolean has( long[] entityTokenIds, int entityTokenCursor, int propertyKey )
             {
                 return true;
             }
-            for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
+            if ( !next.isEmpty() )
             {
-                int nextEntityTokenId = toIntExact( entityTokenIds[i] );
-                EntityMultiSet nextSet = next.get( nextEntityTokenId );
-                if ( nextSet != null && nextSet.has( entityTokenIds, i, propertyKey ) )
+                for ( int i = entityTokenCursor + 1; i < entityTokenIds.length; i++ )
                 {
-                    return true;
+                    int nextEntityTokenId = toIntExact( entityTokenIds[i] );
+                    EntityMultiSet nextSet = next.get( nextEntityTokenId );
+                    if ( nextSet != null && nextSet.has( entityTokenIds, i, propertyKey ) )
+                    {
+                        return true;
+                    }
                 }
             }
             return false;
         }
     }
 
+    /**
+     * A starting point for traversal of property key tokens from an entity token leaf.
+     * Contains starting points of property key ids chains as well as lookup by any property in the chain.
+     * Roughly like this:
+     *
+     * 
+     *     Descriptors:
+     *     A: [0](5, 7)
+     *     B: [0, 1](5, 4)
+     *     C: [0, 1](4)
+     *     D: [1, 0](6)
+     *
+     *     Data structure:
+     *     [0]
+     *        -> [1]
+     *           -> (4): C
+     *              -> (5): B
+     *           -> (6): D
+     *        -> (5)
+     *           -> (7): A
+     * 
+ * + * In the above layout the [0] and [0] -> [1] are entity token "leaves" which each have a {@link PropertyMultiSet}, + * constituting the flip in the traversal between entity token chain and property chain. + */ private class PropertyMultiSet { private final Set descriptors = new HashSet<>(); @@ -353,17 +449,6 @@ void collectAll( Collection descriptors ) descriptors.addAll( this.descriptors ); } - private int[] sortedPropertyKeyIds( SchemaDescriptor schemaDescriptor ) - { - int[] propertyKeyIds = schemaDescriptor.getPropertyIds(); - if ( propertyKeyIds.length > 1 ) - { - propertyKeyIds = propertyKeyIds.clone(); - Arrays.sort( propertyKeyIds ); - } - return propertyKeyIds; - } - boolean has( int propertyKey ) { return byAnyProperty.containsKey( propertyKey ); @@ -375,6 +460,12 @@ boolean isEmpty() } } + /** + * A single item in a property key chain. Sort of a subset, or a more specific version, of {@link PropertyMultiSet}. + * It has a set containing descriptors that have their property chain end with this property key token and next pointers + * to other property key tokens in known chains, but no way of looking up descriptors by any part of the chain, + * like {@link PropertyMultiSet} has. + */ private class PropertySet { private final Set fullDescriptors = new HashSet<>(); @@ -420,14 +511,38 @@ boolean remove( T schemaDescriptor, int[] propertyKeyIds, int cursor ) void collectForCompleteListOfProperties( Collection descriptors, int[] sortedProperties, int cursor ) { descriptors.addAll( fullDescriptors ); - for ( int i = cursor + 1; i < sortedProperties.length; i++ ) + if ( !next.isEmpty() ) { - PropertySet nextSet = next.get( sortedProperties[i] ); - if ( nextSet != null ) + for ( int i = cursor + 1; i < sortedProperties.length; i++ ) { - nextSet.collectForCompleteListOfProperties( descriptors, sortedProperties, i ); + PropertySet nextSet = next.get( sortedProperties[i] ); + if ( nextSet != null ) + { + nextSet.collectForCompleteListOfProperties( descriptors, sortedProperties, i ); + } } } } } + + private static int[] sortedPropertyKeyIds( SchemaDescriptor schemaDescriptor ) + { + return sortedTokenIds( schemaDescriptor.getPropertyIds() ); + } + + private static int[] sortedEntityTokenIds( SchemaDescriptor schemaDescriptor ) + { + return sortedTokenIds( schemaDescriptor.getEntityTokenIds() ); + } + + private static int[] sortedTokenIds( int[] tokenIds ) + { + if ( tokenIds.length > 1 ) + { + // Clone it because we don't know if the array was an internal array that the descriptor handed out + tokenIds = tokenIds.clone(); + Arrays.sort( tokenIds ); + } + return tokenIds; + } } diff --git a/community/kernel/src/test/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSetTest.java b/community/kernel/src/test/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSetTest.java index b0fb3891b1570..706dfbd07b8f3 100644 --- a/community/kernel/src/test/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSetTest.java +++ b/community/kernel/src/test/java/org/neo4j/kernel/impl/api/index/SchemaDescriptorLookupSetTest.java @@ -236,15 +236,15 @@ private static Set expectedDescriptors( List private SchemaDescriptor randomSchemaDescriptor( int highEntityKeyId, int highPropertyKeyId, int maxNumberOfEntityKeys, int maxNumberOfPropertyKeys ) { int numberOfEntityKeys = random.nextInt( 1, maxNumberOfEntityKeys ); - int[] entityKeys = randomUniqueSortedIntArray( highEntityKeyId, numberOfEntityKeys ); + int[] entityKeys = randomUniqueUnsortedIntArray( highEntityKeyId, numberOfEntityKeys ); int numberOfPropertyKeys = random.nextInt( 1, maxNumberOfPropertyKeys ); - int[] propertyKeys = randomUniqueSortedIntArray( highPropertyKeyId, numberOfPropertyKeys ); + int[] propertyKeys = randomUniqueUnsortedIntArray( highPropertyKeyId, numberOfPropertyKeys ); return entityKeys.length > 1 ? SchemaDescriptorFactory.multiToken( entityKeys, EntityType.NODE, propertyKeys ) : SchemaDescriptorFactory.forLabel( entityKeys[0], propertyKeys ); } - private int[] randomUniqueSortedIntArray( int maxValue, int length ) + private int[] randomUniqueUnsortedIntArray( int maxValue, int length ) { int[] array = new int[length]; MutableIntSet seen = IntSets.mutable.empty(); @@ -258,6 +258,12 @@ private int[] randomUniqueSortedIntArray( int maxValue, int length ) while ( !seen.add( candidate ) ); array[i] = candidate; } + return array; + } + + private int[] randomUniqueSortedIntArray( int maxValue, int length ) + { + int[] array = randomUniqueUnsortedIntArray( maxValue, length ); Arrays.sort( array ); return array; }