Skip to content

Commit

Permalink
Uniqueness check with different duplicate check strategies.
Browse files Browse the repository at this point in the history
Introduce duplicate checking strategies:
use a simple map strategy for cases when the expected number of elements is
small enough, and a dynamic bucket-based array strategy when the expected
number of elements is quite high.
The proposed changes speed things up significantly and, for quite large numbers of
duplicates, allow unique indexes to be created in seconds instead of minutes.
  • Loading branch information
MishaDemianenko committed Jul 27, 2017
1 parent 3b1b524 commit 6d000c3
Show file tree
Hide file tree
Showing 7 changed files with 628 additions and 146 deletions.
Expand Up @@ -38,7 +38,7 @@ public class IndexEntryConflictException extends Exception
private final long addedNodeId;
private final long existingNodeId;

public IndexEntryConflictException( long existingNodeId, long addedNodeId, Object propertyValue )
public IndexEntryConflictException( long existingNodeId, long addedNodeId, Object... propertyValue )
{
this( existingNodeId, addedNodeId, OrderedPropertyValues.ofUndefined( propertyValue ) );
}
Expand Down
Expand Up @@ -22,26 +22,22 @@
import org.apache.lucene.document.Document;

import java.io.IOException;
import java.util.Arrays;

import org.neo4j.kernel.api.StatementConstants;
import org.neo4j.kernel.api.exceptions.KernelException;
import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
import org.neo4j.kernel.api.impl.schema.LuceneDocumentStructure;
import org.neo4j.kernel.api.index.PropertyAccessor;
import org.neo4j.kernel.api.properties.Property;
import org.neo4j.kernel.api.schema.OrderedPropertyValues;

public class CompositeDuplicateCheckingCollector extends DuplicateCheckingCollector
{
private final int[] propertyKeyIds;
private CompositeEntrySet actualValues;

public CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
CompositeDuplicateCheckingCollector( PropertyAccessor accessor, int[] propertyKeyIds )
{
super(accessor, -1);
super( accessor, StatementConstants.NO_SUCH_PROPERTY_KEY);
this.propertyKeyIds = propertyKeyIds;
actualValues = new CompositeEntrySet();
}

@Override
Expand All @@ -56,82 +52,6 @@ protected void doCollect( int doc ) throws IOException, KernelException, IndexEn
properties[i] = accessor.getProperty( nodeId, propertyKeyIds[i] );
values[i] = properties[i].value();
}

// We either have to find the first conflicting entry set element,
// or append one for the property we just fetched:
CompositeEntrySet current = actualValues;
scan:
do
{
for ( int i = 0; i < CompositeEntrySet.INCREMENT; i++ )
{
Object[] currentValues = current.values[i];

if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
{
current.values[i] = values;
current.nodeId[i] = nodeId;
if ( i == CompositeEntrySet.INCREMENT - 1 )
{
current.next = new CompositeEntrySet();
}
break scan;
}
else if ( propertyValuesEqual( properties, currentValues ) )
{
throw new IndexEntryConflictException( current.nodeId[i], nodeId,
OrderedPropertyValues.ofUndefined( currentValues ) );
}
}
current = current.next;
}
while ( current != null );
}

private boolean propertyValuesEqual( Property[] properties, Object[] values )
{
if ( properties.length != values.length )
{
return false;
}
for ( int i = 0; i < properties.length; i++ )
{
if ( !properties[i].valueEquals( values[i] ) )
{
return false;
}
}
return true;
}

@Override
public boolean needsScores()
{
return false;
}

public void reset()
{
actualValues = new CompositeEntrySet();
}

/**
* A small struct of arrays of nodeId + array of property values, with a next pointer.
* Should exhibit fairly fast linear iteration, small memory overhead and dynamic growth.
* <p>
* NOTE: Must always call reset() before use!
*/
private static class CompositeEntrySet
{
static final int INCREMENT = 10000;

Object[][] values = new Object[INCREMENT][];
long[] nodeId = new long[INCREMENT];
CompositeEntrySet next;

CompositeEntrySet()
{
Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
}
duplicateCheckStrategy.checkForDuplicate( properties, values, nodeId );
}
}
@@ -0,0 +1,274 @@
/*
* Copyright (c) 2002-2017 "Neo Technology,"
* Network Engine for Objects in Lund AB [http://neotechnology.com]
*
* This file is part of Neo4j.
*
* Neo4j is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.neo4j.kernel.api.impl.schema.verification;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import org.neo4j.kernel.api.StatementConstants;
import org.neo4j.kernel.api.exceptions.index.IndexEntryConflictException;
import org.neo4j.kernel.api.properties.Property;
import org.neo4j.kernel.api.schema.OrderedPropertyValues;

import static java.lang.Math.max;
import static java.lang.Math.min;

/**
 * Base class for the strategies used to detect duplicates while verifying value uniqueness during
 * constraint creation.
 *
 * Each particular strategy determines how the uniqueness check is done and how already seen values are
 * accumulated and stored, in order to keep check time and resource consumption low.
 */
abstract class DuplicateCheckStrategy
{
    /**
     * Check uniqueness of multiple properties that belong to a node with provided node id
     * @param properties node properties
     * @param values property values, in the same order as {@code properties}
     * @param nodeId checked node id
     * @throws IndexEntryConflictException if the same combination of values was already seen for another node
     */
    abstract void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
            throws IndexEntryConflictException;

    /**
     * Check uniqueness of single property that belong to a node with provided node id.
     * @param property node property
     * @param value property value
     * @param nodeId checked node id
     * @throws IndexEntryConflictException if the same value was already seen for another node
     */
    abstract void checkForDuplicate( Property property, Object value, long nodeId ) throws IndexEntryConflictException;

    /**
     * Compare properties against raw values position by position using {@link Property#valueEquals(Object)}.
     *
     * @param properties properties to compare
     * @param values raw values to compare against
     * @return {@code true} if both arrays have the same length and every property equals the value
     * at the same index
     */
    private static boolean propertyValuesEqual( Property[] properties, Object[] values )
    {
        if ( properties.length != values.length )
        {
            return false;
        }
        for ( int i = 0; i < properties.length; i++ )
        {
            if ( !properties[i].valueEquals( values[i] ) )
            {
                return false;
            }
        }
        return true;
    }

    /**
     * Duplicate check strategy that uses plain hash map. Should be optimal for small amount of entries.
     */
    static class MapDuplicateCheckStrategy extends DuplicateCheckStrategy
    {
        // Key is either a Property (single property check) or a PropertyValues (composite check),
        // value is the id of the node that was seen with that key.
        private final Map<Object,Long> valueNodeIdMap;

        /**
         * @param expectedNumberOfEntries used as the initial capacity of the backing hash map
         */
        MapDuplicateCheckStrategy( int expectedNumberOfEntries )
        {
            this.valueNodeIdMap = new HashMap<>( expectedNumberOfEntries );
        }

        @Override
        public void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
                throws IndexEntryConflictException
        {
            // put returns the node id previously registered for an equal key, or null if there was none
            Long previousNodeId = valueNodeIdMap.put( new PropertyValues( properties, values ), nodeId );
            if ( previousNodeId != null )
            {
                throw new IndexEntryConflictException( previousNodeId, nodeId,
                        OrderedPropertyValues.ofUndefined( values ) );
            }
        }

        @Override
        void checkForDuplicate( Property property, Object value, long nodeId ) throws IndexEntryConflictException
        {
            // NOTE(review): relies on Property implementing value based equals/hashCode - confirm
            Long previousNodeId = valueNodeIdMap.put( property, nodeId );
            if ( previousNodeId != null )
            {
                throw new IndexEntryConflictException( previousNodeId, nodeId, value );
            }
        }

        /**
         * Map key for a composite entry: hashes on the properties and compares
         * its properties against the raw values of the other key.
         */
        private static class PropertyValues
        {
            private final Property[] properties;
            private final Object[] values;

            PropertyValues( Property[] properties, Object[] values )
            {
                this.properties = properties;
                this.values = values;
            }

            @Override
            public boolean equals( Object o )
            {
                if ( this == o )
                {
                    return true;
                }
                if ( o == null || getClass() != o.getClass() )
                {
                    return false;
                }

                PropertyValues that = (PropertyValues) o;
                return propertyValuesEqual( properties, that.values );
            }

            @Override
            public int hashCode()
            {
                int result = 0;
                for ( Property property : properties )
                {
                    result = 31 * (result + property.hashCode());
                }
                return result;
            }
        }
    }

    /**
     * Strategy that uses arrays to store entries and uses hash codes to split those entries over different buckets.
     * Number of buckets and size of entries block are dynamic and evaluated based on expected number of duplicates.
     */
    static class BucketsDuplicateCheckStrategy extends DuplicateCheckStrategy
    {
        private static final int BASE_ENTRY_SIZE = 1000;
        private static final int DEFAULT_BUCKETS = 10;
        static final int BUCKET_STRATEGY_ENTRIES_THRESHOLD = BASE_ENTRY_SIZE * DEFAULT_BUCKETS;

        private static final int MAX_NUMBER_OF_BUCKETS = 100;
        private final int numberOfBuckets;
        // One chain of entry sets per bucket, created lazily on first use of the bucket
        private final EntrySet[] actualValues;
        private final int entrySetSize;

        BucketsDuplicateCheckStrategy()
        {
            this( BUCKET_STRATEGY_ENTRIES_THRESHOLD );
        }

        /**
         * @param expectedNumberOfEntries expected number of entries to check; determines the number of buckets
         * (between 1 and {@link #MAX_NUMBER_OF_BUCKETS}) and the size of each entry block (at least 100)
         */
        BucketsDuplicateCheckStrategy( int expectedNumberOfEntries )
        {
            // Lower bound of 1 guards against a non-positive bucket count for negative expectations
            numberOfBuckets = max( 1, min( MAX_NUMBER_OF_BUCKETS, (expectedNumberOfEntries / BASE_ENTRY_SIZE) + 1 ) );
            actualValues = new EntrySet[numberOfBuckets];
            entrySetSize = max( 100, BUCKET_STRATEGY_ENTRIES_THRESHOLD / numberOfBuckets );
        }

        @Override
        public void checkForDuplicate( Property[] properties, Object[] values, long nodeId )
                throws IndexEntryConflictException
        {
            // Deep hash so that array typed values are hashed by content and not by identity,
            // otherwise equal arrays could end up in different buckets and never be compared
            EntrySet first = bucketEntrySet( Arrays.deepHashCode( values ), entrySetSize );

            // We either have to find the first conflicting entry set element,
            // or append one for the properties we just fetched:
            for ( EntrySet current = first; current != null; current = current.next )
            {
                for ( int i = 0; i < entrySetSize; i++ )
                {
                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
                    {
                        // First free slot: no conflict found among everything stored before it
                        current.value[i] = values;
                        current.nodeId[i] = nodeId;
                        if ( i == entrySetSize - 1 )
                        {
                            // Block is now full, chain a new one for subsequent entries
                            current.next = new EntrySet( entrySetSize );
                        }
                        return;
                    }

                    Object[] currentValues = (Object[]) current.value[i];
                    if ( propertyValuesEqual( properties, currentValues ) )
                    {
                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, currentValues );
                    }
                }
            }
        }

        @Override
        void checkForDuplicate( Property property, Object propertyValue, long nodeId ) throws IndexEntryConflictException
        {
            EntrySet first = bucketEntrySet( valueHashCode( propertyValue ), entrySetSize );

            // We either have to find the first conflicting entry set element,
            // or append one for the property we just fetched:
            for ( EntrySet current = first; current != null; current = current.next )
            {
                for ( int i = 0; i < entrySetSize; i++ )
                {
                    if ( current.nodeId[i] == StatementConstants.NO_SUCH_NODE )
                    {
                        // First free slot: no conflict found among everything stored before it
                        current.value[i] = propertyValue;
                        current.nodeId[i] = nodeId;
                        if ( i == entrySetSize - 1 )
                        {
                            // Block is now full, chain a new one for subsequent entries
                            current.next = new EntrySet( entrySetSize );
                        }
                        return;
                    }

                    Object value = current.value[i];
                    if ( property.valueEquals( value ) )
                    {
                        throw new IndexEntryConflictException( current.nodeId[i], nodeId, value );
                    }
                }
            }
        }

        /**
         * Content based hash code of a single value: arrays (including primitive arrays) are hashed
         * by their elements and {@code null} is tolerated.
         */
        private static int valueHashCode( Object value )
        {
            return Arrays.deepHashCode( new Object[]{value} );
        }

        /**
         * Find the first entry set of the bucket the hash code belongs to, creating it if the bucket is empty.
         *
         * @param hashCode hash code of the checked value(s)
         * @param entrySetSize size of the entry set to create for an empty bucket
         * @return first entry set in the chain of the selected bucket, never {@code null}
         */
        private EntrySet bucketEntrySet( int hashCode, int entrySetSize )
        {
            // Mask the sign bit instead of Math.abs: Math.abs( Integer.MIN_VALUE ) is still negative
            // and would produce a negative array index
            int bucket = (hashCode & Integer.MAX_VALUE) % numberOfBuckets;
            EntrySet current = actualValues[bucket];
            if ( current == null )
            {
                current = new EntrySet( entrySetSize );
                actualValues[bucket] = current;
            }
            return current;
        }

        /**
         * Fixed size block of (value, node id) slots with a pointer to the next block.
         * A slot whose node id is {@link StatementConstants#NO_SUCH_NODE} is empty.
         */
        private static class EntrySet
        {
            final Object[] value;
            final long[] nodeId;
            EntrySet next;

            EntrySet( int entrySize )
            {
                value = new Object[entrySize];
                nodeId = new long[entrySize];
                Arrays.fill( nodeId, StatementConstants.NO_SUCH_NODE );
            }
        }
    }
}

0 comments on commit 6d000c3

Please sign in to comment.