From 8813ab4daf0a078870ff13c56d1e6c2a89357c48 Mon Sep 17 00:00:00 2001
From: Davide Grohmann
Date: Mon, 18 Apr 2016 11:45:51 +0200
Subject: [PATCH] Fix sampling of lucene index size

When we iterate over all the terms in the lucene index, lucene also
returns terms that have been deleted and are not in use anymore until
the next index compaction. This is not a huge problem per se for
computing index selectivity, but we were also using these skewed
numbers for the index size.

This commit fixes the index size by asking lucene for the number of
documents currently present in the index.
---
 .../impl/index/LuceneIndexAccessorReader.java  | 10 ++-
 .../index/LuceneIndexAccessorReaderTest.java   |  2 +
 .../index/IndexSamplingIntegrationTest.java    | 65 ++++++++++++++++++-
 3 files changed, 72 insertions(+), 5 deletions(-)

diff --git a/community/lucene-index/src/main/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReader.java b/community/lucene-index/src/main/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReader.java
index 64b73f40d2eb..9b3a66fb80e2 100644
--- a/community/lucene-index/src/main/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReader.java
+++ b/community/lucene-index/src/main/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReader.java
@@ -66,7 +66,8 @@ class LuceneIndexAccessorReader implements IndexReader
     public long sampleIndex( DoubleLong.Out result ) throws IndexNotFoundKernelException
     {
         NonUniqueIndexSampler sampler = new NonUniqueIndexSampler( bufferSizeLimit );
-        try ( TermEnum terms = luceneIndexReader().terms() )
+        org.apache.lucene.index.IndexReader indexReader = luceneIndexReader();
+        try ( TermEnum terms = indexReader.terms() )
         {
             while ( terms.next() )
             {
@@ -86,7 +87,12 @@ public long sampleIndex( DoubleLong.Out result ) throws IndexNotFoundKernelExcep
             throw new RuntimeException( e );
         }

-        return sampler.result( result );
+        sampler.result( result );
+
+        // Do not return the count from the sampler here: when traversing all terms, lucene also counts the
+        // logically deleted ones that will only be physically removed by the next compaction. The index size can
+        // be computed exactly by asking lucene for the current number of documents in the index.
+        return indexReader.numDocs();
     }

     @Override
diff --git a/community/lucene-index/src/test/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReaderTest.java b/community/lucene-index/src/test/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReaderTest.java
index 8cef7adac27f..0f3250fcd684 100644
--- a/community/lucene-index/src/test/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReaderTest.java
+++ b/community/lucene-index/src/test/java/org/neo4j/kernel/api/impl/index/LuceneIndexAccessorReaderTest.java
@@ -87,6 +87,7 @@ public void shouldProvideTheIndexUniqueValuesForAnIndexWithDuplicates() throws E
                 new Term( "string", "aaa" ),
                 new Term( "string", "ccc" )
         );
+        when( reader.numDocs() ).thenReturn( 3 );

         // When
         final DoubleLongRegister output = Registers.newDoubleLongRegister();
@@ -108,6 +109,7 @@ public void shouldSkipTheNonNodeIdKeyEntriesWhenCalculatingIndexUniqueValues() t
                 new Term( NODE_ID_KEY, "aaa" ), // <- this should be ignored
                 new Term( "string", "bbb" )
         );
+        when( reader.numDocs() ).thenReturn( 1 );

         // When

diff --git a/community/neo4j/src/test/java/org/neo4j/index/IndexSamplingIntegrationTest.java b/community/neo4j/src/test/java/org/neo4j/index/IndexSamplingIntegrationTest.java
index 03a9eb6e1a30..53ba8d6016b0 100644
--- a/community/neo4j/src/test/java/org/neo4j/index/IndexSamplingIntegrationTest.java
+++ b/community/neo4j/src/test/java/org/neo4j/index/IndexSamplingIntegrationTest.java
@@ -37,6 +37,7 @@
 import org.neo4j.kernel.impl.store.counts.CountsTracker;
 import org.neo4j.kernel.impl.store.counts.keys.CountsKeyFactory;
 import org.neo4j.kernel.impl.store.counts.keys.IndexSampleKey;
+import org.neo4j.kernel.impl.store.counts.keys.IndexStatisticsKey;
 import org.neo4j.register.Register.DoubleLongRegister;
 import org.neo4j.register.Registers;
 import org.neo4j.test.TargetDirectory;
@@ -58,6 +59,7 @@ public class IndexSamplingIntegrationTest
     public void shouldSampleNotUniqueIndex() throws Throwable
     {
         GraphDatabaseService db = null;
+        long deletedNodes = 0;
         try
         {
             // Given
@@ -82,6 +84,17 @@ public void shouldSampleNotUniqueIndex() throws Throwable
                     db.createNode( label ).setProperty( property, names[i % names.length] );
                     tx.success();
                 }
+
+            }
+
+            try ( Transaction tx = db.beginTx() )
+            {
+                for ( int i = 0; i < (nodes / 10) ; i++ )
+                {
+                    db.findNodes( label, property, names[i % names.length] ).next().delete();
+                    deletedNodes++;
+                    tx.success();
+                }
             }
         }
         finally
@@ -96,15 +109,23 @@ public void shouldSampleNotUniqueIndex() throws Throwable
         triggerIndexResamplingOnNextStartup();

         // Then
+
+        // sampling will also count the deleted nodes until the next lucene compaction
         DoubleLongRegister register = fetchIndexSamplingValues( db );
         assertEquals( names.length, register.readFirst() );
         assertEquals( nodes, register.readSecond() );
+
+        // but the deleted nodes should not be counted in the index size value
+        DoubleLongRegister indexSizeRegister = fetchIndexSizeValues( db );
+        assertEquals( 0, indexSizeRegister.readFirst() );
+        assertEquals( nodes - deletedNodes, indexSizeRegister.readSecond() );
     }

     @Test
     public void shouldSampleUniqueIndex() throws Throwable
     {
         GraphDatabaseService db = null;
+        long deletedNodes = 0;
         try
         {
             // Given
@@ -123,6 +144,19 @@ public void shouldSampleUniqueIndex() throws Throwable
                     tx.success();
                 }
             }
+
+            try ( Transaction tx = db.beginTx() )
+            {
+                for ( int i = 0; i < nodes; i++ )
+                {
+                    if ( i % 10 == 0 )
+                    {
+                        deletedNodes++;
+                        db.findNode( label, property, "" + i ).delete();
+                        tx.success();
+                    }
+                }
+            }
         }
         finally
         {
@@ -136,9 +170,13 @@ public void shouldSampleUniqueIndex() throws Throwable
         triggerIndexResamplingOnNextStartup();

         // Then
-        DoubleLongRegister register = fetchIndexSamplingValues( db );
-        assertEquals( nodes, register.readFirst() );
-        assertEquals( nodes, register.readSecond() );
+        DoubleLongRegister indexSampleRegister = fetchIndexSamplingValues( db );
+        assertEquals( nodes - deletedNodes, indexSampleRegister.readFirst() );
+        assertEquals( nodes - deletedNodes, indexSampleRegister.readSecond() );
+
+        DoubleLongRegister indexSizeRegister = fetchIndexSizeValues( db );
+        assertEquals( 0, indexSizeRegister.readFirst() );
+        assertEquals( nodes - deletedNodes, indexSizeRegister.readSecond() );
     }

     private DoubleLongRegister fetchIndexSamplingValues( GraphDatabaseService db )
@@ -162,6 +200,27 @@ private DoubleLongRegister fetchIndexSamplingValues( GraphDatabaseService db )
         }
     }

+    private DoubleLongRegister fetchIndexSizeValues( GraphDatabaseService db )
+    {
+        try
+        {
+            // Then
+            db = new TestGraphDatabaseFactory().newEmbeddedDatabase( testDirectory.graphDbDir().getAbsolutePath() );
+            @SuppressWarnings( "deprecation" )
+            GraphDatabaseAPI api = (GraphDatabaseAPI) db;
+            CountsTracker countsTracker = api.getDependencyResolver().resolveDependency( NeoStore.class ).getCounts();
+            IndexStatisticsKey key = CountsKeyFactory.indexStatisticsKey( 0, 0 ); // cheating a bit...
+            return countsTracker.get( key, Registers.newDoubleLongRegister() );
+        }
+        finally
+        {
+            if ( db != null )
+            {
+                db.shutdown();
+            }
+        }
+    }
+
     private void triggerIndexResamplingOnNextStartup()
     {
         // Trigger index resampling on next at startup
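The snippet below is not part of the patch; it is a minimal standalone sketch of the lucene behaviour the commit message describes, assuming the Lucene 3.x API these classes compile against. Only the field name "string" and the values mirror the ones used in LuceneIndexAccessorReaderTest; the class name, the RAMDirectory and the writer setup are invented for illustration. It shows why traversing the term dictionary over-counts after deletes, while IndexReader.numDocs() already excludes the pending deletes and is therefore the right value to report as index size.

import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class DeletedTermsSketch
{
    public static void main( String[] args ) throws Exception
    {
        RAMDirectory directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter( directory,
                new IndexWriterConfig( Version.LUCENE_36, new KeywordAnalyzer() ) );
        for ( String value : new String[]{"aaa", "bbb", "ccc"} )
        {
            Document doc = new Document();
            doc.add( new Field( "string", value, Field.Store.NO, Field.Index.NOT_ANALYZED ) );
            writer.addDocument( doc );
        }
        // Logically delete one document, but do not merge/optimize the index:
        // its term stays in the term dictionary until the next compaction.
        writer.deleteDocuments( new Term( "string", "bbb" ) );
        writer.close();

        IndexReader reader = IndexReader.open( directory );
        long termCount = 0;
        TermEnum terms = reader.terms();
        while ( terms.next() )
        {
            termCount++;
        }
        terms.close();

        // Traversing the terms still sees all three values, i.e. the skewed count the
        // sampler used to return, whereas numDocs() excludes the pending delete.
        System.out.println( "terms seen: " + termCount );        // 3
        System.out.println( "numDocs():  " + reader.numDocs() ); // 2
        System.out.println( "maxDoc():   " + reader.maxDoc() );  // 3
        reader.close();
    }
}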