Skip to content

Commit

Permalink
Prefix and seek queries regression
Browse files Browse the repository at this point in the history
Switch to the latest available Lucene 5.5 version.
Introduce a custom PrefixMultiTermsQuery and use it instead of the default PrefixQuery to avoid expensive internal automaton creation.
Switch posting format to BlockTreeOrdsPostingsFormat by default.

Both optimisations are pluggable and can be switched off by a corresponding feature toggle.
Please note: switching the posting format will require index regeneration.
  • Loading branch information
MishaDemianenko committed Feb 26, 2016
1 parent 1d7025e commit 306c259
Show file tree
Hide file tree
Showing 52 changed files with 150 additions and 32 deletions.
1 change: 1 addition & 0 deletions community/bolt/LICENSES.txt
Expand Up @@ -5,6 +5,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
Apache Software License, Version 2.0
Apache Commons Lang
ConcurrentLinkedHashMap
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/bolt/NOTICE.txt
Expand Up @@ -28,6 +28,7 @@ Third-party licenses
Apache Software License, Version 2.0
Apache Commons Lang
ConcurrentLinkedHashMap
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/consistency-check-legacy/LICENSES.txt
Expand Up @@ -4,6 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
------------------------------------------------------------------------------
Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/consistency-check-legacy/NOTICE.txt
Expand Up @@ -27,6 +27,7 @@ Third-party licenses

Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/consistency-check/LICENSES.txt
Expand Up @@ -4,6 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
------------------------------------------------------------------------------
Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/consistency-check/NOTICE.txt
Expand Up @@ -27,6 +27,7 @@ Third-party licenses

Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/cypher/cypher/LICENSES.txt
Expand Up @@ -4,6 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
------------------------------------------------------------------------------
Apache Software License, Version 2.0
ConcurrentLinkedHashMap
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/cypher/cypher/NOTICE.txt
Expand Up @@ -27,6 +27,7 @@ Third-party licenses

Apache Software License, Version 2.0
ConcurrentLinkedHashMap
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/import-tool/LICENSES.txt
Expand Up @@ -4,6 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
------------------------------------------------------------------------------
Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/import-tool/NOTICE.txt
Expand Up @@ -27,6 +27,7 @@ Third-party licenses

Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
2 changes: 1 addition & 1 deletion community/lucene-index-upgrade/pom.xml
Expand Up @@ -13,7 +13,7 @@
<license-text.header>GPL-3-header.txt</license-text.header>
<licensing.prepend.text>notice-gpl-prefix.txt</licensing.prepend.text>
<lucene4.version>4.10.4</lucene4.version>
<lucene5.version>5.4.0</lucene5.version>
<lucene5.version>5.5.0</lucene5.version>
</properties>

<modelVersion>4.0.0</modelVersion>
Expand Down
Expand Up @@ -66,8 +66,8 @@ default void migrated( String name ) {}
private static final String LIBRARY_DIRECTORY = "lib";
private static final String RESOURCE_SEPARATOR = "/";
private static final String LUCENE4_CORE_JAR_NAME = "lucene-core-4.10.4.jar";
private static final String LUCENE5_CORE_JAR_NAME = "lucene-core-5.4.0.jar";
private static final String LUCENE5_BACKWARD_CODECS_NAME = "lucene-backward-codecs-5.4.0.jar";
private static final String LUCENE5_CORE_JAR_NAME = "lucene-core-5.5.0.jar";
private static final String LUCENE5_BACKWARD_CODECS_NAME = "lucene-backward-codecs-5.5.0.jar";
private static final String SEGMENTS_FILE_NAME_PREFIX = "segments";

private final Path indexRootPath;
Expand Down
1 change: 1 addition & 0 deletions community/lucene-index/LICENSES.txt
Expand Up @@ -4,6 +4,7 @@ libraries. For an overview of the licenses see the NOTICE.txt file.
------------------------------------------------------------------------------
Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
1 change: 1 addition & 0 deletions community/lucene-index/NOTICE.txt
Expand Up @@ -27,6 +27,7 @@ Third-party licenses

Apache Software License, Version 2.0
Apache Commons Lang
Lucene codecs
Lucene Common Analyzers
Lucene Core
Lucene Memory
Expand Down
5 changes: 5 additions & 0 deletions community/lucene-index/pom.xml
Expand Up @@ -91,6 +91,11 @@ the relevant Commercial Agreement.
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-codecs</artifactId>
</dependency>


<dependency>
<groupId>junit</groupId>
Expand Down
Expand Up @@ -377,7 +377,7 @@ private IndexSearcher searcher( boolean allowRefreshSearcher )
try
{
IndexReader newReader = this.reader == null ?
DirectoryReader.open( this.writer, true ) :
DirectoryReader.open( this.writer ) :
DirectoryReader.openIfChanged( (DirectoryReader) this.reader );
if ( newReader == this.reader )
{
Expand Down
Expand Up @@ -226,7 +226,7 @@ private IndexReference refreshSearcher( IndexReference searcher )
// TODO: this cast should always succeed, maybe check nonetheless?
DirectoryReader reader = (DirectoryReader) searcher.getSearcher().getIndexReader();
IndexWriter writer = searcher.getWriter();
IndexReader reopened = DirectoryReader.openIfChanged( reader, writer, true );
IndexReader reopened = DirectoryReader.openIfChanged( reader, writer );
if ( reopened != null )
{
IndexSearcher newSearcher = newIndexSearcher( searcher.getIdentifier(), reopened );
Expand Down Expand Up @@ -305,7 +305,7 @@ synchronized IndexReference syncGetIndexSearcher( IndexIdentifier identifier )
if ( searcher == null )
{
IndexWriter writer = newIndexWriter( identifier );
IndexReader reader = DirectoryReader.open( writer, true );
IndexReader reader = DirectoryReader.open( writer );
IndexSearcher indexSearcher = newIndexSearcher( identifier, reader );
searcher = new IndexReference( identifier, indexSearcher, writer );
indexSearchers.put( identifier, searcher );
Expand Down
Expand Up @@ -19,6 +19,9 @@
*/
package org.neo4j.kernel.api.impl.index;

import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene54.Lucene54Codec;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;

Expand All @@ -39,6 +42,8 @@ public final class IndexWriterConfigs
FeatureToggles.getDouble( IndexWriterConfigs.class, "nocfs.ratio", 1.0 );
private static final double MERGE_POLICY_MIN_MERGE_MB =
FeatureToggles.getDouble( IndexWriterConfigs.class, "min.merge", 0.1 );
private static final boolean CODEC_BLOCK_TREE_ORDS_POSTING_FORMAT =
FeatureToggles.flag( IndexWriterConfigs.class, "block.tree.ords.posting.format", true );

private static final int POPULATION_RAM_BUFFER_SIZE_MB =
FeatureToggles.getInteger( IndexWriterConfigs.class, "population.ram.buffer.size", 50 );
Expand All @@ -55,6 +60,16 @@ public static IndexWriterConfig standard()
writerConfig.setMaxBufferedDocs( MAX_BUFFERED_DOCS );
writerConfig.setIndexDeletionPolicy( new MultipleBackupDeletionPolicy() );
writerConfig.setUseCompoundFile( true );
writerConfig.setCodec(new Lucene54Codec()
{
@Override
public PostingsFormat getPostingsFormatForField( String field )
{
PostingsFormat postingFormat = super.getPostingsFormatForField( field );
return CODEC_BLOCK_TREE_ORDS_POSTING_FORMAT ? new BlockTreeOrdsPostingsFormat() :
postingFormat;
}
});

LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
mergePolicy.setNoCFSRatio( MERGE_POLICY_NO_CFS_RATIO );
Expand Down
Expand Up @@ -560,28 +560,17 @@ public int freq() throws IOException
}

@Override
public int docID()
{
throw new UnsupportedOperationException();
}

@Override
public int nextDoc() throws IOException
public DocIdSetIterator iterator()
{
throw new UnsupportedOperationException();
}

@Override
public int advance( int target ) throws IOException
public int docID()
{
throw new UnsupportedOperationException();
}

@Override
public long cost()
{
return scores.length;
}
}

private static final class DocsInIndexOrderIterator extends AbstractIndexHits<Document>
Expand Down
Expand Up @@ -51,7 +51,7 @@ public IndexPartition( File partitionFolder, Directory directory, IndexWriterCon
this.indexFolder = partitionFolder;
this.directory = directory;
this.indexWriter = new IndexWriter( directory, writerConfig );
this.searcherManager = new SearcherManager( indexWriter, true, new SearcherFactory() );
this.searcherManager = new SearcherManager( indexWriter, new SearcherFactory() );
}

public IndexWriter getIndexWriter()
Expand Down
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.FilteredTermsEnum;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
Expand All @@ -32,24 +33,34 @@
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.StringHelper;

import java.io.IOException;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.Map;

import org.neo4j.unsafe.impl.internal.dragons.FeatureToggles;

import static org.apache.lucene.document.Field.Store.YES;

public class LuceneDocumentStructure
{
private static final boolean USE_LUCENE_STANDARD_PREFIX_QUERY =
FeatureToggles.flag( LuceneDocumentStructure.class, "lucene.standard.prefix.query", false );

public static final String NODE_ID_KEY = "id";

// Absolute hard maximum length for a term, in bytes once
Expand Down Expand Up @@ -139,7 +150,7 @@ public static Query newRangeSeekByStringQuery( String lower, boolean includeLowe
builder.add( termRangeQuery, BooleanClause.Occur.SHOULD );
return builder.build();
}
return termRangeQuery;
return new ConstantScoreQuery( termRangeQuery );
}

public static Query newWildCardStringQuery( String searchFor )
Expand All @@ -150,9 +161,12 @@ public static Query newWildCardStringQuery( String searchFor )
return new WildcardQuery( term );
}

public static PrefixQuery newRangeSeekByPrefixQuery( String prefix )
public static Query newRangeSeekByPrefixQuery( String prefix )
{
return new PrefixQuery( new Term( ValueEncoding.String.key(), prefix ) );
Term term = new Term( ValueEncoding.String.key(), prefix );
MultiTermQuery prefixQuery = USE_LUCENE_STANDARD_PREFIX_QUERY ? new PrefixQuery( term ) :
new PrefixMultiTermsQuery( term );
return new ConstantScoreQuery( prefixQuery );
}

public static Term newTermForChangeOrRemove( long nodeId )
Expand Down Expand Up @@ -188,6 +202,54 @@ public static TermsEnum originalTerms( Terms terms, String fieldKey ) throws IOE
: termsEnum;
}

/**
 * Simple implementation of a prefix query that mimics the old Lucene way of handling
 * prefix queries. According to benchmarks this implementation is faster than
 * {@link org.apache.lucene.search.PrefixQuery} because we do not construct an
 * automaton, which is extremely expensive.
 */
private static class PrefixMultiTermsQuery extends MultiTermQuery
{
    private final Term term;

    PrefixMultiTermsQuery( Term term )
    {
        super( term.field() );
        this.term = term;
    }

    @Override
    protected TermsEnum getTermsEnum( Terms terms, AttributeSource atts ) throws IOException
    {
        // An empty prefix matches every term, so no filtering enum is required.
        return term.bytes().length == 0 ? terms.iterator()
                                        : new PrefixTermsEnum( terms.iterator(), term.bytes() );
    }

    @Override
    public String toString( String field )
    {
        return getClass().getSimpleName() + ", term:" + term + ", field:" + field;
    }

    @Override
    public boolean equals( Object o )
    {
        // MultiTermQuery#equals compares only class, boost, field and rewrite method; include
        // the term so queries for different prefixes on the same field are never considered
        // equal (e.g. by query caches). super.equals guarantees o is a PrefixMultiTermsQuery.
        return super.equals( o ) && term.equals( ((PrefixMultiTermsQuery) o).term );
    }

    @Override
    public int hashCode()
    {
        return 31 * super.hashCode() + term.hashCode();
    }

    /**
     * Accepts every term starting with the given prefix. Terms are enumerated in sorted
     * order, so after the initial seek the first non-matching term ends the enumeration.
     */
    private static class PrefixTermsEnum extends FilteredTermsEnum
    {
        private final BytesRef prefix;

        PrefixTermsEnum( TermsEnum termEnum, BytesRef prefix )
        {
            super( termEnum );
            this.prefix = prefix;
            // Skip directly to the first candidate term instead of scanning from the start.
            setInitialSeekTerm( this.prefix );
        }

        @Override
        protected AcceptStatus accept( BytesRef term ) throws IOException
        {
            return StringHelper.startsWith( term, prefix ) ? AcceptStatus.YES : AcceptStatus.END;
        }
    }
}

private static class DocWithId
{
private final Document document;
Expand Down
Expand Up @@ -42,8 +42,6 @@
import java.util.Map;
import java.util.function.Function;

import static com.sun.corba.se.spi.activation.IIOP_CLEAR_TEXT.value;

public class IndexReaderStub extends LeafReader
{

Expand Down

0 comments on commit 306c259

Please sign in to comment.