Completed 3.0.3 integration

closes #14
luceneplusplus · Dec 16, 2010 · a6cbc06 · a6cbc06
1 parent 213df3c
commit a6cbc06
Show file tree

Hide file tree

Showing 121 changed files with 2,888 additions and 8,584 deletions.
diff --git a/README.rst b/README.rst
@@ -1,7 +1,7 @@
 Lucene++
 ==========
 
-Welcome to lucene++ version **3.0.2**.
+Welcome to lucene++ version **3.0.3**.
 
 Lucene++ is an up to date C++ port of the popular Java Lucene library, a high-performance, full-featured text search engine.
 
@@ -32,7 +32,7 @@ We use `Waf <http://code.google.com/p/waf/>`_ to drive the build. Waf requires t
 To build the library the following commands should be issued::
 
     $ ./waf configure
-    $ ./waf build
+    $ ./waf --static build
 
 
 Additionally static builds of the following libraries are required for a successful build:
@@ -52,15 +52,15 @@ Build Instructions for Windows systems
 
 Open solution lucene++.sln located in the *msvc* folder into Visual Studio 2008 and build.
 
-**Note: "BOOST_ROOT" environment variable must be defined to point to the boost library directory (eg. c:\\boost_1_41_0)**
+**Note: "BOOST_ROOT" environment variable must be defined to point to the boost library directory (eg. c:\\boost_1_44_0)**
 
 
 To run unit test suite
 ----------------------
 
 lucene_tester is built using the `Boost Unit Test Framework <http://www.boost.org/doc/libs/1_44_0/libs/test/doc/html/index.html>`_ and is launched by the following command::
 
-    $ bin/default/lucene_tester --test_dir=./test/testfiles --show_progress=yes
+    $ bin/default/lucene_tester --show_progress=yes
 
 Other `command options <http://www.boost.org/doc/libs/1_44_0/libs/test/doc/html/utf/user-guide/runtime-config/reference.html>`_ can be supplied.
 
@@ -72,7 +72,6 @@ Acknowledgements
 - Jamie Kirkpatrick for cross-platform and waf build support.
 
 - `Zlib <http://www.zlib.net>`_ Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
-- `UTF8-CPP <http://utfcpp.sourceforge.net/>`_ Copyright 2006 Nemanja Trifunovic
 - `nedmalloc <http://sourceforge.net/projects/nedmalloc/>`_ Copyright 2005-2006 Niall Douglas
 - md5 Copyright (C) 1999, 2000, 2002 Aladdin Enterprises
 - `Unicode character properties (guniprop) <http://library.gnome.org/devel/glib/>`_ Copyright (C) 1999 Tom Tromey, Copyright (C) 2000 Red Hat, Inc.
diff --git a/analysis/BaseCharFilter.cpp b/analysis/BaseCharFilter.cpp
@@ -11,6 +11,7 @@ namespace Lucene
 {
     BaseCharFilter::BaseCharFilter(CharStreamPtr in) : CharFilter(in)
     {
+        size = 0; // no offset corrections recorded yet; addOffCorrectMap() increments this per entry
     }
 
     BaseCharFilter::~BaseCharFilter()
@@ -19,42 +20,52 @@ namespace Lucene
 
     int32_t BaseCharFilter::correct(int32_t currentOff)
     {
-        if (!pcmList || pcmList.empty())
+        if (!offsets || currentOff < offsets[0]) // nothing recorded, or currentOff is before the first recorded offset
             return currentOff;
-        for (int32_t i = pcmList.size() - 1; i >= 0; --i)
+
+        int32_t hi = size - 1; // index of the most recently added entry
+        if (currentOff >= offsets[hi]) // at or past the last recorded offset: apply the last diff
+            return currentOff + diffs[hi];
+
+        int32_t lo = 0;
+        int32_t mid = -1;
+
+        while (hi >= lo) // binary search over offsets[0..size-1]; assumes entries were added in ascending offset order - TODO confirm
         {
-            if (currentOff >=  pcmList[i]->off)
-                return currentOff + pcmList[i]->cumulativeDiff;
+            mid = MiscUtils::unsignedShift(lo + hi, 1); // presumably (lo + hi) / 2 via an unsigned shift - verify against MiscUtils
+            if (currentOff < offsets[mid])
+                hi = mid - 1;
+            else if (currentOff > offsets[mid])
+                lo = mid + 1;
+            else
+                return currentOff + diffs[mid]; // exact match on a recorded offset
         }
-        return currentOff;
+
+        if (currentOff < offsets[mid]) // no exact match: last probe lies after currentOff, so use the entry before it
+            return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
+        else // last probe lies before currentOff, so its diff applies
+            return currentOff + diffs[mid];
     }
 
     int32_t BaseCharFilter::getLastCumulativeDiff()
     {
-        return (!pcmList || pcmList.empty()) ? 0 : pcmList[pcmList.size() - 1]->cumulativeDiff;
+        return !offsets ? 0 : diffs[size - 1]; // 0 until the arrays exist; otherwise the most recently added diff
     }
 
     void BaseCharFilter::addOffCorrectMap(int32_t off, int32_t cumulativeDiff)
     {
-        if (!pcmList)
-            pcmList = Collection<OffCorrectMapPtr>::newInstance();
-        pcmList.add(newLucene<OffCorrectMap>(off, cumulativeDiff));
-    }
-
-    OffCorrectMap::OffCorrectMap(int32_t off, int32_t cumulativeDiff)
-    {
-        this->off = off;
-        this->cumulativeDiff = cumulativeDiff;
-    }
-
-    OffCorrectMap::~OffCorrectMap()
-    {
-    }
-
-    String OffCorrectMap::toString()
-    {
-        StringStream buffer;
-        buffer << L"(" << off << L"," << cumulativeDiff << L")";
-        return buffer.str();
+        if (!offsets) // first mapping: create both parallel arrays
+        {
+            offsets = IntArray::newInstance(64); // initial capacity of 64 entries
+            diffs = IntArray::newInstance(64);
+        }
+        else if (size == offsets.length()) // arrays are full: grow both before appending
+        {
+            offsets.resize(MiscUtils::getNextSize(offsets.length())); // new capacity is chosen by MiscUtils::getNextSize
+            diffs.resize(MiscUtils::getNextSize(diffs.length()));
+        }
+
+        offsets[size] = off; // append at the next free slot; diffs[i] pairs with offsets[i]
+        diffs[size++] = cumulativeDiff; // size is advanced only after both arrays are written
     }
 }
diff --git a/analysis/PerFieldAnalyzerWrapper.cpp b/analysis/PerFieldAnalyzerWrapper.cpp
@@ -6,6 +6,7 @@
 
 #include "LuceneInc.h"
 #include "PerFieldAnalyzerWrapper.h"
+#include "Fieldable.h"
 
 namespace Lucene
 {
@@ -56,6 +57,14 @@ namespace Lucene
         return analyzer->getPositionIncrementGap(fieldName);
     }
 
+    int32_t PerFieldAnalyzerWrapper::getOffsetGap(FieldablePtr field)
+    {
+        AnalyzerPtr analyzer(analyzerMap.get(field->name())); // analyzer registered under this field's name, if any
+        if (!analyzer)
+            analyzer = defaultAnalyzer; // nothing registered for this field: fall back to the default analyzer
+        return analyzer->getOffsetGap(field); // delegate to whichever analyzer was selected
+    }
+
     String PerFieldAnalyzerWrapper::toString()
     {
         return L"PerFieldAnalyzerWrapper(default=" + defaultAnalyzer->toString() + L")";

diff --git a/analysis/standard/StandardTokenizer.cpp b/analysis/standard/StandardTokenizer.cpp
@@ -140,16 +140,10 @@ namespace Lucene
         offsetAtt->setOffset(finalOffset, finalOffset);
     }
 
-    void StandardTokenizer::reset()
-    {
-        TokenStream::reset();
-        scanner->yyreset(input);
-    }
-
     void StandardTokenizer::reset(ReaderPtr input)
     {
         Tokenizer::reset(input);
-        reset();
+        scanner->reset(input); // re-point the scanner at the new reader; presumably StandardTokenizerImpl::reset - verify scanner's type
     }
 
     bool StandardTokenizer::isReplaceInvalidAcronym()

diff --git a/analysis/standard/StandardTokenizerImpl.cpp b/analysis/standard/StandardTokenizerImpl.cpp
@@ -315,6 +315,14 @@ namespace Lucene
         return _yychar;
     }
 
+    void StandardTokenizerImpl::reset(ReaderPtr r)
+    {
+        // reset to default buffer size, if buffer has grown
+        if (zzBuffer.length() > ZZ_BUFFERSIZE)
+            zzBuffer.resize(ZZ_BUFFERSIZE); // shrink zzBuffer back to ZZ_BUFFERSIZE elements
+        yyreset(r); // then hand the new reader to yyreset()
+    }
+
     void StandardTokenizerImpl::getText(TokenPtr t)
     {
         t->setTermBuffer(zzBuffer.get(), zzStartRead, zzMarkedPos - zzStartRead);

diff --git a/contrib/highlighter/SimpleSpanFragmenter.cpp b/contrib/highlighter/SimpleSpanFragmenter.cpp
@@ -68,7 +68,7 @@ namespace Lucene
             }
         }
 
-        bool isNewFrag = (offsetAtt->endOffset() >= (fragmentSize * currentNumFrags) && (textSize - offsetAtt->endOffset()) >= (fragmentSize >> 1));
+        bool isNewFrag = (offsetAtt->endOffset() >= (fragmentSize * currentNumFrags) && (textSize - offsetAtt->endOffset()) >= MiscUtils::unsignedShift(fragmentSize, 1));
 
         if (isNewFrag)
             ++currentNumFrags;

diff --git a/contrib/include/DutchStemFilter.h b/contrib/include/DutchStemFilter.h
@@ -17,6 +17,10 @@ namespace Lucene
     /// It supports a table of words that should not be stemmed at all.  The stemmer used can 
     /// be changed at runtime after the filter object is created (as long as it is a 
     /// {@link DutchStemmer}).
+    ///
+    /// NOTE: This stemmer does not implement the Snowball algorithm correctly, specifically in its 
+    /// handling of doubled consonants. It is recommended that you consider using the "Dutch" stemmer 
+    /// in the snowball package instead. This stemmer will likely be deprecated in a future release.
     class LPPAPI DutchStemFilter : public TokenFilter
     {
     public:

diff --git a/contrib/include/FrenchStemFilter.h b/contrib/include/FrenchStemFilter.h
@@ -17,6 +17,10 @@ namespace Lucene
     /// It supports a table of words that should not be stemmed at all.  The stemmer used can 
     /// be changed at runtime after the filter object is created (as long as it is a 
     /// {@link FrenchStemmer}).
+    ///
+    /// NOTE: This stemmer does not implement the Snowball algorithm correctly, especially where 
+    /// letter case is involved. It is recommended that you consider using the "French" stemmer 
+    /// in the snowball package instead. This stemmer will likely be deprecated in a future release.
     class LPPAPI FrenchStemFilter : public TokenFilter
     {
     public:

diff --git a/include/BaseCharFilter.h b/include/BaseCharFilter.h
@@ -13,9 +13,6 @@ namespace Lucene
 {
     /// Base utility class for implementing a {@link CharFilter}.  You subclass this, and then record mappings by 
     /// calling {@link #addOffCorrectMap}, and then invoke the correct method to correct an offset.
-    ///
-    /// NOTE: This class is not particularly efficient. For example, a new class instance is created for every call 
-    /// to {@link #addOffCorrectMap}, which is then appended to a private list.
     class LPPAPI BaseCharFilter : public CharFilter
     {
     public:
@@ -25,31 +22,17 @@ namespace Lucene
         LUCENE_CLASS(BaseCharFilter);
 
     protected:
-        Collection<OffCorrectMapPtr> pcmList;
+        IntArray offsets;
+        IntArray diffs;
+        int32_t size;
 
     protected:
-        /// Retrieve the corrected offset.  Note that this method is slow, if you correct positions far before the 
-        /// most recently added position, as it's a simple linear search backwards through all offset corrections 
-        /// added by {@link #addOffCorrectMap}.
+        /// Retrieve the corrected offset.
         virtual int32_t correct(int32_t currentOff);
 
         int32_t getLastCumulativeDiff();
         void addOffCorrectMap(int32_t off, int32_t cumulativeDiff);
     };
-
-    class LPPAPI OffCorrectMap : public LuceneObject
-    {
-    public:
-        OffCorrectMap(int32_t off, int32_t cumulativeDiff);
-        virtual ~OffCorrectMap();
-
-    public:
-        int32_t off;
-        int32_t cumulativeDiff;
-
-    public:
-        virtual String toString();
-    };
 }
 
 #endif
diff --git a/include/BitSet.h b/include/BitSet.h
@@ -48,10 +48,10 @@ namespace Lucene
         bool get(uint32_t bitIndex) const;
         bool fastGet(uint32_t bitIndex) const;
         int32_t nextSetBit(uint32_t fromIndex) const;
-        void andBitSet(BitSetPtr set);
-        void orBitSet(BitSetPtr set);
-        void xorBitSet(BitSetPtr set);
-        void andNotBitSet(BitSetPtr set);
+        void _and(BitSetPtr set);
+        void _or(BitSetPtr set);
+        void _xor(BitSetPtr set);
+        void andNot(BitSetPtr set);
         bool intersectsBitSet(BitSetPtr set) const;
         uint32_t cardinality();
         void resize(uint32_t size);

diff --git a/include/BitUtil.h b/include/BitUtil.h
@@ -24,51 +24,51 @@ namespace Lucene
 
     public:
         /// Returns the number of bits set in the long
-        static uint32_t pop(uint64_t x);
+        static int32_t pop(int64_t x);
 
         /// Returns the number of set bits in an array of longs.
-        static uint64_t pop_array(const uint64_t* A, int32_t wordOffset, int32_t numWords);
+        static int64_t pop_array(const int64_t* A, int32_t wordOffset, int32_t numWords);
 
         /// Returns the popcount or cardinality of the two sets after an intersection.  Neither array is modified.
-        static uint64_t pop_intersect(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
+        static int64_t pop_intersect(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);
 
         /// Returns the popcount or cardinality of the union of two sets.  Neither array is modified.
-        static uint64_t pop_union(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
+        static int64_t pop_union(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);
 
         /// Returns the popcount or cardinality of A & ~B.  Neither array is modified.
-        static uint64_t pop_andnot(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
+        static int64_t pop_andnot(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);
 
         /// Returns the popcount or cardinality of A ^ B.  Neither array is modified.
-        static uint64_t pop_xor(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
+        static int64_t pop_xor(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);
 
         /// Returns number of trailing zeros in a 64 bit long value.
-        static uint32_t ntz(uint64_t val);
+        static int32_t ntz(int64_t val);
 
         /// Returns number of trailing zeros in a 32 bit int value.
-        static uint32_t ntz(uint32_t val);
+        static int32_t ntz(int32_t val);
 
         /// Returns 0 based index of first set bit (only works for x!=0)
         /// This is an alternate implementation of ntz()
-        static uint32_t ntz2(uint64_t x);
+        static int32_t ntz2(int64_t x);
 
         /// Returns 0 based index of first set bit.
         /// This is an alternate implementation of ntz()
-        static uint32_t ntz3(uint64_t x);
+        static int32_t ntz3(int64_t x);
 
         /// Returns true if v is a power of two or zero.
-        static bool isPowerOfTwo(uint32_t v);
+        static bool isPowerOfTwo(int32_t v);
 
         /// Returns true if v is a power of two or zero.
-        static bool isPowerOfTwo(uint64_t v);
+        static bool isPowerOfTwo(int64_t v);
 
         /// Returns the next highest power of two, or the current value if it's already a power of two or zero.
-        static uint32_t nextHighestPowerOfTwo(uint32_t v);
+        static int32_t nextHighestPowerOfTwo(int32_t v);
 
         /// Returns the next highest power of two, or the current value if it's already a power of two or zero.
-        static uint64_t nextHighestPowerOfTwo(uint64_t v);
+        static int64_t nextHighestPowerOfTwo(int64_t v);
 
     protected:
-        inline static void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c); 
+        inline static void CSA(int64_t& h, int64_t& l, int64_t a, int64_t b, int64_t c); 
     };
 }
 

diff --git a/include/Directory.h b/include/Directory.h
@@ -91,6 +91,8 @@ namespace Lucene
         /// This is how locking "scopes" to the right index.
         virtual String getLockID();
 
+        virtual String toString();
+
         /// Copy contents of a directory src to a directory dest. If a file in src already exists in dest then the one 
         /// in dest will be blindly overwritten.  NOTE: the source directory cannot change while this method is running.
         /// Otherwise the results are undefined.

diff --git a/include/DirectoryReader.h b/include/DirectoryReader.h
@@ -49,14 +49,17 @@ namespace Lucene
         int32_t termInfosIndexDivisor;
 
         bool rollbackHasChanges;
-        SegmentInfosPtr rollbackSegmentInfos;
 
         Collection<SegmentReaderPtr> subReaders;
         Collection<int32_t> starts; // 1st docno for each segment
         MapStringByteArray normsCache;
         int32_t _maxDoc;
         int32_t _numDocs;
         bool _hasDeletions;
+
+        // Max version in index as of when we opened; this can be > our current segmentInfos version 
+        // in case we were opened on a past IndexCommit
+        int64_t maxIndexVersion;
 
     public:
         void _initialize(Collection<SegmentReaderPtr> subReaders);
@@ -353,6 +356,8 @@ namespace Lucene
         MapStringString userData;
 
     public:
+        virtual String toString();
+
         /// Returns true if this commit is an optimized index.
         virtual bool isOptimized();
 

diff --git a/include/DocIdSetIterator.h b/include/DocIdSetIterator.h
@@ -68,7 +68,7 @@ namespace Lucene
         ///
         /// NOTE: after the iterator has exhausted you should not call this method, as it may result in unpredicted 
         /// behaviour.
-           virtual int32_t advance(int32_t target) = 0;
+        virtual int32_t advance(int32_t target) = 0;
     };
 }