Skip to content

Commit

Permalink
Completed 3.0.3 integration
Browse files Browse the repository at this point in the history
closes #14
  • Loading branch information
alanw committed Dec 16, 2010
1 parent 213df3c commit a6cbc06
Show file tree
Hide file tree
Showing 121 changed files with 2,888 additions and 8,584 deletions.
9 changes: 4 additions & 5 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Lucene++
==========

Welcome to lucene++ version **3.0.2**.
Welcome to lucene++ version **3.0.3**.

Lucene++ is an up-to-date C++ port of the popular Java Lucene library, a high-performance, full-featured text search engine.

Expand Down Expand Up @@ -32,7 +32,7 @@ We use `Waf <http://code.google.com/p/waf/>`_ to drive the build. Waf requires t
To build the library the following commands should be issued::

$ ./waf configure
$ ./waf build
$ ./waf --static build


Additionally static builds of the following libraries are required for a successful build:
Expand All @@ -52,15 +52,15 @@ Build Instructions for Windows systems

Open the solution lucene++.sln, located in the *msvc* folder, in Visual Studio 2008 and build it.

**Note: "BOOST_ROOT" environment variable must be defined to point to the boost library directory (eg. c:\\boost_1_41_0)**
**Note: the "BOOST_ROOT" environment variable must be defined to point to the boost library directory (e.g. c:\\boost_1_44_0)**


To run unit test suite
----------------------

lucene_tester is built using the `Boost Unit Test Framework <http://www.boost.org/doc/libs/1_44_0/libs/test/doc/html/index.html>`_ and is launched by the following command::

$ bin/default/lucene_tester --test_dir=./test/testfiles --show_progress=yes
$ bin/default/lucene_tester --show_progress=yes

Other `command options <http://www.boost.org/doc/libs/1_44_0/libs/test/doc/html/utf/user-guide/runtime-config/reference.html>`_ can be supplied.

Expand All @@ -72,7 +72,6 @@ Acknowledgements
- Jamie Kirkpatrick for cross-platform and waf build support.

- `Zlib <http://www.zlib.net>`_ Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
- `UTF8-CPP <http://utfcpp.sourceforge.net/>`_ Copyright 2006 Nemanja Trifunovic
- `nedmalloc <http://sourceforge.net/projects/nedmalloc/>`_ Copyright 2005-2006 Niall Douglas
- md5 Copyright (C) 1999, 2000, 2002 Aladdin Enterprises
- `Unicode character properties (guniprop) <http://library.gnome.org/devel/glib/>`_ Copyright (C) 1999 Tom Tromey, Copyright (C) 2000 Red Hat, Inc.
63 changes: 37 additions & 26 deletions analysis/BaseCharFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ namespace Lucene
{
/// Constructs the filter over the given character stream, starting with a
/// zero entry count (size).
BaseCharFilter::BaseCharFilter(CharStreamPtr in) : CharFilter(in), size(0)
{
}

BaseCharFilter::~BaseCharFilter()
Expand All @@ -19,42 +20,52 @@ namespace Lucene

int32_t BaseCharFilter::correct(int32_t currentOff)
{
if (!pcmList || pcmList.empty())
if (!offsets || currentOff < offsets[0])
return currentOff;
for (int32_t i = pcmList.size() - 1; i >= 0; --i)

int32_t hi = size - 1;
if (currentOff >= offsets[hi])
return currentOff + diffs[hi];

int32_t lo = 0;
int32_t mid = -1;

while (hi >= lo)
{
if (currentOff >= pcmList[i]->off)
return currentOff + pcmList[i]->cumulativeDiff;
mid = MiscUtils::unsignedShift(lo + hi, 1);
if (currentOff < offsets[mid])
hi = mid - 1;
else if (currentOff > offsets[mid])
lo = mid + 1;
else
return currentOff + diffs[mid];
}
return currentOff;

if (currentOff < offsets[mid])
return mid == 0 ? currentOff : currentOff + diffs[mid - 1];
else
return currentOff + diffs[mid];
}

int32_t BaseCharFilter::getLastCumulativeDiff()
{
return (!pcmList || pcmList.empty()) ? 0 : pcmList[pcmList.size() - 1]->cumulativeDiff;
return !offsets ? 0 : diffs[size - 1];
}

void BaseCharFilter::addOffCorrectMap(int32_t off, int32_t cumulativeDiff)
{
if (!pcmList)
pcmList = Collection<OffCorrectMapPtr>::newInstance();
pcmList.add(newLucene<OffCorrectMap>(off, cumulativeDiff));
}

OffCorrectMap::OffCorrectMap(int32_t off, int32_t cumulativeDiff)
{
this->off = off;
this->cumulativeDiff = cumulativeDiff;
}

OffCorrectMap::~OffCorrectMap()
{
}

String OffCorrectMap::toString()
{
StringStream buffer;
buffer << L"(" << off << L"," << cumulativeDiff << L")";
return buffer.str();
if (!offsets)
{
offsets = IntArray::newInstance(64);
diffs = IntArray::newInstance(64);
}
else if (size == offsets.length())
{
offsets.resize(MiscUtils::getNextSize(offsets.length()));
diffs.resize(MiscUtils::getNextSize(diffs.length()));
}

offsets[size] = off;
diffs[size++] = cumulativeDiff;
}
}
9 changes: 9 additions & 0 deletions analysis/PerFieldAnalyzerWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "LuceneInc.h"
#include "PerFieldAnalyzerWrapper.h"
#include "Fieldable.h"

namespace Lucene
{
Expand Down Expand Up @@ -56,6 +57,14 @@ namespace Lucene
return analyzer->getPositionIncrementGap(fieldName);
}

/// Returns the offset gap for the given field, as reported by the analyzer
/// registered in analyzerMap under the field's name, or by defaultAnalyzer
/// when no analyzer is registered for that name.
int32_t PerFieldAnalyzerWrapper::getOffsetGap(FieldablePtr field)
{
    AnalyzerPtr fieldAnalyzer(analyzerMap.get(field->name()));
    // An empty pointer means no per-field analyzer was found; fall back to the default.
    return (fieldAnalyzer ? fieldAnalyzer : defaultAnalyzer)->getOffsetGap(field);
}

String PerFieldAnalyzerWrapper::toString()
{
return L"PerFieldAnalyzerWrapper(default=" + defaultAnalyzer->toString() + L")";
Expand Down
8 changes: 1 addition & 7 deletions analysis/standard/StandardTokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,16 +140,10 @@ namespace Lucene
offsetAtt->setOffset(finalOffset, finalOffset);
}

void StandardTokenizer::reset()
{
TokenStream::reset();
scanner->yyreset(input);
}

void StandardTokenizer::reset(ReaderPtr input)
{
Tokenizer::reset(input);
reset();
scanner->reset(input);
}

bool StandardTokenizer::isReplaceInvalidAcronym()
Expand Down
8 changes: 8 additions & 0 deletions analysis/standard/StandardTokenizerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,14 @@ namespace Lucene
return _yychar;
}

/// Resets the scanner to read from the supplied reader, first shrinking
/// zzBuffer back to ZZ_BUFFERSIZE if it has grown beyond that length.
void StandardTokenizerImpl::reset(ReaderPtr r)
{
    const bool bufferHasGrown = zzBuffer.length() > ZZ_BUFFERSIZE;
    if (bufferHasGrown)
    {
        // restore the default buffer size before reuse
        zzBuffer.resize(ZZ_BUFFERSIZE);
    }

    // delegate the actual reset to yyreset with the new reader
    yyreset(r);
}

void StandardTokenizerImpl::getText(TokenPtr t)
{
t->setTermBuffer(zzBuffer.get(), zzStartRead, zzMarkedPos - zzStartRead);
Expand Down
2 changes: 1 addition & 1 deletion contrib/highlighter/SimpleSpanFragmenter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ namespace Lucene
}
}

bool isNewFrag = (offsetAtt->endOffset() >= (fragmentSize * currentNumFrags) && (textSize - offsetAtt->endOffset()) >= (fragmentSize >> 1));
bool isNewFrag = (offsetAtt->endOffset() >= (fragmentSize * currentNumFrags) && (textSize - offsetAtt->endOffset()) >= MiscUtils::unsignedShift(fragmentSize, 1));

if (isNewFrag)
++currentNumFrags;
Expand Down
4 changes: 4 additions & 0 deletions contrib/include/DutchStemFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ namespace Lucene
/// It supports a table of words that should not be stemmed at all. The stemmer used can
/// be changed at runtime after the filter object is created (as long as it is a
/// {@link DutchStemmer}).
///
/// NOTE: This stemmer does not implement the Snowball algorithm correctly, specifically
/// doubled consonants. It is recommended that you consider using the "Dutch" stemmer in
/// the snowball package instead. This stemmer will likely be deprecated in a future release.
class LPPAPI DutchStemFilter : public TokenFilter
{
public:
Expand Down
4 changes: 4 additions & 0 deletions contrib/include/FrenchStemFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ namespace Lucene
/// It supports a table of words that should not be stemmed at all. The stemmer used can
/// be changed at runtime after the filter object is created (as long as it is a
/// {@link FrenchStemmer}).
///
/// NOTE: This stemmer does not implement the Snowball algorithm correctly, especially
/// involving case problems. It is recommended that you consider using the "French" stemmer
/// in the snowball package instead. This stemmer will likely be deprecated in a future release.
class LPPAPI FrenchStemFilter : public TokenFilter
{
public:
Expand Down
25 changes: 4 additions & 21 deletions include/BaseCharFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,6 @@ namespace Lucene
{
/// Base utility class for implementing a {@link CharFilter}. You subclass this, and then record mappings by
/// calling {@link #addOffCorrectMap}, and then invoke the {@link #correct} method to correct an offset.
///
/// NOTE: This class is not particularly efficient. For example, a new class instance is created for every call
/// to {@link #addOffCorrectMap}, which is then appended to a private list.
class LPPAPI BaseCharFilter : public CharFilter
{
public:
Expand All @@ -25,31 +22,17 @@ namespace Lucene
LUCENE_CLASS(BaseCharFilter);

protected:
Collection<OffCorrectMapPtr> pcmList;
IntArray offsets;
IntArray diffs;
int32_t size;

protected:
/// Retrieve the corrected offset. Note that this method is slow, if you correct positions far before the
/// most recently added position, as it's a simple linear search backwards through all offset corrections
/// added by {@link #addOffCorrectMap}.
/// Retrieve the corrected offset.
virtual int32_t correct(int32_t currentOff);

int32_t getLastCumulativeDiff();
void addOffCorrectMap(int32_t off, int32_t cumulativeDiff);
};

class LPPAPI OffCorrectMap : public LuceneObject
{
public:
OffCorrectMap(int32_t off, int32_t cumulativeDiff);
virtual ~OffCorrectMap();

public:
int32_t off;
int32_t cumulativeDiff;

public:
virtual String toString();
};
}

#endif
8 changes: 4 additions & 4 deletions include/BitSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ namespace Lucene
bool get(uint32_t bitIndex) const;
bool fastGet(uint32_t bitIndex) const;
int32_t nextSetBit(uint32_t fromIndex) const;
void andBitSet(BitSetPtr set);
void orBitSet(BitSetPtr set);
void xorBitSet(BitSetPtr set);
void andNotBitSet(BitSetPtr set);
void _and(BitSetPtr set);
void _or(BitSetPtr set);
void _xor(BitSetPtr set);
void andNot(BitSetPtr set);
bool intersectsBitSet(BitSetPtr set) const;
uint32_t cardinality();
void resize(uint32_t size);
Expand Down
30 changes: 15 additions & 15 deletions include/BitUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,51 +24,51 @@ namespace Lucene

public:
/// Returns the number of bits set in the long
static uint32_t pop(uint64_t x);
static int32_t pop(int64_t x);

/// Returns the number of set bits in an array of longs.
static uint64_t pop_array(const uint64_t* A, int32_t wordOffset, int32_t numWords);
static int64_t pop_array(const int64_t* A, int32_t wordOffset, int32_t numWords);

/// Returns the popcount or cardinality of the two sets after an intersection. Neither array is modified.
static uint64_t pop_intersect(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
static int64_t pop_intersect(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);

/// Returns the popcount or cardinality of the union of two sets. Neither array is modified.
static uint64_t pop_union(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
static int64_t pop_union(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);

/// Returns the popcount or cardinality of A & ~B. Neither array is modified.
static uint64_t pop_andnot(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
static int64_t pop_andnot(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);

/// Returns the popcount or cardinality of A ^ B. Neither array is modified.
static uint64_t pop_xor(const uint64_t* A, const uint64_t* B, int32_t wordOffset, int32_t numWords);
static int64_t pop_xor(const int64_t* A, const int64_t* B, int32_t wordOffset, int32_t numWords);

/// Returns number of trailing zeros in a 64 bit long value.
static uint32_t ntz(uint64_t val);
static int32_t ntz(int64_t val);

/// Returns number of trailing zeros in a 32 bit int value.
static uint32_t ntz(uint32_t val);
static int32_t ntz(int32_t val);

/// Returns 0 based index of first set bit (only works for x!=0)
/// This is an alternate implementation of ntz()
static uint32_t ntz2(uint64_t x);
static int32_t ntz2(int64_t x);

/// Returns 0 based index of first set bit.
/// This is an alternate implementation of ntz()
static uint32_t ntz3(uint64_t x);
static int32_t ntz3(int64_t x);

/// Returns true if v is a power of two or zero.
static bool isPowerOfTwo(uint32_t v);
static bool isPowerOfTwo(int32_t v);

/// Returns true if v is a power of two or zero.
static bool isPowerOfTwo(uint64_t v);
static bool isPowerOfTwo(int64_t v);

/// Returns the next highest power of two, or the current value if it's already a power of two or zero.
static uint32_t nextHighestPowerOfTwo(uint32_t v);
static int32_t nextHighestPowerOfTwo(int32_t v);

/// Returns the next highest power of two, or the current value if it's already a power of two or zero.
static uint64_t nextHighestPowerOfTwo(uint64_t v);
static int64_t nextHighestPowerOfTwo(int64_t v);

protected:
inline static void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c);
inline static void CSA(int64_t& h, int64_t& l, int64_t a, int64_t b, int64_t c);
};
}

Expand Down
2 changes: 2 additions & 0 deletions include/Directory.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ namespace Lucene
/// This is how locking "scopes" to the right index.
virtual String getLockID();

virtual String toString();

/// Copy contents of a directory src to a directory dest. If a file in src already exists in dest then the one
/// in dest will be blindly overwritten. NOTE: the source directory cannot change while this method is running.
/// Otherwise the results are undefined.
Expand Down
7 changes: 6 additions & 1 deletion include/DirectoryReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,17 @@ namespace Lucene
int32_t termInfosIndexDivisor;

bool rollbackHasChanges;
SegmentInfosPtr rollbackSegmentInfos;

Collection<SegmentReaderPtr> subReaders;
Collection<int32_t> starts; // 1st docno for each segment
MapStringByteArray normsCache;
int32_t _maxDoc;
int32_t _numDocs;
bool _hasDeletions;

// Max version in index as of when we opened; this can be > our current segmentInfos version
// in case we were opened on a past IndexCommit
int64_t maxIndexVersion;

public:
void _initialize(Collection<SegmentReaderPtr> subReaders);
Expand Down Expand Up @@ -353,6 +356,8 @@ namespace Lucene
MapStringString userData;

public:
virtual String toString();

/// Returns true if this commit is an optimized index.
virtual bool isOptimized();

Expand Down
2 changes: 1 addition & 1 deletion include/DocIdSetIterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ namespace Lucene
///
/// NOTE: after the iterator is exhausted you should not call this method, as it may result in unpredictable
/// behaviour.
virtual int32_t advance(int32_t target) = 0;
virtual int32_t advance(int32_t target) = 0;
};
}

Expand Down
Loading

0 comments on commit a6cbc06

Please sign in to comment.