Skip to content

Commit

Permalink
SERVER-19421 Add abstractions for phrase matching in FTS
Browse files Browse the repository at this point in the history
  • Loading branch information
adamchel committed Jul 29, 2015
1 parent f1bdf1b commit 852258e
Show file tree
Hide file tree
Showing 11 changed files with 280 additions and 33 deletions.
4 changes: 4 additions & 0 deletions src/mongo/db/fts/SConscript
Expand Up @@ -34,6 +34,7 @@ baseEnv.Library('base', [
'fts_spec.cpp',
'fts_spec_legacy.cpp',
'fts_language.cpp',
'fts_basic_phrase_matcher.cpp',
'fts_basic_tokenizer.cpp',
'fts_util.cpp',
'fts_element_iterator.cpp',
Expand Down Expand Up @@ -93,3 +94,6 @@ env.CppUnitTest( "fts_matcher_test", "fts_matcher_test.cpp",

env.CppUnitTest( "fts_element_iterator_test", "fts_element_iterator_test.cpp",
LIBDEPS=["base"] )

env.CppUnitTest( "fts_basic_phrase_matcher_test", "fts_basic_phrase_matcher_test.cpp",
LIBDEPS=["base"] )
49 changes: 49 additions & 0 deletions src/mongo/db/fts/fts_basic_phrase_matcher.cpp
@@ -0,0 +1,49 @@
/**
* Copyright (C) 2015 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/db/fts/fts_basic_phrase_matcher.h"

#include "mongo/platform/strcasestr.h"

namespace mongo {
namespace fts {

using std::string;

/**
 * Reports whether 'phrase' occurs as a substring of 'haystack'.
 *
 * If the kCaseSensitive bit is set in 'options', the lookup is an exact std::string search.
 * Otherwise the lookup is handed to strcasestr(), which operates on the C-string views of
 * both arguments.
 */
bool BasicFTSPhraseMatcher::phraseMatches(const string& phrase,
                                          const string& haystack,
                                          PhraseMatcherOptions options) const {
    const bool exactCase = (options & kCaseSensitive) != 0;

    // Default path: no case-sensitivity requested, so let strcasestr() do the comparison.
    if (!exactCase) {
        return strcasestr(haystack.c_str(), phrase.c_str()) != nullptr;
    }

    return string::npos != haystack.find(phrase);
}

} // namespace fts
} // namespace mongo
53 changes: 53 additions & 0 deletions src/mongo/db/fts/fts_basic_phrase_matcher.h
@@ -0,0 +1,53 @@
/**
* Copyright (C) 2015 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include "mongo/base/disallow_copying.h"
#include "mongo/db/fts/fts_phrase_matcher.h"

namespace mongo {
namespace fts {

/**
* A phrase matcher that looks for exact substring matches with optional ASCII-aware case
* insensitivity.
*
* This is a concrete implementation of the FTSPhraseMatcher interface. The class is declared
* 'final', so it cannot be used as a base class.
*/
class BasicFTSPhraseMatcher final : public FTSPhraseMatcher {
// NOTE(review): going by its name and its header (mongo/base/disallow_copying.h), this macro
// presumably suppresses the copy constructor and copy assignment -- confirm against the macro.
MONGO_DISALLOW_COPYING(BasicFTSPhraseMatcher);

public:
BasicFTSPhraseMatcher() = default;

/**
* Returns true if 'phrase' occurs as a substring of 'haystack', false otherwise.
*
* 'options' is a bitmask of FTSPhraseMatcher option flags; the kCaseSensitive flag selects
* case-sensitive comparison.
*/
bool phraseMatches(const std::string& phrase,
const std::string& haystack,
PhraseMatcherOptions options) const final;
};

} // namespace fts
} // namespace mongo
77 changes: 77 additions & 0 deletions src/mongo/db/fts/fts_basic_phrase_matcher_test.cpp
@@ -0,0 +1,77 @@
/**
* Copyright (C) 2015 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#include "mongo/db/fts/fts_basic_phrase_matcher.h"

#include "mongo/unittest/unittest.h"

namespace mongo {
namespace fts {

// With no options set (kNone), a phrase that differs from the text only in letter case must
// still be found, while words that are not adjacent in the text must not be reported as a
// phrase.
TEST(FtsBasicPhraseMatcher, CaseInsensitive) {
    const std::string loremText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
    const std::string loremHit = "Consectetur adipiscing";
    const std::string loremMiss = "dolor amet";

    const std::string duisText =
        "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum.";
    const std::string duisHit = "In Voluptate";
    const std::string duisMiss = "dolor velit";

    const FTSPhraseMatcher::PhraseMatcherOptions matchOptions = FTSPhraseMatcher::kNone;
    BasicFTSPhraseMatcher matcher;

    // Phrases present in the text, modulo capitalization.
    ASSERT(matcher.phraseMatches(loremHit, loremText, matchOptions));
    ASSERT(matcher.phraseMatches(duisHit, duisText, matchOptions));

    // Both words appear in the text, but not next to each other.
    ASSERT_FALSE(matcher.phraseMatches(loremMiss, loremText, matchOptions));
    ASSERT_FALSE(matcher.phraseMatches(duisMiss, duisText, matchOptions));
}

// With kCaseSensitive set, only phrases whose capitalization matches the text exactly are
// found; the same words with different capitalization must be rejected.
TEST(FtsBasicPhraseMatcher, CaseSensitive) {
    const std::string loremText = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";
    const std::string loremHit = "Lorem ipsum";
    const std::string loremMiss = "Sit amet";

    const std::string duisText =
        "Duis aute irure dolor in reprehenderit in Voluptate velit esse cillum.";
    const std::string duisHit = "in Voluptate";
    const std::string duisMiss = "Irure dolor";

    const FTSPhraseMatcher::PhraseMatcherOptions matchOptions =
        FTSPhraseMatcher::kCaseSensitive;
    BasicFTSPhraseMatcher matcher;

    // Exact-case phrases are located.
    ASSERT(matcher.phraseMatches(loremHit, loremText, matchOptions));
    ASSERT(matcher.phraseMatches(duisHit, duisText, matchOptions));

    // Same words as the text, but with a different leading capital.
    ASSERT_FALSE(matcher.phraseMatches(loremMiss, loremText, matchOptions));
    ASSERT_FALSE(matcher.phraseMatches(duisMiss, duisText, matchOptions));
}

} // namespace fts
} // namespace mongo
5 changes: 5 additions & 0 deletions src/mongo/db/fts/fts_language.cpp
Expand Up @@ -33,6 +33,7 @@
#include <string>

#include "mongo/base/init.h"
#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_basic_tokenizer.h"
#include "mongo/stdx/memory.h"
#include "mongo/util/assert_util.h"
Expand Down Expand Up @@ -85,6 +86,10 @@ std::unique_ptr<FTSTokenizer> BasicFTSLanguage::createTokenizer() const {
return stdx::make_unique<BasicFTSTokenizer>(this);
}

/**
* Returns a reference to the phrase matcher stored in this object's '_basicPhraseMatcher'
* member. No new matcher is created by this call.
*/
const FTSPhraseMatcher& BasicFTSLanguage::getPhraseMatcher() const {
return _basicPhraseMatcher;
}

MONGO_INITIALIZER_GROUP(FTSAllLanguagesRegistered, MONGO_NO_PREREQUISITES, MONGO_NO_DEPENDENTS);

//
Expand Down
13 changes: 12 additions & 1 deletion src/mongo/db/fts/fts_language.h
Expand Up @@ -30,6 +30,8 @@

#pragma once

#include "mongo/db/fts/fts_basic_phrase_matcher.h"
#include "mongo/db/fts/fts_phrase_matcher.h"
#include "mongo/db/fts/fts_util.h"
#include "mongo/base/status_with.h"

Expand Down Expand Up @@ -86,6 +88,11 @@ class FTSLanguage {
*/
virtual std::unique_ptr<FTSTokenizer> createTokenizer() const = 0;

/**
* Returns a reference to the phrase matcher instance that this language owns.
*/
virtual const FTSPhraseMatcher& getPhraseMatcher() const = 0;

/**
* Register std::string 'languageName' as a new language with text index version
* 'textIndexVersion'. Saves the resulting language to out-argument 'languageOut'.
Expand Down Expand Up @@ -133,7 +140,11 @@ typedef StatusWith<const FTSLanguage*> StatusWithFTSLanguage;

class BasicFTSLanguage : public FTSLanguage {
public:
std::unique_ptr<FTSTokenizer> createTokenizer() const override;
std::unique_ptr<FTSTokenizer> createTokenizer() const final;
const FTSPhraseMatcher& getPhraseMatcher() const final;

private:
BasicFTSPhraseMatcher _basicPhraseMatcher;
};

extern BasicFTSLanguage languagePorterV1;
Expand Down
20 changes: 7 additions & 13 deletions src/mongo/db/fts/fts_matcher.cpp
Expand Up @@ -31,27 +31,16 @@
#include "mongo/platform/basic.h"

#include "mongo/db/fts/fts_matcher.h"
#include "mongo/db/fts/fts_phrase_matcher.h"
#include "mongo/db/fts/fts_tokenizer.h"
#include "mongo/db/fts/fts_element_iterator.h"
#include "mongo/platform/strcasestr.h"

namespace mongo {

namespace fts {

using std::string;

/**
* Does the string 'phrase' occur in the string 'haystack'? Match is case-insensitive if
* 'caseSensitive' is false; otherwise, an exact substring match is performed.
*/
static bool phraseMatches(const string& phrase, const string& haystack, bool caseSensitive) {
if (caseSensitive) {
return haystack.find(phrase) != string::npos;
}
return strcasestr(haystack.c_str(), phrase.c_str()) != NULL;
}

FTSMatcher::FTSMatcher(const FTSQuery& query, const FTSSpec& spec) : _query(query), _spec(spec) {}

bool FTSMatcher::matches(const BSONObj& obj) const {
Expand Down Expand Up @@ -163,7 +152,12 @@ bool FTSMatcher::_phraseMatch(const string& phrase, const BSONObj& obj) const {

while (it.more()) {
FTSIteratorValue val = it.next();
if (phraseMatches(phrase, val._text, _query.getCaseSensitive())) {

if (val._language->getPhraseMatcher().phraseMatches(phrase,
val._text,
_query.getCaseSensitive()
? FTSPhraseMatcher::kCaseSensitive
: FTSPhraseMatcher::kNone)) {
return true;
}
}
Expand Down
65 changes: 65 additions & 0 deletions src/mongo/db/fts/fts_phrase_matcher.h
@@ -0,0 +1,65 @@
/**
* Copyright (C) 2015 MongoDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the GNU Affero General Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/

#pragma once

#include <cstdint>
#include <string>

namespace mongo {
namespace fts {

/**
 * An interface for substring matching routines.
 *
 * Implementations decide how a phrase is located inside a larger piece of text. Callers
 * adjust the comparison by passing a bitmask of the option flags declared below.
 */
class FTSPhraseMatcher {
public:
    virtual ~FTSPhraseMatcher() = default;

    /**
     * Bitmask type for matching options. Individual flags occupy distinct bits so they can
     * be combined with '|' and tested with '&'.
     */
    using PhraseMatcherOptions = uint8_t;

    /**
     * Use no options.
     */
    static const PhraseMatcherOptions kNone = 0;

    /**
     * Perform case-sensitive matching: letters in the phrase must have the same case as the
     * letters in the text being searched.
     */
    static const PhraseMatcherOptions kCaseSensitive = 1 << 0;

    /**
     * Does the string 'phrase' occur in the string 'haystack'?
     *
     * 'options' is a bitmask built from the flags above (kNone when no flag applies).
     * Returns true when the implementation finds 'phrase' in 'haystack', false otherwise.
     */
    virtual bool phraseMatches(const std::string& phrase,
                               const std::string& haystack,
                               PhraseMatcherOptions options) const = 0;
};

} // namespace fts
} // namespace mongo
16 changes: 5 additions & 11 deletions src/mongo/db/fts/fts_query.cpp
Expand Up @@ -106,10 +106,11 @@ Status FTSQuery::parse(const string& query,
unsigned phraseStart = quoteOffset + 1;
unsigned phraseLength = t.offset - phraseStart;
StringData phrase = StringData(query).substr(phraseStart, phraseLength);
if (inNegation)
_negatedPhrases.push_back(normalizeString(phrase));
else
_positivePhrases.push_back(normalizeString(phrase));
if (inNegation) {
_negatedPhrases.push_back(phrase.toString());
} else {
_positivePhrases.push_back(phrase.toString());
}
inNegation = false;
inPhrase = false;
} else {
Expand Down Expand Up @@ -170,13 +171,6 @@ void FTSQuery::_addTerms(FTSTokenizer* tokenizer, const string& sentence, bool n
}
}

string FTSQuery::normalizeString(StringData str) const {
if (_caseSensitive) {
return str.toString();
}
return tolowerString(str);
}

namespace {
void _debugHelp(stringstream& ss, const set<string>& s, const string& sep) {
bool first = true;
Expand Down

0 comments on commit 852258e

Please sign in to comment.