Skip to content

Commit

Permalink
Tuning bm25, adding sort order, remove levenshtein
Browse files Browse the repository at this point in the history
Modify suggestion mode flow

add unit tests
  • Loading branch information
maneeshpm committed Feb 17, 2021
1 parent 1077e3e commit 544f52f
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 88 deletions.
31 changes: 0 additions & 31 deletions src/levenshtein.cpp

This file was deleted.

9 changes: 0 additions & 9 deletions src/levenshtein.h

This file was deleted.

1 change: 0 additions & 1 deletion src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,6 @@ endif
xapian_sources = [
'search.cpp',
'search_iterator.cpp',
'levenshtein.cpp',
'xapian/htmlparse.cc',
'xapian/myhtmlparse.cc',
'writer/xapianIndexer.cpp',
Expand Down
55 changes: 10 additions & 45 deletions src/search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
#include <zim/item.h>
#include "fileimpl.h"
#include "search_internal.h"
#include "levenshtein.h"
#include "fs.h"

#include <sstream>
Expand Down Expand Up @@ -115,23 +114,6 @@ setup_queryParser(Xapian::QueryParser* queryparser,
}
}

// Xapian::KeyMaker that builds a sort key for a document from the result
// of levenshtein_distance() between one of the document's stored values
// and the query string.
class LevenshteinDistanceMaker : public Xapian::KeyMaker {
public:
// query: string every document value is compared against.
// value_index: slot handed to Document::get_value() to fetch that value.
LevenshteinDistanceMaker(const std::string& query, size_t value_index):
query(query),
value_index(value_index) {}
~LevenshteinDistanceMaker() = default;

// Returns the sort key for `doc`: levenshtein_distance(document value,
// query), encoded with Xapian::sortable_serialise().
virtual std::string operator() (const Xapian::Document &doc) const {
auto document_value = doc.get_value(value_index);
return Xapian::sortable_serialise(
levenshtein_distance(document_value, query));
}
private:
// Copy of the query string given at construction.
std::string query;
// Value slot read from each document.
size_t value_index;
};

}

Search::Search(const std::vector<Archive>& archives) :
Expand Down Expand Up @@ -235,8 +217,6 @@ Search& Search::set_suggestion_mode(const bool suggestion_mode) {
return *this;
}

#define WITH_LEV 1

Search::iterator Search::begin() const {
if ( this->search_started ) {
return new search_iterator::InternalData(this, internal->results.begin());
Expand Down Expand Up @@ -364,9 +344,6 @@ Search::iterator Search::begin() const {
delete queryParser;

Xapian::Enquire enquire(internal->database);
#if WITH_LEV
std::unique_ptr<Xapian::KeyMaker> keyMaker(nullptr);
#endif

if (geo_query && valuesmap.find("geo.position") != valuesmap.end()) {
Xapian::GreatCircleMetric metric;
Expand All @@ -379,29 +356,17 @@ Search::iterator Search::begin() const {
}
}

enquire.set_query(query);

#if WITH_LEV
if (suggestion_mode && !hasNewSuggestionFormat) {
size_t value_index = 0;
bool has_custom_distance_maker = true;
if ( !valuesmap.empty() ) {
if ( valuesmap.find("title") != valuesmap.end() ) {
value_index = valuesmap["title"];
} else {
// This should not happen as valuesmap has a title entry, but let's
// be tolerant.
has_custom_distance_maker = false;
}
}
auto temp_results = enquire.get_mset(0,0);
if ( has_custom_distance_maker
&& temp_results.get_matches_estimated() <= MAX_MATCHES_TO_SORT ) {
keyMaker.reset(new LevenshteinDistanceMaker(this->query, value_index));
enquire.set_sort_by_key(keyMaker.get(), false);
}
// In suggestion mode, we are searching over a separate title index.
// The default BM25 is not adapted for this case: tuning down the wdf
// factor (k1) and increasing the length normalization factor (b) is
// necessary. The document set is sorted first by relevance score and then
// by value, so that suggestion results are closer to the search string.
if (suggestion_mode) {
enquire.set_weighting_scheme(Xapian::BM25Weight(0.001,0,1,1,0.5));
enquire.set_sort_by_relevance_then_value(valuesmap["title"], true);
}
#endif

enquire.set_query(query);

if (suggestion_mode && valuesmap.find("title") != valuesmap.end()) {
enquire.set_collapse_key(valuesmap["title"]);
Expand Down
4 changes: 2 additions & 2 deletions test/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ tests = [
]

if xapian_dep.found()
tests += ['search']
tests += ['search', 'suggestion']
endif

if gtest_dep.found() and not meson.is_cross_build()
Expand All @@ -37,4 +37,4 @@ if gtest_dep.found() and not meson.is_cross_build()
test(test_name, test_exe, timeout : 120,
workdir: meson.current_build_dir())
endforeach
endif
endif
188 changes: 188 additions & 0 deletions test/suggestion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
/*
* Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
* NON-INFRINGEMENT. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/

#include <zim/zim.h>
#include <zim/archive.h>
#include <zim/search.h>
#include <zim/writer/creator.h>
#include <zim/writer/item.h>
#include <zim/writer/contentProvider.h>

#include "tools.h"

#include "gtest/gtest.h"

namespace {

// Writer item used to fill the test archives: path, mimetype and title are
// forwarded to BasicItem and the content is always the empty string.
class TestItem : public zim::writer::BasicItem {
  public:
    TestItem(const std::string& path, const std::string& mimetype, const std::string& title)
      : BasicItem(path, mimetype, title)
    {}

    // Hands out a provider wrapping an empty string.
    virtual std::unique_ptr<zim::writer::ContentProvider> getContentProvider() const {
      std::unique_ptr<zim::writer::ContentProvider> emptyContent(
          new zim::writer::StringProvider(""));
      return emptyContent;
    }
};

std::string createTestZim(std::vector<std::string> titles) {
zim::unittests::TempFile temp("testZim");
auto tempPath = temp.path() + ".zim";

zim::writer::Creator creator;
creator.configIndexing(true, "eng");
creator.startZimCreation(tempPath);

// add dummy items with given titles
for (auto title : titles) {
std::string path = "dummyPath" + title;
auto item = std::make_shared<TestItem>(path, "plain/text", title);
creator.addItem(item);
}

creator.addMetadata("Title", "This is a title");
creator.finishZimCreation();

return tempPath;
}

std::vector<std::string> getSuggestions(const zim::Archive archive, std::string query, int range) {
zim::Search search(archive);
search.set_suggestion_mode(true);
search.set_query(query);
search.set_range(0, range);
search.set_verbose(true);

std::vector<std::string> result;
for (auto entry = search.begin();entry!=search.end();entry++) {
result.push_back((*entry).getTitle());
}

return result;
}

// An empty query string must not yield any suggestion.
TEST(Suggestion, emptyQuery) {
  std::vector<std::string> articleTitles;
  articleTitles.push_back("fooland");
  articleTitles.push_back("berlin wall");
  articleTitles.push_back("hotel berlin, berlin");
  articleTitles.push_back("again berlin");
  articleTitles.push_back("berlin");
  articleTitles.push_back("not berlin");

  std::string zimPath = createTestZim(articleTitles);
  const zim::Archive archive(zimPath);

  std::vector<std::string> suggestions =
      getSuggestions(archive, "", archive.getEntryCount());

  ASSERT_EQ(suggestions.size(), 0);
}

// A query matching none of the titles must not yield any suggestion.
TEST(Suggestion, noResult) {
  // Every literal needs its trailing comma: adjacent string literals are
  // concatenated by the compiler, which would silently merge "fooland" and
  // "berlin wall" into the single title "foolandberlin wall".
  std::vector<std::string> titles = {
    "fooland",
    "berlin wall",
    "hotel berlin, berlin",
    "again berlin",
    "berlin",
    "not berlin",
  };

  std::string path = createTestZim(titles);
  const zim::Archive archive(path);
  std::vector<std::string> resultSet = getSuggestions(archive, "none", archive.getEntryCount());

  ASSERT_EQ(resultSet.size(), 0);
}

// Checks the order in which titles containing a single query term are
// suggested; "fooland" does not contain the term and must be absent.
TEST(Suggestion, singleTermOrder) {
  std::vector<std::string> titles = {
    "fooland",
    "berlin wall",
    "hotel berlin, berlin",
    "again berlin",
    "berlin",
    "not berlin",
  };

  std::vector<std::string> expectedResult = {
    "berlin",
    "hotel berlin, berlin",
    "not berlin",
    "berlin wall",
    "again berlin",
  };

  std::string path = createTestZim(titles);
  const zim::Archive archive(path);
  std::vector<std::string> resultSet = getSuggestions(archive, "berlin", archive.getEntryCount());

  // Sizes are asserted first so that indexing both vectors below is safe.
  ASSERT_EQ(expectedResult.size(), resultSet.size());

  // size_t index: no signed/unsigned comparison and no cast needed. The
  // position is streamed into the assertion, so it is only reported when a
  // comparison actually fails.
  for (size_t i = 0; i < resultSet.size(); ++i) {
    ASSERT_EQ(expectedResult[i], resultSet[i]) << "mismatch at position " << i;
  }
}

// With five matching titles and a range of 2, exactly two suggestions
// must come back.
TEST(Suggestion, resultsGreaterThanLimit) {
  std::vector<std::string> articleTitles;
  articleTitles.push_back("foobar b");
  articleTitles.push_back("foobar a");
  articleTitles.push_back("foobar c");
  articleTitles.push_back("foobar e");
  articleTitles.push_back("foobar d");

  std::string zimPath = createTestZim(articleTitles);
  const zim::Archive archive(zimPath);

  std::vector<std::string> suggestions = getSuggestions(archive, "foobar", 2);

  ASSERT_EQ(2 , suggestions.size());
}

// Checks the order in which titles are suggested for a two-term query.
TEST(Suggestion, phraseOrder) {
  std::vector<std::string> titles = {
    "Summer in Berlin",
    "In Summer",
    "Shivers in summer",
    "Summer in Paradise",
    "In mid Summer",
  };

  // TODO improve the relative order of terms
  std::vector<std::string> expectedResult = {
    "In Summer",
    "Summer in Paradise",
    "Summer in Berlin",
    "Shivers in summer",
    "In mid Summer",
  };

  std::string path = createTestZim(titles);
  const zim::Archive archive(path);
  std::vector<std::string> resultSet = getSuggestions(archive, "summer in", archive.getEntryCount());

  // Sizes are asserted first so that indexing both vectors below is safe.
  ASSERT_EQ(expectedResult.size(), resultSet.size());

  // size_t index: no signed/unsigned comparison and no cast needed. The
  // position is streamed into the assertion, so it is only reported when a
  // comparison actually fails.
  for (size_t i = 0; i < resultSet.size(); ++i) {
    ASSERT_EQ(expectedResult[i], resultSet[i]) << "mismatch at position " << i;
  }
}
}

0 comments on commit 544f52f

Please sign in to comment.