Skip to content

Commit

Permalink
LUCENE-10212: Add support to parse task for CombinedFieldsQuery (#148)
Browse files Browse the repository at this point in the history
LUCENE-10212: Add support to parse task for CombinedFieldsQuery

Task format:
taskName term1 term2 term3 term4 ... +combinedFields=field1^boost1,field2,field3^boost3,...
  • Loading branch information
zacharymorn committed Nov 9, 2021
1 parent ec7438c commit 0550148
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 6 deletions.
1 change: 0 additions & 1 deletion src/main/perf/LineFileDocs.java
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;
Expand Down
67 changes: 63 additions & 4 deletions src/main/perf/TaskParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.sandbox.search.CombinedFieldQuery;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery.Builder;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PointRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
Expand Down Expand Up @@ -93,6 +93,9 @@ public TaskParser(IndexState state,

private final static Pattern filterPattern = Pattern.compile(" \\+filter=([0-9\\.]+)%");
private final static Pattern minShouldMatchPattern = Pattern.compile(" \\+minShouldMatch=(\\d+)($| )");
// Matches: taskName term1 term2 term3 term4 +combinedFields=field1^1.0,field2,field3^2.0
// (two or more comma-separated alphanumeric field names, each with an optional ^weight).
// A weight is an integer or a plain decimal such as 2, 2.0 or 20.05; other float spellings
// (.9, 1e3, ...) are deliberately not handled, which is good enough for perf test query parsing.
// The decimal point is escaped so that it cannot match an arbitrary character, and the fractional
// part is a non-capturing group so the numbering of the capturing groups stays unchanged.
private final static Pattern combinedFieldsPattern = Pattern.compile(" \\+combinedFields=((\\p{Alnum}+(\\^\\d+(?:\\.\\d+)?)?,)+\\p{Alnum}+(\\^\\d+(?:\\.\\d+)?)?)");

public Task parseOneTask(String line) throws ParseException {
return new TaskBuilder(line).build();
Expand All @@ -104,6 +107,7 @@ class TaskBuilder {
final String origText;

List<String> facets;
List<FieldAndWeight> combinedFields;
String text;
boolean doDrillSideways, doHilite, doStoredLoadsTask;
Sort sort;
Expand Down Expand Up @@ -148,10 +152,11 @@ Task buildQueryTask(String input) throws ParseException {
String taskType = taskAndType[0];
text = taskAndType[1];
int msm = parseMinShouldMatch();
Query query = buildQuery(taskType, text, msm);
combinedFields = parseCombinedFields();
Query query = buildQuery(taskType, text, msm, combinedFields);
Query query2 = applyDrillDowns(query, drillDowns);
Query query3 = applyFilter(query2, filter);
return new SearchTask(category, query2, sort, group, topN, doHilite, doStoredLoadsTask, facets, null, doDrillSideways);
return new SearchTask(category, query3, sort, group, topN, doHilite, doStoredLoadsTask, facets, null, doDrillSideways);
}

String[] parseTaskType(String line) {
Expand Down Expand Up @@ -226,6 +231,36 @@ int parseMinShouldMatch() {
return minShouldMatch;
}

/**
 * Immutable holder pairing a field name with the weight to apply to that field.
 */
class FieldAndWeight {
  /** Name of the field. */
  final String field;

  /** Weight associated with {@link #field}. */
  final float weight;

  /**
   * @param field name of the field
   * @param weight weight to associate with the field
   */
  FieldAndWeight(String field, float weight) {
    this.weight = weight;
    this.field = field;
  }
}

/**
 * Extracts the optional {@code +combinedFields=field1^w1,field2,...} directive from {@code text}.
 *
 * <p>When the directive is found it is spliced out of {@code text}, and the remaining text is
 * trimmed, so that only the query terms are left. Fields without an explicit {@code ^weight}
 * get a weight of {@code 1.0}.
 *
 * @return the fields and weights in the order they were listed, or {@code null} (not an empty
 *     list) when {@code text} contains no directive; the query-building code tests for
 *     {@code null} to decide whether a combined-fields query was requested
 * @throws NumberFormatException if a matched weight cannot be parsed as a float
 */
List<FieldAndWeight> parseCombinedFields() {
  final Matcher m = combinedFieldsPattern.matcher(text);
  if (!m.find()) {
    // No directive present: keep the null contract that callers rely on.
    return null;
  }

  final List<FieldAndWeight> result = new ArrayList<>();
  for (String fieldAndWeight : m.group(1).split(",")) {
    if (fieldAndWeight.contains("^")) { // boosted field: name^weight
      final String[] pair = fieldAndWeight.split("\\^");
      // parseFloat yields the primitive directly, avoiding a box/unbox round trip
      result.add(new FieldAndWeight(pair[0], Float.parseFloat(pair[1])));
    } else {
      // unboosted field: default weight as a float literal rather than a widened long
      result.add(new FieldAndWeight(fieldAndWeight, 1.0f));
    }
  }

  // Splice out the combinedFields string:
  text = (text.substring(0, m.start()) + text.substring(m.end())).trim();
  return result;
}

List<String> parseFacets() {
List<String> facets = new ArrayList<>();
while (true) {
Expand Down Expand Up @@ -295,7 +330,7 @@ void parseHilite() {
}
}

Query buildQuery(String type, String text, int minShouldMatch) throws ParseException {
Query buildQuery(String type, String text, int minShouldMatch, List<FieldAndWeight> fieldAndWeights) throws ParseException {
Query query;
switch(type) {
case "ordered":
Expand Down Expand Up @@ -327,6 +362,30 @@ Query buildQuery(String type, String text, int minShouldMatch) throws ParseExcep
if (query.toString().equals("")) {
throw new RuntimeException("query text \"" + text + "\" parsed to empty query");
}

if (combinedFields != null) {
CombinedFieldQuery.Builder cfqBuilder = new CombinedFieldQuery.Builder();

for (FieldAndWeight fieldAndWeight : fieldAndWeights) {
cfqBuilder.addField(fieldAndWeight.field, fieldAndWeight.weight);
}

if (query instanceof TermQuery) {
cfqBuilder.addTerm(((TermQuery) query).getTerm().bytes());
} else if (query instanceof BooleanQuery) {
for (BooleanClause clause : (BooleanQuery) query) {
if (clause.getOccur() != Occur.SHOULD) {
throw new RuntimeException("combinedFields can only be used with TermQuery or BooleanQuery with OR clauses: query=" + origText);
}
cfqBuilder.addTerm(((TermQuery) clause.getQuery()).getTerm().bytes());
}
} else {
throw new RuntimeException("combinedFields can only be used with TermQuery or BooleanQuery with OR clauses: query=" + origText);
}

return cfqBuilder.build();
}

if (minShouldMatch == 0) {
return query;
} else {
Expand Down
19 changes: 18 additions & 1 deletion src/python/competition.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,19 @@ def __init__(self, name, lineFile, numDocs, tasksFile):
constants.DISJUNCTION_DOC_COUNT,
constants.DISJUNCTION_INTENSIVE_TASKS_FILE)

# Combined-fields benchmark data sets: each pairs a Wikipedia line file and
# document count with one of the combined-fields task files.

# Big line file, default combined-fields tasks.
COMBINED_FIELDS_BIG = Data(
    'combinedFieldsBig',
    constants.WIKI_BIG_DOCS_LINE_FILE,
    constants.WIKI_BIG_DOCS_COUNT,
    constants.COMBINED_FIELDS_TASKS_FILE,
)

# Big line file, unevenly weighted combined-fields tasks.
COMBINED_FIELDS_UNEVENLY_WEIGHTED_BIG = Data(
    'combinedFieldsUnevenlyWeightedBig',
    constants.WIKI_BIG_DOCS_LINE_FILE,
    constants.WIKI_BIG_DOCS_COUNT,
    constants.COMBINED_FIELDS_UNEVENLY_WEIGHTED_TASKS_FILE,
)

# Medium line file with a document count of 10,000,000, default combined-fields tasks.
COMBINED_FIELDS_MEDIUM_10M = Data(
    'combinedFieldsMedium10M',
    constants.WIKI_MEDIUM_DOCS_LINE_FILE,
    10000000,
    constants.COMBINED_FIELDS_TASKS_FILE,
)

# Medium line file with a document count of 10,000,000, unevenly weighted combined-fields tasks.
COMBINED_FIELDS_UNEVENLY_WEIGHTED_MEDIUM_10M = Data(
    'combinedFieldsUnevenlyWeightedMedium10M',
    constants.WIKI_MEDIUM_DOCS_LINE_FILE,
    10000000,
    constants.COMBINED_FIELDS_UNEVENLY_WEIGHTED_TASKS_FILE,
)


DATA = {'wikimediumall': WIKI_MEDIUM_ALL,
'wikimedium10m' : WIKI_MEDIUM_10M,
'wikimedium1m' : WIKI_MEDIUM_1M,
Expand All @@ -86,7 +99,11 @@ def __init__(self, name, lineFile, numDocs, tasksFile):
'wikivector10k' : WIKI_VECTOR_10K,
'disjunctionSimple' : DISJUNCTION_SIMPLE,
'disjunctionRealistic' : DISJUNCTION_REALISTIC,
'disjunctionIntensive' : DISJUNCTION_INTENSIVE
'disjunctionIntensive' : DISJUNCTION_INTENSIVE,
'combinedFieldsBig' : COMBINED_FIELDS_BIG,
'combinedFieldsUnevenlyWeightedBig' : COMBINED_FIELDS_UNEVENLY_WEIGHTED_BIG,
'combinedFieldsMedium10M' : COMBINED_FIELDS_MEDIUM_10M,
'combinedFieldsUnevenlyWeightedMedium10M' : COMBINED_FIELDS_UNEVENLY_WEIGHTED_MEDIUM_10M
}

# for multi-segment index:
Expand Down
2 changes: 2 additions & 0 deletions src/python/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
DISJUNCTION_SIMPLE_TASKS_FILE = '%s/tasks/disjunctionSimple.tasks' % BENCH_BASE_DIR
DISJUNCTION_REALISTIC_TASKS_FILE = '%s/tasks/disjunctionRealistic.tasks' % BENCH_BASE_DIR
DISJUNCTION_INTENSIVE_TASKS_FILE = '%s/tasks/disjunctionIntensive.tasks' % BENCH_BASE_DIR
# Task files for the combined-fields benchmarks (default and unevenly weighted variants),
# located under BENCH_BASE_DIR/tasks.
COMBINED_FIELDS_TASKS_FILE = '%s/tasks/combinedfields.tasks' % BENCH_BASE_DIR
COMBINED_FIELDS_UNEVENLY_WEIGHTED_TASKS_FILE = '%s/tasks/combinedfields.unevenlyweighted.tasks' % BENCH_BASE_DIR

# wget http://home.apache.org/~mikemccand/enwiki-20100302-pages-articles-lines.txt.bz2
WIKI_BIG_DOCS_LINE_FILE = '%s/data/enwiki-20100302-pages-articles-lines.txt' % BASE_DIR
Expand Down
15 changes: 15 additions & 0 deletions tasks/combinedfields.tasks
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
CFQHighHigh: are last +combinedFields=titleTokenized^4.0,body^2.0 # freq=1921211 freq=830278
CFQHighHigh: at united +combinedFields=titleTokenized^4.0,body^2.0 # freq=2834104 freq=1185528
CFQHighHigh: but year +combinedFields=titleTokenized^4.0,body^2.0 # freq=1484398 freq=1098425
CFQHighHigh: name its +combinedFields=titleTokenized^4.0,body^2.0 # freq=2577591 freq=1160703
CFQHighHigh: to but +combinedFields=titleTokenized^4.0,body^2.0 # freq=6105155 freq=1484398
CFQHighMed: at mostly +combinedFields=titleTokenized^4.0,body^2.0 # freq=2834104 freq=89401
CFQHighMed: his interview +combinedFields=titleTokenized^4.0,body^2.0 # freq=1771920 freq=94736
CFQHighMed: http 9 +combinedFields=titleTokenized^4.0,body^2.0 # freq=3289683 freq=541405
CFQHighMed: they hard +combinedFields=titleTokenized^4.0,body^2.0 # freq=1031516 freq=92045
CFQHighMed: title bay +combinedFields=titleTokenized^4.0,body^2.0 # freq=2077102 freq=117167
CFQHighLow: should marque +combinedFields=titleTokenized^4.0,body^2.0 # freq=401379 freq=1801
CFQHighLow: location cloverleaf +combinedFields=titleTokenized^4.0,body^2.0 # freq=489791 freq=1097
CFQHighLow: three geronimo +combinedFields=titleTokenized^4.0,body^2.0 # freq=598349 freq=1158
CFQHighLow: do necessities +combinedFields=titleTokenized^4.0,body^2.0 # freq=511178 freq=1195
CFQHighLow: had halfback +combinedFields=titleTokenized^4.0,body^2.0 # freq=1246743 freq=1205
15 changes: 15 additions & 0 deletions tasks/combinedfields.unevenlyweighted.tasks
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
CFQHighHigh: are last +combinedFields=titleTokenized^20.0,body^1.0 # freq=1921211 freq=830278
CFQHighHigh: at united +combinedFields=titleTokenized^20.0,body^1.0 # freq=2834104 freq=1185528
CFQHighHigh: but year +combinedFields=titleTokenized^20.0,body^1.0 # freq=1484398 freq=1098425
CFQHighHigh: name its +combinedFields=titleTokenized^20.0,body^1.0 # freq=2577591 freq=1160703
CFQHighHigh: to but +combinedFields=titleTokenized^20.0,body^1.0 # freq=6105155 freq=1484398
CFQHighMed: at mostly +combinedFields=titleTokenized^20.0,body^1.0 # freq=2834104 freq=89401
CFQHighMed: his interview +combinedFields=titleTokenized^20.0,body^1.0 # freq=1771920 freq=94736
CFQHighMed: http 9 +combinedFields=titleTokenized^20.0,body^1.0 # freq=3289683 freq=541405
CFQHighMed: they hard +combinedFields=titleTokenized^20.0,body^1.0 # freq=1031516 freq=92045
CFQHighMed: title bay +combinedFields=titleTokenized^20.0,body^1.0 # freq=2077102 freq=117167
CFQHighLow: should marque +combinedFields=titleTokenized^20.0,body^1.0 # freq=401379 freq=1801
CFQHighLow: location cloverleaf +combinedFields=titleTokenized^20.0,body^1.0 # freq=489791 freq=1097
CFQHighLow: three geronimo +combinedFields=titleTokenized^20.0,body^1.0 # freq=598349 freq=1158
CFQHighLow: do necessities +combinedFields=titleTokenized^20.0,body^1.0 # freq=511178 freq=1195
CFQHighLow: had halfback +combinedFields=titleTokenized^20.0,body^1.0 # freq=1246743 freq=1205

0 comments on commit 0550148

Please sign in to comment.