Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tasks for positional queries with nested disjunctions #133

Merged
merged 1 commit into from
Jun 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 98 additions & 11 deletions src/main/perf/TaskParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.queries.spans.SpanNearQuery;
import org.apache.lucene.queries.spans.SpanOrQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanTermQuery;
import org.apache.lucene.queries.intervals.Intervals;
import org.apache.lucene.queries.intervals.IntervalsSource;
import org.apache.lucene.queries.intervals.IntervalQuery;
import java.io.IOException;
import java.util.Arrays;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
Expand Down Expand Up @@ -297,6 +300,12 @@ Query buildQuery(String type, String text, int minShouldMatch) throws ParseExcep
switch(type) {
case "ordered":
return parseOrderedQuery();
case "spanDis":
return parseSpanDisjunctions();
case "intervalDis":
return parseIntervalDisjunctions(true);
case "intervalDisMin":
return parseIntervalDisjunctions(false);
case "near":
return parseNearQuery();
case "multiPhrase":
Expand Down Expand Up @@ -411,28 +420,106 @@ Query parseNearQuery() {
true);
}

/**
 * Builds a span-based positional query from the disjunction spec held in {@code text}.
 *
 * <p>The spec is decoded by {@code parseDisjunctionSpec} into clauses, each clause being a
 * set of alternatives, and each alternative being one or more words. A single-word
 * alternative becomes a {@link SpanTermQuery}; a multi-word alternative becomes an ordered
 * {@link SpanNearQuery} with slop 0. Alternatives of a clause are combined with
 * {@link SpanOrQuery} (unless there is only one), and the clauses are combined with an
 * ordered {@link SpanNearQuery} using the parsed slop (unless there is only one clause).
 *
 * @return the span query for the parsed spec
 */
Query parseSpanDisjunctions() {
  final String[] fieldOut = new String[1];
  final int[] slopOut = new int[] {10}; // slop stays at 10 unless the spec overrides it
  final String[][][] spec = parseDisjunctionSpec(fieldOut, slopOut);
  final String fieldName = fieldOut[0];
  final int maxSlop = slopOut[0];

  final SpanQuery[] positions = new SpanQuery[spec.length];
  for (int p = 0; p < spec.length; p++) {
    final String[][] alternatives = spec[p];
    final SpanQuery[] options = new SpanQuery[alternatives.length];
    for (int a = 0; a < alternatives.length; a++) {
      final String[] tokens = alternatives[a];
      if (tokens.length == 1) {
        options[a] = new SpanTermQuery(new Term(fieldName, tokens[0]));
      } else {
        // multi-word alternative: the words must appear adjacent and in order
        final SpanQuery[] exact = new SpanQuery[tokens.length];
        for (int t = 0; t < tokens.length; t++) {
          exact[t] = new SpanTermQuery(new Term(fieldName, tokens[t]));
        }
        options[a] = new SpanNearQuery(exact, 0, true);
      }
    }
    // a lone alternative needs no SpanOrQuery wrapper
    positions[p] = options.length == 1 ? options[0] : new SpanOrQuery(options);
  }

  // NOTE: unlike the intervals variant, spans have no dedicated slop==0 form;
  // SpanNearQuery is the only positional combinator used here
  if (positions.length == 1) {
    return positions[0];
  }
  return new SpanNearQuery(positions, maxSlop, true);
}

Query parseIntervalDisjunctions(boolean rewrite) {
String[] fieldHolder = new String[1];
int[] slopHolder = new int[] {10}; // default to slop of 10
String[][][] clauses = parseDisjunctionSpec(fieldHolder, slopHolder);
String field = fieldHolder[0];
int slop = slopHolder[0];
IntervalsSource[] intervalClauses = Arrays.stream(clauses).map((component) -> {
IntervalsSource[] disjunct = Arrays.stream(component).map((words) -> {
if (words.length == 1) {
return Intervals.term(words[0]);
} else {
IntervalsSource[] intervalWords = Arrays.stream(words).map((word) -> {
return Intervals.term(word);
}).toArray((size) -> new IntervalsSource[size]);
return Intervals.phrase(intervalWords);
}
}).toArray((size) -> new IntervalsSource[size]);
return disjunct.length == 1 ? disjunct[0] : Intervals.or(rewrite, disjunct);
}).toArray((size) -> new IntervalsSource[size]);
IntervalsSource positional;
if (intervalClauses.length == 1) {
// NOTE: apparently maxgaps/ordered/phrase do not rewrite for the single-clause
// case? ... or in any event not in a way that's transparent immediately after
// query construction. So we do it manually here, in order be sure that Intervals
// can "put their best foot forward" on the plain-disjunction case.
positional = intervalClauses[0];
} else if (slop == 0) {
// assumption: "phrase" is equivalent to "maxgaps(0, ordered)"?
positional = Intervals.phrase(intervalClauses);
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this effectively the same thing as a good old PhraseQuery with slop=0?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think they should be functionally the same, yes. @romseygeek might be in the best position to confirm though.

} else {
// the usual case
positional = Intervals.maxgaps(slop, Intervals.ordered(intervalClauses));
}
return new IntervalQuery(field, positional);
}

/**
 * Builds a {@link MultiPhraseQuery} from the disjunction spec held in {@code text}.
 *
 * <p>Each clause of the spec contributes one position to the phrase, whose terms are the
 * clause's alternatives. Only the first word of each alternative is used.
 *
 * @return the built multi-phrase query
 * @throws IllegalArgumentException if the spec carries an explicit non-zero slop
 */
Query parseMultiPhrase() {
  final String[] fieldOut = new String[1];
  final int[] slopOut = new int[1]; // zero-initialised, i.e. slop defaults to 0
  final String[][][] spec = parseDisjunctionSpec(fieldOut, slopOut);
  final int slop = slopOut[0];
  if (slop != 0) {
    throw new IllegalArgumentException("multiPhrase only supports slop==0; found:" + slop);
  }
  final String fieldName = fieldOut[0];
  final MultiPhraseQuery.Builder builder = new MultiPhraseQuery.Builder();
  for (String[][] alternatives : spec) {
    // one Term per alternative, taken from the alternative's first word
    final Term[] position = Arrays.stream(alternatives)
        .map(tokens -> new Term(fieldName, tokens[0]))
        .toArray(Term[]::new);
    builder.add(position);
  }
  return builder.build();
}

private String[][][] parseDisjunctionSpec(String[] fieldHolder, int[] slopHolder) {
int colon = text.indexOf(':');
if (colon == -1) {
throw new RuntimeException("failed to parse query=" + text);
}
String field = text.substring("(".length(), colon);
fieldHolder[0] = text.substring("(".length(), colon);
MultiPhraseQuery.Builder b = new MultiPhraseQuery.Builder();
int endParen = text.indexOf(')');
if (endParen == -1) {
throw new RuntimeException("failed to parse query=" + text);
}
String queryText = text.substring(colon+1, endParen);
String elements[] = queryText.split("\\s+");
for (int i = 0; i < elements.length; i++) {
String words[] = elements[i].split("\\|");
Term terms[] = new Term[words.length];
for (int j = 0; j < words.length; j++) {
terms[j] = new Term(field, words[j]);
}
b.add(terms);
int checkExplicitSlop = endParen + 1;
if (text.length() > checkExplicitSlop && text.charAt(checkExplicitSlop) == '~') {
slopHolder[0] = Integer.parseInt(text.substring(checkExplicitSlop + 1).split("[^0-9]", 2)[0]);
}
return b.build();
String queryText = text.substring(colon+1, endParen);
return Arrays.stream(queryText.split("\\s+")).map((clause) -> {
return Arrays.stream(clause.split("\\|")).map((component) -> {
return component.split("-");
}).toArray((size) -> new String[size][]);
}).toArray((size) -> new String[size][][]);
}

Query parseDisjunctionMax() {
Expand Down
18 changes: 18 additions & 0 deletions src/python/competition.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,21 @@ def __init__(self, name, lineFile, numDocs, tasksFile):

WIKI_VECTOR_10K = Data('wikivector10k', constants.WIKI_MEDIUM_DOCS_LINE_FILE, 10000, constants.WIKI_VECTOR_TASKS_FILE)

# Data sets for the disjunction benchmarks. All three share the same line file
# and doc count and differ only in the tasks file they run.
DISJUNCTION_SIMPLE = Data(
    name='disjunctionSimple',
    lineFile=constants.DISJUNCTION_DOCS_LINE_FILE,
    numDocs=constants.DISJUNCTION_DOC_COUNT,
    tasksFile=constants.DISJUNCTION_SIMPLE_TASKS_FILE,
)

DISJUNCTION_REALISTIC = Data(
    name='disjunctionRealistic',
    lineFile=constants.DISJUNCTION_DOCS_LINE_FILE,
    numDocs=constants.DISJUNCTION_DOC_COUNT,
    tasksFile=constants.DISJUNCTION_REALISTIC_TASKS_FILE,
)

DISJUNCTION_INTENSIVE = Data(
    name='disjunctionIntensive',
    lineFile=constants.DISJUNCTION_DOCS_LINE_FILE,
    numDocs=constants.DISJUNCTION_DOC_COUNT,
    tasksFile=constants.DISJUNCTION_INTENSIVE_TASKS_FILE,
)

DATA = {'wikimediumall': WIKI_MEDIUM_ALL,
'wikimedium10m' : WIKI_MEDIUM_10M,
'wikimedium1m' : WIKI_MEDIUM_1M,
Expand All @@ -69,6 +84,9 @@ def __init__(self, name, lineFile, numDocs, tasksFile):
'wikibig1m' : WIKI_BIG_1M,
'euromedium' : EURO_MEDIUM,
'wikivector10k' : WIKI_VECTOR_10K,
'disjunctionSimple' : DISJUNCTION_SIMPLE,
'disjunctionRealistic' : DISJUNCTION_REALISTIC,
'disjunctionIntensive' : DISJUNCTION_INTENSIVE
}

# for multi-segment index:
Expand Down
6 changes: 6 additions & 0 deletions src/python/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
WIKI_MEDIUM_TASKS_ALL_FILE = '%s/tasks/wikimedium.10M.tasks' % BENCH_BASE_DIR
WIKI_VECTOR_TASKS_FILE = '%s/tasks/vector.tasks' % BENCH_BASE_DIR
SORTED_TASKS_FILE = '%s/tasks/sorted.tasks' % BENCH_BASE_DIR
# Tasks files for the disjunction benchmarks (simple / realistic / intensive),
# located under BENCH_BASE_DIR/tasks like the other tasks files above.
DISJUNCTION_SIMPLE_TASKS_FILE = '%s/tasks/disjunctionSimple.tasks' % BENCH_BASE_DIR
DISJUNCTION_REALISTIC_TASKS_FILE = '%s/tasks/disjunctionRealistic.tasks' % BENCH_BASE_DIR
DISJUNCTION_INTENSIVE_TASKS_FILE = '%s/tasks/disjunctionIntensive.tasks' % BENCH_BASE_DIR

# wget http://home.apache.org/~mikemccand/enwiki-20100302-pages-articles-lines.txt.bz2
WIKI_BIG_DOCS_LINE_FILE = '%s/data/enwiki-20100302-pages-articles-lines.txt' % BASE_DIR
Expand All @@ -60,6 +63,9 @@
# enwiki-20130102-lines.txt has 6647577 docs
WIKI_BIG_DOCS_COUNT = 6726515

# The disjunction benchmarks use 500000 docs and reuse the wiki-medium line file
# rather than a dedicated one.
DISJUNCTION_DOC_COUNT = 500000
DISJUNCTION_DOCS_LINE_FILE = WIKI_MEDIUM_DOCS_LINE_FILE

#WIKI_FILE = '%s/data/enwiki-20100302-pages-articles.xml.bz2' % BENCH_BASE_DIR

# 5607746 docs:
Expand Down
29 changes: 29 additions & 0 deletions tasks/disjunctionIntensive.tasks
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# These tasks are artificially designed to produce intensive disjunction queries.
# The goal is to illustrate the performance profile, particularly for
# potentially adversarial queries.

# NOTE: alternating between "a|in-the" and "the|in-the" clauses is required
# to induce `pullUpDisjunctions()` query rewriting, which otherwise seems
# to be prevented by `OrderedIntervalSource.deduplicate()`?

# NOTE: the "smith" term is arbitrary, just to restrict the domain somewhat
# and push QPS into friendlier number ranges

IntervalDis1: intervalDis//(body:smith a|in-the)
IntervalMinDis1: intervalMinDis//(body:smith a|in-the)
SpanDis1: spanDis//(body:smith a|in-the)
IntervalDis2: intervalDis//(body:smith a|in-the the|in-the)
IntervalMinDis2: intervalMinDis//(body:smith a|in-the the|in-the)
SpanDis2: spanDis//(body:smith a|in-the the|in-the)
IntervalDis3: intervalDis//(body:smith a|in-the the|in-the a|in-the)
IntervalMinDis3: intervalMinDis//(body:smith a|in-the the|in-the a|in-the)
SpanDis3: spanDis//(body:smith a|in-the the|in-the a|in-the)
IntervalDis4: intervalDis//(body:smith a|in-the the|in-the a|in-the the|in-the)
IntervalMinDis4: intervalMinDis//(body:smith a|in-the the|in-the a|in-the the|in-the)
SpanDis4: spanDis//(body:smith a|in-the the|in-the a|in-the the|in-the)
IntervalDis5: intervalDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the)
IntervalMinDis5: intervalMinDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the)
SpanDis5: spanDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the)
IntervalDis6: intervalDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the the|in-the)
IntervalMinDis6: intervalMinDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the the|in-the)
SpanDis6: spanDis//(body:smith a|in-the the|in-the a|in-the the|in-the a|in-the the|in-the)
6 changes: 6 additions & 0 deletions tasks/disjunctionRealistic.tasks
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# This is intended to illustrate a realistic query that would undergo
# relatively significant combinatorial expansion in `pullUpDisjunctions()`

IntervalDis: intervalDis//(body:us|united-states health|health-care policy|public-policy law|legal-aspects)
IntervalMinDis: intervalMinDis//(body:us|united-states health|health-care policy|public-policy law|legal-aspects)
SpanDis: spanDis//(body:us|united-states health|health-care policy|public-policy law|legal-aspects)
6 changes: 6 additions & 0 deletions tasks/disjunctionSimple.tasks
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# These should not actually result in positional queries; they exist to evaluate
# realistic pure-disjunction queries in isolation (i.e.: SpanOrQuery,
# DisjunctionIntervalsSource)

PlainIntervalDis: intervalDis//(body:trash|waste|garbage|recycling|refuse)
PlainSpanDis: spanDis//(body:trash|waste|garbage|recycling|refuse)