From 32e4a58d2e1cc9fa41501cc0bc740e36dce50271 Mon Sep 17 00:00:00 2001 From: Martijn van Groningen Date: Thu, 24 Jan 2013 15:38:35 +0100 Subject: [PATCH] Added suggest api. # Suggest feature The suggest feature suggests similar looking terms based on a provided text by using a suggester. At the moment there the only supported suggester is `fuzzy`. The suggest feature is available since version `0.21.0`. # Fuzzy suggester The `fuzzy` suggester suggests terms based on edit distance. The provided suggest text is analyzed before terms are suggested. The suggested terms are provided per analyzed suggest text token. The `fuzzy` suggester doesn't take the query into account that is part of request. # Suggest API The suggest request part is defined along side the query part as top field in the json request. ``` curl -s -XPOST 'localhost:9200/_search' -d '{ "query" : { ... }, "suggest" : { ... } }' ``` Several suggestions can be specified per request. Each suggestion is identified with an arbitary name. In the example below two suggestions are requested. The `my-suggest-1` suggestion uses the `body` field and `my-suggest-2` uses the `title` field. The `type` field is a required field and defines what suggester to use for a suggestion. ``` "suggest" : { "suggestions" : { "my-suggest-1" : { "type" : "fuzzy", "field" : "body", "text" : "the amsterdma meetpu" }, "my-suggest-2" : { "type" : "fuzzy", "field" : "title", "text" : "the rottredam meetpu" } } } ``` The below suggest response example includes the suggestions part for `my-suggest-1` and `my-suggest-2`. Each suggestion part contains a terms array, that contains all terms outputted by the analyzed suggest text. Each term object includes the term itself, the original start and end offset in the suggest text and if found an arbitary number of suggestions. ``` { ... "suggest": { "my-suggest-1": { "terms" : [ { "term" : "amsterdma", "start_offset": 5, "end_offset": 14, "suggestions": [ ... ] } ... 
] }, "my-suggest-2" : { "terms" : [ ... ] } } ``` Each suggestions array contains a suggestion object that includes the suggested term, its document frequency and score compared to the suggest text term. The meaning of the score depends on the used suggester. The fuzzy suggester's score is based on the edit distance. ``` "suggestions": [ { "term": "amsterdam", "frequency": 77, "score": 0.8888889 }, ... ] ``` # Global suggest text To avoid repitition of the suggest text, it is possible to define a global text. In the example below the suggest text is a global option and applies to the `my-suggest-1` and `my-suggest-2` suggestions. ``` "suggest" : { "suggestions" : { "text" : "the amsterdma meetpu", "my-suggest-1" : { "type" : "fuzzy", "field" : "title" }, "my-suggest-2" : { "type" : "fuzzy", "field" : "body" } } } ``` The suggest text can be specied as global option or as suggestion specific option. The suggest text specified on suggestion level override the suggest text on the global level. # Other suggest example. In the below example we request suggestions for the following suggest text: `devloping distibutd saerch engies` on the `title` field with a maximum of 3 suggestions per term inside the suggest text. Note that in this example we use the `count` search type. This isn't required, but a nice optimalization. The suggestions are gather in the `query` phase and in the case that we only care about suggestions (so no hits) we don't need to execute the `fetch` phase. ``` curl -s -XPOST 'localhost:9200/_search?search_type=count' -d '{ "suggest" : { "suggestions" : { "my-title-suggestions" : { "suggester" : "fuzzy", "field" : "title", "text" : "devloping distibutd saerch engies", "size" : 3 } } } }' ``` The above request could yield the response as stated in the code example below. As you can see if we take the first suggested term of each suggest text term we get `developing distributed search engines` as result. ``` { ... 
"suggest": { "my-title-suggestions": { "terms": [ { "term": "devloping", "start_offset": 0, "end_offset": 9, "suggestions": [ { "term": "developing", "frequency": 77, "score": 0.8888889 }, { "term": "deloping", "frequency": 1, "score": 0.875 }, { "term": "deploying", "frequency": 2, "score": 0.7777778 } ] }, { "term": "distibutd", "start_offset": 10, "end_offset": 19, "suggestions": [ { "term": "distributed", "frequency": 217, "score": 0.7777778 }, { "term": "disributed", "frequency": 1, "score": 0.7777778 }, { "term": "distribute", "frequency": 1, "score": 0.7777778 } ] }, { "term": "saerch", "start_offset": 20, "end_offset": 26, "suggestions": [ { "term": "search", "frequency": 1038, "score": 0.8333333 }, { "term": "smerch", "frequency": 3, "score": 0.8333333 }, { "term": "serch", "frequency": 2, "score": 0.8 } ] }, { "term": "engies", "start_offset": 27, "end_offset": 33, "suggestions": [ { "term": "engines", "frequency": 568, "score": 0.8333333 }, { "term": "engles", "frequency": 3, "score": 0.8333333 }, { "term": "eggies", "frequency": 1, "score": 0.8333333 } ] } ] } } ... } ``` # Common suggest options: * `suggester` - The suggester implementation type. The only supported value is 'fuzzy'. This is a required option. * `text` - The suggest text. The suggest text is a required option that needs to be set globally or per suggestion. # Common fuzzy suggest options * `field` - The field to fetch the candidate suggestions from. This is an required option that either needs to be set globally or per suggestion. * `analyzer` - The analyzer to analyse the suggest text with. Defaults to the search analyzer of the suggest field. * `size` - The maximum corrections to be returned per suggest text token. * `sort` - Defines how suggestions should be sorted per suggest text term. Two possible value: ** `score` - Sort by sore first, then document frequency and then the term itself. 
** `frequency` - Sort by document frequency first, then similarity score and then the term itself. * `suggest_mode` - The suggest mode controls what suggestions are included or controls for what suggest text terms, suggestions should be suggested. Three possible values can be specified: ** `missing` - Only suggest terms in the suggest text that aren't in the index. This is the default. ** `popular` - Only suggest suggestions that occur in more docs than the original suggest text term. ** `always` - Suggest any matching suggestions based on terms in the suggest text. # Other fuzzy suggest options: * `lowercase_terms` - Lower cases the suggest text terms after text analysis. * `max_edits` - The maximum edit distance candidate suggestions can have in order to be considered as a suggestion. Can only be a value between 1 and 2. Any other value results in a bad request error being thrown. Defaults to 2. * `min_prefix` - The number of minimal prefix characters that must match in order to be a candidate suggestion. Defaults to 1. Increasing this number improves spellcheck performance. Usually misspellings don't occur in the beginning of terms. * `min_query_length` - The minimum length a suggest text term must have in order to be included. Defaults to 4. * `shard_size` - Sets the maximum number of suggestions to be retrieved from each individual shard. During the reduce phase only the top N suggestions are returned based on the `size` option. Defaults to the `size` option. Setting this to a value higher than the `size` can be useful in order to get a more accurate document frequency for spelling corrections at the cost of performance. Due to the fact that terms are partitioned amongst shards, the shard level document frequencies of spelling corrections may not be precise. Increasing this will make these document frequencies more precise. 
* `max_inspections` - A factor that is used to multiply with the `shard_size` in order to inspect more candidate spell corrections on the shard level. Can improve accuracy at the cost of performance. Defaults to 5. * `threshold_frequency` - The minimal threshold in number of documents a suggestion should appear in. This can be specified as an absolute number or as a relative percentage of number of documents. This can improve quality by only suggesting high frequency terms. Defaults to 0f and is not enabled. If a value higher than 1 is specified then the number cannot be fractional. The shard level document frequencies are used for this option. * `max_query_frequency` - The maximum threshold in number of documents a suggest text token can exist in order to be included. Can be a relative percentage number (e.g. 0.4) or an absolute number to represent document frequencies. If a value higher than 1 is specified then it cannot be fractional. Defaults to 0.01f. This can be used to exclude high frequency terms from being spellchecked. High frequency terms are usually spelled correctly; on top of this, this also improves the spellcheck performance. The shard level document frequencies are used for this option. 
Closes #2585 --- pom.xml | 6 + .../action/search/SearchRequestBuilder.java | 22 + .../action/search/SearchResponse.java | 5 + .../type/TransportSearchScrollScanAction.java | 2 +- .../rest/action/search/RestSearchAction.java | 15 + .../search/builder/SearchSourceBuilder.java | 14 + .../controller/SearchPhaseController.java | 34 +- .../internal/InternalSearchResponse.java | 24 +- .../search/internal/SearchContext.java | 11 + .../search/query/QueryPhase.java | 9 +- .../search/query/QuerySearchResult.java | 19 + .../elasticsearch/search/suggest/Suggest.java | 509 ++++++++++++++++++ .../search/suggest/SuggestBuilder.java | 383 +++++++++++++ .../search/suggest/SuggestParseElement.java | 235 ++++++++ .../search/suggest/SuggestPhase.java | 231 ++++++++ .../suggest/SuggestionSearchContext.java | 206 +++++++ .../search/SuggestSearchBenchMark.java | 166 ++++++ .../search/suggest/SuggestSearchTests.java | 348 ++++++++++++ 18 files changed, 2233 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/suggest/Suggest.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestBuilder.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestParseElement.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestPhase.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestionSearchContext.java create mode 100644 src/test/java/org/elasticsearch/benchmark/search/SuggestSearchBenchMark.java create mode 100644 src/test/java/org/elasticsearch/test/integration/search/suggest/SuggestSearchTests.java diff --git a/pom.xml b/pom.xml index a3053ca004d19..98b5f0fd1c7fb 100644 --- a/pom.xml +++ b/pom.xml @@ -91,6 +91,12 @@ ${lucene.version} compile + + org.apache.lucene + lucene-suggest + ${lucene.version} + compile + diff --git a/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java b/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java index 
6673f5e811ce0..0003ae465e1a5 100644 --- a/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java +++ b/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java @@ -37,6 +37,7 @@ import org.elasticsearch.search.highlight.HighlightBuilder; import org.elasticsearch.search.sort.SortBuilder; import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.search.suggest.SuggestBuilder; import java.util.Map; @@ -646,6 +647,22 @@ public SearchRequestBuilder setHighlighterType(String type) { return this; } + /** + * Delegates to {@link org.elasticsearch.search.suggest.SuggestBuilder#setText(String)}. + */ + public SearchRequestBuilder setSuggestText(String globalText) { + suggestBuilder().setText(globalText); + return this; + } + + /** + * Delegates to {@link org.elasticsearch.search.suggest.SuggestBuilder#addSuggestion(org.elasticsearch.search.suggest.SuggestBuilder.Suggestion)}. + */ + public SearchRequestBuilder addSuggestion(SuggestBuilder.Suggestion suggestion) { + suggestBuilder().addSuggestion(suggestion); + return this; + } + /** * Sets the source of the request as a json string. 
Note, settings anything other * than the search type will cause this source to be overridden, consider using @@ -818,4 +835,9 @@ private SearchSourceBuilder sourceBuilder() { private HighlightBuilder highlightBuilder() { return sourceBuilder().highlighter(); } + + private SuggestBuilder suggestBuilder() { + return sourceBuilder().suggest(); + } + } diff --git a/src/main/java/org/elasticsearch/action/search/SearchResponse.java b/src/main/java/org/elasticsearch/action/search/SearchResponse.java index e9f931682148d..7339018e44563 100644 --- a/src/main/java/org/elasticsearch/action/search/SearchResponse.java +++ b/src/main/java/org/elasticsearch/action/search/SearchResponse.java @@ -31,6 +31,7 @@ import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.facet.Facets; import org.elasticsearch.search.internal.InternalSearchResponse; +import org.elasticsearch.search.suggest.Suggest; import java.io.IOException; @@ -105,6 +106,10 @@ public Facets facets() { return internalResponse.facets(); } + public Suggest suggest() { + return internalResponse.suggest(); + } + /** * The search facets. 
*/ diff --git a/src/main/java/org/elasticsearch/action/search/type/TransportSearchScrollScanAction.java b/src/main/java/org/elasticsearch/action/search/type/TransportSearchScrollScanAction.java index f8569e17bc099..5279d1ae49852 100644 --- a/src/main/java/org/elasticsearch/action/search/type/TransportSearchScrollScanAction.java +++ b/src/main/java/org/elasticsearch/action/search/type/TransportSearchScrollScanAction.java @@ -127,7 +127,7 @@ protected final void addShardFailure(ShardSearchFailure failure) { public void start() { if (scrollId.context().length == 0) { - final InternalSearchResponse internalResponse = new InternalSearchResponse(new InternalSearchHits(InternalSearchHits.EMPTY, Long.parseLong(this.scrollId.attributes().get("total_hits")), 0.0f), null, false); + final InternalSearchResponse internalResponse = new InternalSearchResponse(new InternalSearchHits(InternalSearchHits.EMPTY, Long.parseLong(this.scrollId.attributes().get("total_hits")), 0.0f), null, null, false); searchCache.releaseQueryFetchResults(queryFetchResults); listener.onResponse(new SearchResponse(internalResponse, request.scrollId(), 0, 0, 0l, buildShardFailures())); return; diff --git a/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java b/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java index 18d8dbb4862ef..b636e6eedb986 100644 --- a/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java +++ b/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java @@ -45,6 +45,7 @@ import static org.elasticsearch.rest.RestRequest.Method.POST; import static org.elasticsearch.rest.RestStatus.BAD_REQUEST; import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder; +import static org.elasticsearch.search.suggest.SuggestBuilder.fuzzySuggestion; /** * @@ -276,6 +277,20 @@ private SearchSourceBuilder parseSearchSource(RestRequest request) { 
searchSourceBuilder.stats(Strings.splitStringByCommaToArray(sStats)); } + String suggestField = request.param("suggest_field"); + if (suggestField != null) { + String suggestText = request.param("suggest_text", queryString); + int suggestSize = request.paramAsInt("suggest_size", 5); + if (searchSourceBuilder == null) { + searchSourceBuilder = new SearchSourceBuilder(); + } + String suggestMode = request.param("suggest_mode"); + searchSourceBuilder.suggest().addSuggestion( + fuzzySuggestion(suggestField).setField(suggestField).setText(suggestText).setSize(suggestSize) + .setSuggestMode(suggestMode) + ); + } + return searchSourceBuilder; } } diff --git a/src/main/java/org/elasticsearch/search/builder/SearchSourceBuilder.java b/src/main/java/org/elasticsearch/search/builder/SearchSourceBuilder.java index 6e92b49b15d9c..db18ecec67aec 100644 --- a/src/main/java/org/elasticsearch/search/builder/SearchSourceBuilder.java +++ b/src/main/java/org/elasticsearch/search/builder/SearchSourceBuilder.java @@ -41,6 +41,7 @@ import org.elasticsearch.search.sort.SortBuilder; import org.elasticsearch.search.sort.SortBuilders; import org.elasticsearch.search.sort.SortOrder; +import org.elasticsearch.search.suggest.SuggestBuilder; import java.io.IOException; import java.util.ArrayList; @@ -103,6 +104,8 @@ public static HighlightBuilder highlight() { private HighlightBuilder highlightBuilder; + private SuggestBuilder suggestBuilder; + private TObjectFloatHashMap indexBoost = null; private String[] stats; @@ -400,6 +403,13 @@ public SearchSourceBuilder highlight(HighlightBuilder highlightBuilder) { return this; } + public SuggestBuilder suggest() { + if (suggestBuilder == null) { + suggestBuilder = new SuggestBuilder(); + } + return suggestBuilder; + } + /** * Sets no fields to be loaded, resulting in only id and type to be returned per field. 
*/ @@ -709,6 +719,10 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws highlightBuilder.toXContent(builder, params); } + if (suggestBuilder != null) { + suggestBuilder.toXContent(builder, params); + } + if (stats != null) { builder.startArray("stats"); for (String stat : stats) { diff --git a/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java b/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java index 2a38084e21464..6363e7f413ba8 100644 --- a/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java +++ b/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java @@ -48,6 +48,7 @@ import org.elasticsearch.search.internal.InternalSearchResponse; import org.elasticsearch.search.query.QuerySearchResult; import org.elasticsearch.search.query.QuerySearchResultProvider; +import org.elasticsearch.search.suggest.Suggest; import java.util.ArrayList; import java.util.Collection; @@ -373,7 +374,38 @@ public InternalSearchResponse merge(ShardDoc[] sortedDocs, Map mergedSuggestions = null; + for (QuerySearchResultProvider resultProvider : queryResults.values()) { + Suggest shardResult = resultProvider.queryResult().suggest(); + if (shardResult == null) { + continue; + } + + if (mergedSuggestions == null) { + mergedSuggestions = shardResult.getSuggestions(); + continue; + } + + for (Suggest.Suggestion shardCommand : shardResult.getSuggestions()) { + for (Suggest.Suggestion mergedSuggestion : mergedSuggestions) { + if (mergedSuggestion.getName().equals(shardCommand.getName())) { + mergedSuggestion.reduce(shardCommand); + } + } + } + } + if (mergedSuggestions != null) { + suggest = new Suggest(mergedSuggestions); + for (Suggest.Suggestion suggestion : mergedSuggestions) { + suggestion.trim(); + } + } + } + InternalSearchHits searchHits = new InternalSearchHits(hits.toArray(new InternalSearchHit[hits.size()]), totalHits, maxScore); - return new 
InternalSearchResponse(searchHits, facets, timedOut); + return new InternalSearchResponse(searchHits, facets, suggest, timedOut); } } diff --git a/src/main/java/org/elasticsearch/search/internal/InternalSearchResponse.java b/src/main/java/org/elasticsearch/search/internal/InternalSearchResponse.java index 75c705a0e56f3..e3ffc1fbe0cd5 100644 --- a/src/main/java/org/elasticsearch/search/internal/InternalSearchResponse.java +++ b/src/main/java/org/elasticsearch/search/internal/InternalSearchResponse.java @@ -27,6 +27,7 @@ import org.elasticsearch.search.SearchHits; import org.elasticsearch.search.facet.Facets; import org.elasticsearch.search.facet.InternalFacets; +import org.elasticsearch.search.suggest.Suggest; import java.io.IOException; @@ -41,16 +42,19 @@ public class InternalSearchResponse implements Streamable, ToXContent { private InternalFacets facets; + private Suggest suggest; + private boolean timedOut; - public static final InternalSearchResponse EMPTY = new InternalSearchResponse(new InternalSearchHits(new InternalSearchHit[0], 0, 0), null, false); + public static final InternalSearchResponse EMPTY = new InternalSearchResponse(new InternalSearchHits(new InternalSearchHit[0], 0, 0), null, null, false); private InternalSearchResponse() { } - public InternalSearchResponse(InternalSearchHits hits, InternalFacets facets, boolean timedOut) { + public InternalSearchResponse(InternalSearchHits hits, InternalFacets facets, Suggest suggest, boolean timedOut) { this.hits = hits; this.facets = facets; + this.suggest = suggest; this.timedOut = timedOut; } @@ -66,12 +70,19 @@ public Facets facets() { return facets; } + public Suggest suggest() { + return suggest; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { hits.toXContent(builder, params); if (facets != null) { facets.toXContent(builder, params); } + if (suggest != null) { + suggest.toXContent(builder, params); + } return builder; } @@ -87,6 +98,9 @@ 
public void readFrom(StreamInput in) throws IOException { if (in.readBoolean()) { facets = InternalFacets.readFacets(in); } + if (in.readBoolean()) { + suggest = Suggest.readSuggest(in); + } timedOut = in.readBoolean(); } @@ -99,6 +113,12 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(true); facets.writeTo(out); } + if (suggest == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + suggest.writeTo(out); + } out.writeBoolean(timedOut); } } diff --git a/src/main/java/org/elasticsearch/search/internal/SearchContext.java b/src/main/java/org/elasticsearch/search/internal/SearchContext.java index f98a89c075fd1..c16b9505443a3 100644 --- a/src/main/java/org/elasticsearch/search/internal/SearchContext.java +++ b/src/main/java/org/elasticsearch/search/internal/SearchContext.java @@ -60,6 +60,7 @@ import org.elasticsearch.search.lookup.SearchLookup; import org.elasticsearch.search.query.QuerySearchResult; import org.elasticsearch.search.scan.ScanContext; +import org.elasticsearch.search.suggest.SuggestionSearchContext; import java.util.ArrayList; import java.util.HashMap; @@ -160,6 +161,8 @@ public static SearchContext current() { private SearchContextHighlight highlight; + private SuggestionSearchContext suggest; + private SearchLookup searchLookup; private boolean queryRewritten; @@ -301,6 +304,14 @@ public void highlight(SearchContextHighlight highlight) { this.highlight = highlight; } + public SuggestionSearchContext suggest() { + return suggest; + } + + public void suggest(SuggestionSearchContext suggest) { + this.suggest = suggest; + } + public boolean hasScriptFields() { return scriptFields != null; } diff --git a/src/main/java/org/elasticsearch/search/query/QueryPhase.java b/src/main/java/org/elasticsearch/search/query/QueryPhase.java index 1ff009b4882c6..61068ec80837c 100644 --- a/src/main/java/org/elasticsearch/search/query/QueryPhase.java +++ b/src/main/java/org/elasticsearch/search/query/QueryPhase.java @@ 
-35,6 +35,7 @@ import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.sort.SortParseElement; import org.elasticsearch.search.sort.TrackScoresParseElement; +import org.elasticsearch.search.suggest.SuggestPhase; import java.util.Map; @@ -44,10 +45,12 @@ public class QueryPhase implements SearchPhase { private final FacetPhase facetPhase; + private final SuggestPhase suggestPhase; @Inject - public QueryPhase(FacetPhase facetPhase) { + public QueryPhase(FacetPhase facetPhase, SuggestPhase suggestPhase) { this.facetPhase = facetPhase; + this.suggestPhase = suggestPhase; } @Override @@ -68,7 +71,8 @@ public QueryPhase(FacetPhase facetPhase) { .put("min_score", new MinScoreParseElement()) .put("minScore", new MinScoreParseElement()) .put("timeout", new TimeoutParseElement()) - .putAll(facetPhase.parseElements()); + .putAll(facetPhase.parseElements()) + .putAll(suggestPhase.parseElements()); return parseElements.build(); } @@ -185,6 +189,7 @@ public void execute(SearchContext searchContext) throws QueryPhaseExecutionExcep searchContext.searcher().processedScope(); } + suggestPhase.execute(searchContext); facetPhase.execute(searchContext); } } diff --git a/src/main/java/org/elasticsearch/search/query/QuerySearchResult.java b/src/main/java/org/elasticsearch/search/query/QuerySearchResult.java index 6ed9ae81fa58e..decf5dca37416 100644 --- a/src/main/java/org/elasticsearch/search/query/QuerySearchResult.java +++ b/src/main/java/org/elasticsearch/search/query/QuerySearchResult.java @@ -25,6 +25,7 @@ import org.elasticsearch.search.SearchShardTarget; import org.elasticsearch.search.facet.Facets; import org.elasticsearch.search.facet.InternalFacets; +import org.elasticsearch.search.suggest.Suggest; import org.elasticsearch.transport.TransportResponse; import java.io.IOException; @@ -43,6 +44,7 @@ public class QuerySearchResult extends TransportResponse implements QuerySearchR private int size; private TopDocs topDocs; private InternalFacets 
facets; + private Suggest suggest; private boolean searchTimedOut; public QuerySearchResult() { @@ -101,6 +103,14 @@ public void facets(InternalFacets facets) { this.facets = facets; } + public Suggest suggest() { + return suggest; + } + + public void suggest(Suggest suggest) { + this.suggest = suggest; + } + public int from() { return from; } @@ -136,6 +146,9 @@ public void readFrom(StreamInput in) throws IOException { if (in.readBoolean()) { facets = InternalFacets.readFacets(in); } + if (in.readBoolean()) { + suggest = Suggest.readSuggest(in); + } searchTimedOut = in.readBoolean(); } @@ -153,6 +166,12 @@ public void writeTo(StreamOutput out) throws IOException { out.writeBoolean(true); facets.writeTo(out); } + if (suggest == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + suggest.writeTo(out); + } out.writeBoolean(searchTimedOut); } } diff --git a/src/main/java/org/elasticsearch/search/suggest/Suggest.java b/src/main/java/org/elasticsearch/search/suggest/Suggest.java new file mode 100644 index 0000000000000..ab594ca97f606 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/Suggest.java @@ -0,0 +1,509 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.suggest; + +import org.elasticsearch.ElasticSearchException; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Streamable; +import org.elasticsearch.common.text.Text; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentBuilderString; + +import java.io.IOException; +import java.util.*; + +/** + * Top level suggest result, containing the result for each suggestion. + */ +public class Suggest implements Iterable, Streamable, ToXContent { + + static class Fields { + + static final XContentBuilderString SUGGEST = new XContentBuilderString("suggest"); + + } + + private List suggestions; + + Suggest() { + } + + public Suggest(List suggestions) { + this.suggestions = suggestions; + } + + /** + * @return the suggestions + */ + public List getSuggestions() { + return suggestions; + } + + @Override + public Iterator iterator() { + return suggestions.iterator(); + } + + @Override + public void readFrom(StreamInput in) throws IOException { + int size = in.readVInt(); + suggestions = new ArrayList(size); + for (int i = 0; i < size; i++) { + Suggestion suggestion = new Suggestion(); + suggestion.readFrom(in); + suggestions.add(suggestion); + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(suggestions.size()); + for (Suggestion command : suggestions) { + command.writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(Fields.SUGGEST); + for (Suggestion suggestion : suggestions) { + suggestion.toXContent(builder, params); + } + builder.endObject(); + return null; + } + + public static Suggest readSuggest(StreamInput in) throws IOException { + Suggest result = new Suggest(); + 
result.readFrom(in); + return result; + } + + /** + * The suggestion responses corresponding with the suggestions in the request. + */ + public static class Suggestion implements Streamable, ToXContent { + + static class Fields { + + static final XContentBuilderString TERMS = new XContentBuilderString("terms"); + + } + + private String name; + private int size; + private Sort sort; + private final List terms = new ArrayList(5); + + Suggestion() { + } + + Suggestion(String name, int size, Sort sort) { + this.name = name; + this.size = size; // The suggested term size specified in request, only used for merging shard responses + this.sort = sort; + } + + void addTerm(Term term) { + terms.add(term); + } + + /** + * @return The terms outputted by the suggest analyzer using the suggested text. Embeds the actual suggested + * terms. + */ + public List getTerms() { + return terms; + } + + /** + * @return The name of the suggestion as is defined in the request. + */ + public String getName() { + return name; + } + + /** + * Merges the result of another suggestion into this suggestion. + */ + public void reduce(Suggestion other) { + assert name.equals(other.name); + assert terms.size() == other.terms.size(); + for (int i = 0; i < terms.size(); i++) { + Term thisTerm = terms.get(i); + Term otherTerm = other.terms.get(i); + thisTerm.reduce(otherTerm, sort); + } + } + + /** + * Trims the number of suggestions per suggest text term to the requested size. 
+ */ + public void trim() { + for (Term term : terms) { + term.trim(size); + } + } + + @Override + public void readFrom(StreamInput in) throws IOException { + name = in.readString(); + size = in.readVInt(); + sort = Sort.fromId(in.readByte()); + int size = in.readVInt(); + terms.clear(); + for (int i = 0; i < size; i++) { + terms.add(Term.read(in)); + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeString(name); + out.writeVInt(size); + out.writeByte(sort.id()); + out.writeVInt(terms.size()); + for (Term term : terms) { + term.writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + builder.startArray(Fields.TERMS); + for (Term term : terms) { + term.toXContent(builder, params); + } + builder.endArray(); + builder.endObject(); + return builder; + } + + + /** + * Represents a term from the suggest text, that contains the term, start/end offsets and zero or more suggested + * terms for this term in the suggested text. 
+ */ + public static class Term implements Streamable, ToXContent { + + static class Fields { + + static final XContentBuilderString TERM = new XContentBuilderString("term"); + static final XContentBuilderString SUGGESTIONS = new XContentBuilderString("suggestions"); + static final XContentBuilderString START_OFFSET = new XContentBuilderString("start_offset"); + static final XContentBuilderString END_OFFSET = new XContentBuilderString("end_offset"); + + } + + private Text term; + private int startOffset; + private int endOffset; + + private List suggested; + + public Term(Text term, int startOffset, int endOffset) { + this.term = term; + this.startOffset = startOffset; + this.endOffset = endOffset; + this.suggested = new ArrayList(5); + } + + Term() { + } + + void addSuggested(SuggestedTerm suggestedTerm) { + suggested.add(suggestedTerm); + } + + void reduce(Term otherTerm, Sort sort) { + assert term.equals(otherTerm.term()); + assert startOffset == otherTerm.startOffset; + assert endOffset == otherTerm.endOffset; + + for (SuggestedTerm otherSuggestedTerm : otherTerm.suggested) { + int index = suggested.indexOf(otherSuggestedTerm); + if (index >= 0) { + SuggestedTerm thisSuggestedTerm = suggested.get(index); + thisSuggestedTerm.setFrequency(thisSuggestedTerm.frequency + otherSuggestedTerm.frequency); + } else { + suggested.add(otherSuggestedTerm); + } + } + + Comparator comparator; + switch (sort) { + case SCORE: + comparator = SuggestPhase.SCORE; + break; + case FREQUENCY: + comparator = SuggestPhase.FREQUENCY; + break; + default: + throw new ElasticSearchException("Could not resolve comparator in reduce phase."); + } + Collections.sort(suggested, comparator); + } + + public Text term() { + return term; + } + + /** + * @return the term (analyzed by suggest analyzer) originating from the suggest text. + */ + public String getTerm() { + return term().string(); + } + + /** + * @return the start offset of this term in the suggest text. 
+ */ + public int getStartOffset() { + return startOffset; + } + + /** + * @return the end offset of this term in the suggest text. + */ + public int getEndOffset() { + return endOffset; + } + + /** + * @return The suggested terms for this particular suggest text term. If there are no suggested terms then + * an empty list is returned. + */ + public List getSuggested() { + return suggested; + } + + void trim(int size) { + int suggestionsToRemove = Math.max(0, suggested.size() - size); + for (int i = 0; i < suggestionsToRemove; i++) { + suggested.remove(suggested.size() - 1); + } + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + Term term = (Term) o; + + if (endOffset != term.endOffset) return false; + if (startOffset != term.startOffset) return false; + if (!this.term.equals(term.term)) return false; + + return true; + } + + @Override + public int hashCode() { + int result = term.hashCode(); + result = 31 * result + startOffset; + result = 31 * result + endOffset; + return result; + } + + static Term read(StreamInput in) throws IOException { + Term term = new Term(); + term.readFrom(in); + return term; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + term = in.readText(); + startOffset = in.readVInt(); + endOffset = in.readVInt(); + int suggestedWords = in.readVInt(); + suggested = new ArrayList(suggestedWords); + for (int j = 0; j < suggestedWords; j++) { + suggested.add(SuggestedTerm.create(in)); + } + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeText(term); + out.writeVInt(startOffset); + out.writeVInt(endOffset); + out.writeVInt(suggested.size()); + for (SuggestedTerm suggestedTerm : suggested) { + suggestedTerm.writeTo(out); + } + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + 
builder.field(Fields.TERM, term); + builder.field(Fields.START_OFFSET, startOffset); + builder.field(Fields.END_OFFSET, endOffset); + builder.startArray(Fields.SUGGESTIONS); + for (SuggestedTerm suggestedTerm : suggested) { + suggestedTerm.toXContent(builder, params); + } + builder.endArray(); + builder.endObject(); + return builder; + } + + /** + * Represents the suggested term, containing a term and its document frequency and score. + */ + public static class SuggestedTerm implements Streamable, ToXContent { + + static class Fields { + + static final XContentBuilderString TERM = new XContentBuilderString("term"); + static final XContentBuilderString FREQUENCY = new XContentBuilderString("frequency"); + static final XContentBuilderString SCORE = new XContentBuilderString("score"); + + } + + private Text term; + private int frequency; + private float score; + + SuggestedTerm(Text term, int frequency, float score) { + this.term = term; + this.frequency = frequency; + this.score = score; + } + + SuggestedTerm() { + } + + public void setFrequency(int frequency) { + this.frequency = frequency; + } + + /** + * @return The actual term. + */ + public Text getTerm() { + return term; + } + + /** + * @return How often this suggested term appears in the index. + */ + public int getFrequency() { + return frequency; + } + + /** + * @return The score based on the edit distance difference between the suggested term and the + * term in the suggest text. 
+ */ + public float getScore() { + return score; + } + + static SuggestedTerm create(StreamInput in) throws IOException { + SuggestedTerm suggestion = new SuggestedTerm(); + suggestion.readFrom(in); + return suggestion; + } + + @Override + public void readFrom(StreamInput in) throws IOException { + term = in.readText(); + frequency = in.readVInt(); + score = in.readFloat(); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeText(term); + out.writeVInt(frequency); + out.writeFloat(score); + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(); + builder.field(Fields.TERM, term); + builder.field(Fields.FREQUENCY, frequency); + builder.field(Fields.SCORE, score); + builder.endObject(); + return builder; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + SuggestedTerm that = (SuggestedTerm) o; + return term.equals(that.term); + + } + + @Override + public int hashCode() { + return term.hashCode(); + } + } + + } + + enum Sort { + + /** + * Sort should first be based on score. + */ + SCORE((byte) 0x0), + + /** + * Sort should first be based on document frequency. 
+ */ + FREQUENCY((byte) 0x1); + + private byte id; + + private Sort(byte id) { + this.id = id; + } + + public byte id() { + return id; + } + + static Sort fromId(byte id) { + if (id == 0) { + return SCORE; + } else if (id == 1) { + return FREQUENCY; + } else { + throw new ElasticSearchException("Illegal suggest sort " + id); + } + } + + } + + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/SuggestBuilder.java b/src/main/java/org/elasticsearch/search/suggest/SuggestBuilder.java new file mode 100644 index 0000000000000..55f8213078acd --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/SuggestBuilder.java @@ -0,0 +1,383 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.suggest; + +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Defines how to perform suggesting. 
This builder allows a number of global options to be specified and + * an arbitrary number of {@link org.elasticsearch.search.suggest.SuggestBuilder.FuzzySuggestion} instances. + *

+ * Suggesting works by suggesting terms that appear in the suggest text that are similar compared to the terms in + * provided text. These spelling suggestions are based on several options described in this class. + */ +public class SuggestBuilder implements ToXContent { + + private String globalText; + + private final List suggestions = new ArrayList(); + + /** + * Sets the text to provide suggestions for. The suggest text is a required option that needs + * to be set either via this setter or via the {@link org.elasticsearch.search.suggest.SuggestBuilder.Suggestion#setText(String)} method. + *

+ * The suggest text gets analyzed by the suggest analyzer or the suggest field search analyzer. + * For each analyzed token, suggested terms are suggested if possible. + */ + public SuggestBuilder setText(String globalText) { + this.globalText = globalText; + return this; + } + + /** + * Adds an {@link org.elasticsearch.search.suggest.SuggestBuilder.FuzzySuggestion} instance under a user defined name. + * The order in which the Suggestions are added, is the same as in the response. + */ + public SuggestBuilder addSuggestion(Suggestion suggestion) { + suggestions.add(suggestion); + return this; + } + + /** + * Returns all suggestions with the defined names. + */ + public List getSuggestion() { + return suggestions; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject("suggest"); + if (globalText != null) { + builder.field("text", globalText); + } + + builder.startObject("suggestions"); + for (Suggestion suggestion : suggestions) { + builder = suggestion.toXContent(builder, params); + } + builder.endObject(); + + builder.endObject(); + return builder; + } + + /** + * Convenience factory method. + * + * @param name The name of this suggestion. This is a required parameter. + */ + public static FuzzySuggestion fuzzySuggestion(String name) { + return new FuzzySuggestion(name); + } + + public static abstract class Suggestion implements ToXContent { + + private String name; + private String suggester; + private String text; + + public Suggestion(String name, String suggester) { + this.name = name; + this.suggester = suggester; + } + + /** + * Same as in {@link SuggestBuilder#setText(String)}, but in the suggestion scope. 
+ */ + public T setText(String text) { + this.text = text; + return (T) this; + } + + @Override + public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { + builder.startObject(name); + if (suggester != null) { + builder.field("suggester", suggester); + } + if (text != null) { + builder.field("text", text); + } + builder = innerToXContent(builder, params); + builder.endObject(); + return builder; + } + + protected abstract XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException; + } + + /** + * Defines the actual suggest command. Each command uses the global options unless defined in the suggestion itself. + * All options are the same as the global options, but are only applicable for this suggestion. + */ + public static class FuzzySuggestion extends Suggestion { + + private String field; + private String analyzer; + private String suggestMode; + private Float accuracy; + private Integer size; + private String sort; + private String stringDistance; + private Boolean lowerCaseTerms; + private Integer maxEdits; + private Integer factor; + private Float maxTermFreq; + private Integer prefixLength; + private Integer minWordLength; + private Float minDocFreq; + private Integer shardSize; + + /** + * @param name The name of this suggestion. This is a required parameter. + */ + public FuzzySuggestion(String name) { + super(name, "fuzzy"); + } + + /** + * Sets from what field to fetch the candidate suggestions from. This is an required option and needs to be set + * via this setter or {@link org.elasticsearch.search.suggest.SuggestBuilder.FuzzySuggestion#setField(String)} method + */ + public FuzzySuggestion setField(String field) { + this.field = field; + return this; + } + + /** + * Sets the analyzer to analyse to suggest text with. Defaults to the search analyzer of the suggest field. 
+ */ + public FuzzySuggestion setAnalyzer(String analyzer) { + this.analyzer = analyzer; + return this; + } + + /** + * The global suggest mode controls what suggested terms are included or controls for what suggest text tokens, + * terms should be suggested for. Three possible values can be specified: + *

    + *
  1. missing - Only suggest terms in the suggest text that aren't in the index. This is the default. + *
  2. popular - Only suggest terms that occur in more docs than the original suggest text term. + *
  3. always - Suggest any matching suggest terms based on tokens in the suggest text. + *
+ */ + public FuzzySuggestion setSuggestMode(String suggestMode) { + this.suggestMode = suggestMode; + return this; + } + + /** + * Sets how similar the suggested terms at least need to be compared to the original suggest text tokens. + * A value between 0 and 1 can be specified. This value will be compared to the string distance result of each + * candidate spelling correction. + *

+ * Default is 0.5f. + */ + public FuzzySuggestion setAccuracy(float accuracy) { + this.accuracy = accuracy; + return this; + } + + /** + * Sets the maximum suggestions to be returned per suggest text term. + */ + public FuzzySuggestion setSize(int size) { + if (size <= 0) { + throw new ElasticSearchIllegalArgumentException("Size must be positive"); + } + + this.size = size; + return this; + } + + /** + * Sets how to sort the suggest terms per suggest text token. + * Two possible values: + *

    + *
  1. score - Sort should first be based on score, then document frequency and then the term itself. + *
  2. frequency - Sort should first be based on document frequency, then score and then the term itself. + *
+ *

+ * What the score is depends on the suggester being used. + */ + public FuzzySuggestion setSort(String sort) { + this.sort = sort; + return this; + } + + /** + * Sets what string distance implementation to use for comparing how similar suggested terms are. + * Five possible values can be specified: + *

    + *
  1. internal - This is the default and is based on damerau_levenshtein, but + * highly optimized for comparing string distance for terms inside the index. + *
  2. damerau_levenshtein - String distance algorithm based on Damerau-Levenshtein algorithm. + *
  3. levenstein - String distance algorithm based on Levenshtein edit distance algorithm. + *
  4. jarowinkler - String distance algorithm based on Jaro-Winkler algorithm. + *
  5. ngram - String distance algorithm based on n-grams. + *
+ */ + public FuzzySuggestion setStringDistance(String stringDistance) { + this.stringDistance = stringDistance; + return this; + } + + /** + * Sets whether to lowercase the suggest text tokens just before suggesting terms. + */ + public FuzzySuggestion setLowerCaseTerms(Boolean lowerCaseTerms) { + this.lowerCaseTerms = lowerCaseTerms; + return this; + } + + /** + * Sets the maximum edit distance candidate suggestions can have in order to be considered as a suggestion. + * Can only be a value between 1 and 2. Any other value result in an bad request error being thrown. Defaults to 2. + */ + public FuzzySuggestion setMaxEdits(Integer maxEdits) { + this.maxEdits = maxEdits; + return this; + } + + /** + * A factor that is used to multiply with the size in order to inspect more candidate suggestions. + * Can improve accuracy at the cost of performance. Defaults to 5. + */ + public FuzzySuggestion setFactor(Integer factor) { + this.factor = factor; + return this; + } + + /** + * Sets a maximum threshold in number of documents a suggest text token can exist in order to be corrected. + * Can be a relative percentage number (e.g 0.4) or an absolute number to represent document frequencies. + * If an value higher than 1 is specified then fractional can not be specified. Defaults to 0.01f. + *

+ * This can be used to exclude high frequency terms from being suggested. High frequency terms are usually + * spelled correctly on top of this this also improves the suggest performance. + */ + public FuzzySuggestion setMaxTermFreq(float maxTermFreq) { + this.maxTermFreq = maxTermFreq; + return this; + } + + /** + * Sets the number of minimal prefix characters that must match in order be a candidate suggestion. + * Defaults to 1. Increasing this number improves suggest performance. Usually misspellings don't occur in the + * beginning of terms. + */ + public FuzzySuggestion setPrefixLength(int prefixLength) { + this.prefixLength = prefixLength; + return this; + } + + /** + * The minimum length a suggest text term must have in order to be corrected. Defaults to 4. + */ + public FuzzySuggestion setMinWordLength(int minWordLength) { + this.minWordLength = minWordLength; + return this; + } + + /** + * Sets a minimal threshold in number of documents a suggested term should appear in. This can be specified as + * an absolute number or as a relative percentage of number of documents. This can improve quality by only suggesting + * high frequency terms. Defaults to 0f and is not enabled. If a value higher than 1 is specified then the number + * cannot be fractional. + */ + public FuzzySuggestion setMinDocFreq(float minDocFreq) { + this.minDocFreq = minDocFreq; + return this; + } + + /** + * Sets the maximum number of suggested term to be retrieved from each individual shard. During the reduce + * phase the only the top N suggestions are returned based on the size option. Defaults to the + * size option. + *

+ * Setting this to a value higher than the `size` can be useful in order to get a more accurate document frequency + * for suggested terms. Due to the fact that terms are partitioned amongst shards, the shard level document + * frequencies of suggestions may not be precise. Increasing this will make these document frequencies + * more precise. + */ + public FuzzySuggestion setShardSize(Integer shardSize) { + this.shardSize = shardSize; + return this; + } + + @Override + public XContentBuilder innerToXContent(XContentBuilder builder, Params params) throws IOException { + if (analyzer != null) { + builder.field("analyzer", analyzer); + } + if (field != null) { + builder.field("field", field); + } + if (suggestMode != null) { + builder.field("suggest_mode", suggestMode); + } + if (accuracy != null) { + builder.field("accuracy", accuracy); + } + if (size != null) { + builder.field("size", size); + } + if (sort != null) { + builder.field("sort", sort); + } + if (stringDistance != null) { + builder.field("string_distance", stringDistance); + } + if (lowerCaseTerms != null) { + builder.field("lowercase_terms", lowerCaseTerms); + } + if (maxEdits != null) { + builder.field("max_edits", maxEdits); + } + if (factor != null) { + builder.field("factor", factor); + } + if (maxTermFreq != null) { + builder.field("max_term_freq", maxTermFreq); + } + if (prefixLength != null) { + builder.field("prefix_length", prefixLength); + } + if (minWordLength != null) { + builder.field("min_word_len", minWordLength); + } + if (minDocFreq != null) { + builder.field("min_doc_freq", minDocFreq); + } + if (shardSize != null) { + builder.field("shard_size", shardSize); + } + return builder; + } + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/SuggestParseElement.java b/src/main/java/org/elasticsearch/search/suggest/SuggestParseElement.java new file mode 100644 index 0000000000000..bfb60477b6279 --- /dev/null +++ 
b/src/main/java/org/elasticsearch/search/suggest/SuggestParseElement.java @@ -0,0 +1,235 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.suggest; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.search.spell.*; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.LevenshteinAutomata; +import org.elasticsearch.ElasticSearchIllegalArgumentException; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.search.SearchParseElement; +import org.elasticsearch.search.internal.SearchContext; + +/** + * + */ +public class SuggestParseElement implements SearchParseElement { + + @Override + public void parse(XContentParser parser, SearchContext context) throws Exception { + SuggestionSearchContext suggestionSearchContext = new SuggestionSearchContext(); + + BytesRef globalText = null; + + Analyzer defaultAnalyzer = context.mapperService().searchAnalyzer(); + float defaultAccuracy = SpellChecker.DEFAULT_ACCURACY; + int defaultSize = 5; + SuggestMode defaultSuggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; + Suggest.Suggestion.Sort defaultSort = Suggest.Suggestion.Sort.SCORE; + StringDistance 
defaultStringDistance = DirectSpellChecker.INTERNAL_LEVENSHTEIN; + boolean defaultLowerCaseTerms = false; // changed from Lucene default because we rely on search analyzer to properly handle it + int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + int defaultFactor = 5; + float defaultMaxTermFreq = 0.01f; + int defaultPrefixLength = 1; + int defaultMinQueryLength = 4; + float defaultMinDocFreq = 0f; + + String fieldName = null; + XContentParser.Token token; + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + fieldName = parser.currentName(); + } else if (token.isValue()) { + if ("text".equals(fieldName)) { + globalText = parser.bytes(); + } else { + throw new ElasticSearchIllegalArgumentException("[suggest] does not support [" + fieldName + "]"); + } + } else if (token == XContentParser.Token.START_OBJECT) { + // TODO: Once we have more suggester impls we need to have different parsing logic per suggester. 
+ // This code is now specific for the fuzzy suggester + if ("suggestions".equals(fieldName)) { + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + fieldName = parser.currentName(); + } else if (token == XContentParser.Token.START_OBJECT) { + SuggestionSearchContext.Suggestion suggestion = new SuggestionSearchContext.Suggestion(); + suggestionSearchContext.addSuggestion(fieldName, suggestion); + + while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) { + if (token == XContentParser.Token.FIELD_NAME) { + fieldName = parser.currentName(); + } else if (token.isValue()) { + if ("suggester".equals(fieldName)) { + suggestion.suggester(parser.text()); + } else if ("analyzer".equals(fieldName)) { + String analyzerName = parser.text(); + Analyzer analyzer = context.mapperService().analysisService().analyzer(analyzerName); + if (analyzer == null) { + throw new ElasticSearchIllegalArgumentException("Analyzer [" + analyzerName + "] doesn't exists"); + } + suggestion.analyzer(analyzer); + } else if ("text".equals(fieldName)) { + suggestion.text(parser.bytes()); + } else if ("field".equals(fieldName)) { + suggestion.setField(parser.text()); + } else if ("accuracy".equals(fieldName)) { + suggestion.accuracy(parser.floatValue()); + } else if ("size".equals(fieldName)) { + suggestion.size(parser.intValue()); + } else if ("suggest_mode".equals(fieldName) || "suggestMode".equals(fieldName)) { + suggestion.suggestMode(resolveSuggestMode(parser.text())); + } else if ("sort".equals(fieldName)) { + suggestion.sort(resolveSort(parser.text())); + } else if ("string_distance".equals(fieldName) || "stringDistance".equals(fieldName)) { + suggestion.stringDistance(resolveDistance(parser.text())); + } else if ("lowercase_terms".equals(fieldName) || "lowercaseTerms".equals(fieldName)) { + suggestion.lowerCaseTerms(parser.booleanValue()); + } else if ("max_edits".equals(fieldName) || 
"maxEdits".equals(fieldName) || "fuzziness".equals(fieldName)) { + suggestion.maxEdits(parser.intValue()); + if (suggestion.maxEdits() < 1 || suggestion.maxEdits() > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) { + throw new ElasticSearchIllegalArgumentException("Illegal max_edits value " + suggestion.maxEdits()); + } + } else if ("factor".equals(fieldName)) { + suggestion.factor(parser.intValue()); + } else if ("max_term_freq".equals(fieldName) || "maxTermFreq".equals(fieldName)) { + suggestion.maxTermFreq(parser.floatValue()); + } else if ("prefix_length".equals(fieldName) || "prefixLength".equals(fieldName)) { + suggestion.prefixLength(parser.intValue()); + } else if ("min_word_len".equals(fieldName) || "minWordLen".equals(fieldName)) { + suggestion.minQueryLength(parser.intValue()); + } else if ("min_doc_freq".equals(fieldName) || "minDocFreq".equals(fieldName)) { + suggestion.minDocFreq(parser.floatValue()); + } else if ("shard_size".equals(fieldName) || "shardSize".equals(fieldName)) { + suggestion.shardSize(parser.intValue()); + } else { + throw new ElasticSearchIllegalArgumentException("suggester[fuzzy] doesn't support [" + fieldName + "]"); + } + } + } + } + } + } + } + } + + // Verify options and set defaults + for (SuggestionSearchContext.Suggestion command : suggestionSearchContext.suggestions().values()) { + if (command.suggester() == null) { + throw new ElasticSearchIllegalArgumentException("The required suggester option is missing"); + } + if (command.field() == null) { + throw new ElasticSearchIllegalArgumentException("The required field option is missing"); + } + + if (command.text() == null) { + if (globalText == null) { + throw new ElasticSearchIllegalArgumentException("The required text option is missing"); + } + + command.text(globalText); + } + if (command.analyzer() == null) { + command.analyzer(defaultAnalyzer); + } + if (command.accuracy() == null) { + command.accuracy(defaultAccuracy); + } + if (command.size() == null) { + 
command.size(defaultSize); + } + if (command.suggestMode() == null) { + command.suggestMode(defaultSuggestMode); + } + if (command.sort() == null) { + command.sort(defaultSort); + } + if (command.stringDistance() == null) { + command.stringDistance(defaultStringDistance); + } + if (command.lowerCaseTerms() == null) { + command.lowerCaseTerms(defaultLowerCaseTerms); + } + if (command.maxEdits() == null) { + command.maxEdits(defaultMaxEdits); + } + if (command.factor() == null) { + command.factor(defaultFactor); + } + if (command.maxTermFreq() == null) { + command.maxTermFreq(defaultMaxTermFreq); + } + if (command.prefixLength() == null) { + command.prefixLength(defaultPrefixLength); + } + if (command.minWordLength() == null) { + command.minQueryLength(defaultMinQueryLength); + } + if (command.minDocFreq() == null) { + command.minDocFreq(defaultMinDocFreq); + } + if (command.shardSize() == null) { + command.shardSize(defaultSize); + } + } + context.suggest(suggestionSearchContext); + } + + private SuggestMode resolveSuggestMode(String sortVal) { + if ("missing".equals(sortVal)) { + return SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; + } else if ("popular".equals(sortVal)) { + return SuggestMode.SUGGEST_MORE_POPULAR; + } else if ("always".equals(sortVal)) { + return SuggestMode.SUGGEST_ALWAYS; + } else { + throw new ElasticSearchIllegalArgumentException("Illegal suggest mode " + sortVal); + } + } + + private Suggest.Suggestion.Sort resolveSort(String sortVal) { + if ("score".equals(sortVal)) { + return Suggest.Suggestion.Sort.SCORE; + } else if ("frequency".equals(sortVal)) { + return Suggest.Suggestion.Sort.FREQUENCY; + } else { + throw new ElasticSearchIllegalArgumentException("Illegal suggest sort " + sortVal); + } + } + + private StringDistance resolveDistance(String distanceVal) { + if ("internal".equals(distanceVal)) { + return DirectSpellChecker.INTERNAL_LEVENSHTEIN; + } else if ("damerau_levenshtein".equals(distanceVal)) { + return new LuceneLevenshteinDistance(); + 
} else if ("levenstein".equals(distanceVal)) { + return new LevensteinDistance(); + } else if ("jarowinkler".equals(distanceVal)) { + return new JaroWinklerDistance(); + } else if ("ngram".equals(distanceVal)) { + return new NGramDistance(); + } else { + throw new ElasticSearchIllegalArgumentException("Illegal distance option " + distanceVal); + } + } + +} diff --git a/src/main/java/org/elasticsearch/search/suggest/SuggestPhase.java b/src/main/java/org/elasticsearch/search/suggest/SuggestPhase.java new file mode 100644 index 0000000000000..811c01eca23cb --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/SuggestPhase.java @@ -0,0 +1,231 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
package org.elasticsearch.search.suggest;

import com.google.common.collect.ImmutableMap;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.search.spell.SuggestWord;
import org.apache.lucene.search.spell.SuggestWordFrequencyComparator;
import org.apache.lucene.search.spell.SuggestWordQueue;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.FastCharArrayReader;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.text.BytesText;
import org.elasticsearch.common.text.StringText;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.search.SearchParseElement;
import org.elasticsearch.search.SearchPhase;
import org.elasticsearch.search.internal.SearchContext;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;

import static org.elasticsearch.search.suggest.Suggest.Suggestion;

/**
 * Search phase that produces term suggestions for the {@code suggest} section of
 * a search request. The suggest text of each requested suggestion is analyzed,
 * and for every resulting token similar terms are looked up in the shard's index.
 * Currently only the {@code fuzzy} suggester (backed by Lucene's
 * {@link DirectSpellChecker}) is supported.
 */
public class SuggestPhase extends AbstractComponent implements SearchPhase {

    @Inject
    public SuggestPhase(Settings settings) {
        super(settings);
    }

    @Override
    public Map<String, ? extends SearchParseElement> parseElements() {
        ImmutableMap.Builder<String, SearchParseElement> parseElements = ImmutableMap.builder();
        parseElements.put("suggest", new SuggestParseElement());
        return parseElements.build();
    }

    @Override
    public void preProcess(SearchContext context) {
    }

    /**
     * Executes all suggestions of the request's suggest section against this shard
     * and stores the result on the query result.
     *
     * @throws ElasticSearchException if an unknown suggester is requested or an I/O error occurs
     */
    @Override
    public void execute(SearchContext context) throws ElasticSearchException {
        SuggestionSearchContext suggest = context.suggest();
        if (suggest == null) {
            return;
        }

        try {
            // Scratch buffer reused for UTF-8 -> UTF-16 conversion of each suggest text.
            // Maybe add CharsRef to CacheRecycler?
            CharsRef spare = new CharsRef();
            List<Suggestion> suggestions = new ArrayList<Suggestion>(suggest.suggestions().size());
            for (Map.Entry<String, SuggestionSearchContext.Suggestion> entry : suggest.suggestions().entrySet()) {
                SuggestionSearchContext.Suggestion suggestion = entry.getValue();
                if ("fuzzy".equals(suggestion.suggester())) {
                    suggestions.add(executeDirectSpellChecker(entry.getKey(), suggestion, context, spare));
                } else {
                    throw new ElasticSearchIllegalArgumentException("Unsupported suggester[" + suggestion.suggester() + "]");
                }
            }
            context.queryResult().suggest(new Suggest(suggestions));
        } catch (IOException e) {
            throw new ElasticSearchException("I/O exception during suggest phase", e);
        }
    }

    /**
     * Runs a single fuzzy suggestion: configures a {@link DirectSpellChecker} from the
     * suggestion options, then gathers similar terms for every token of the analyzed
     * suggest text.
     */
    private Suggestion executeDirectSpellChecker(String name, SuggestionSearchContext.Suggestion suggestion, SearchContext context, CharsRef spare) throws IOException {
        DirectSpellChecker directSpellChecker = new DirectSpellChecker();
        directSpellChecker.setAccuracy(suggestion.accuracy());
        Comparator<SuggestWord> comparator;
        switch (suggestion.sort()) {
            case SCORE:
                comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
                break;
            case FREQUENCY:
                comparator = LUCENE_FREQUENCY;
                break;
            default:
                throw new ElasticSearchIllegalArgumentException("Illegal suggest sort: " + suggestion.sort());
        }
        directSpellChecker.setComparator(comparator);
        directSpellChecker.setDistance(suggestion.stringDistance());
        directSpellChecker.setLowerCaseTerms(suggestion.lowerCaseTerms());
        directSpellChecker.setMaxEdits(suggestion.maxEdits());
        directSpellChecker.setMaxInspections(suggestion.factor());
        directSpellChecker.setMaxQueryFrequency(suggestion.maxTermFreq());
        directSpellChecker.setMinPrefix(suggestion.prefixLength());
        directSpellChecker.setMinQueryLength(suggestion.minWordLength());
        directSpellChecker.setThresholdFrequency(suggestion.minDocFreq());

        Suggestion response = new Suggestion(
                name, suggestion.size(), suggestion.sort()
        );
        // The reader is the same for every token; fetch it once instead of per iteration.
        IndexReader indexReader = context.searcher().getIndexReader();
        List<Token> tokens = queryTerms(suggestion, spare);
        for (Token token : tokens) {
            // TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
            SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(
                    token.term, suggestion.shardSize(), indexReader, suggestion.suggestMode()
            );
            Text key = new BytesText(new BytesArray(token.term.bytes()));
            Suggestion.Term resultTerm = new Suggestion.Term(key, token.startOffset, token.endOffset);
            for (SuggestWord suggestWord : suggestedWords) {
                Text word = new StringText(suggestWord.string);
                resultTerm.addSuggested(new Suggestion.Term.SuggestedTerm(word, suggestWord.freq, suggestWord.score));
            }
            response.addTerm(resultTerm);
        }
        return response;
    }

    /**
     * Analyzes the suggest text with the suggestion's analyzer and returns one
     * {@link Token} (term plus original offsets) per emitted token.
     */
    private List<Token> queryTerms(SuggestionSearchContext.Suggestion suggestion, CharsRef spare) throws IOException {
        UnicodeUtil.UTF8toUTF16(suggestion.text(), spare);
        TokenStream ts = suggestion.analyzer().tokenStream(
                suggestion.field(), new FastCharArrayReader(spare.chars, spare.offset, spare.length)
        );
        try {
            // Attributes are registered before reset(), per the TokenStream contract.
            TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            BytesRef termRef = termAtt.getBytesRef();
            ts.reset();

            List<Token> result = new ArrayList<Token>(5);
            while (ts.incrementToken()) {
                termAtt.fillBytesRef();
                // deepCopyOf: termRef is a shared buffer that the stream overwrites each token.
                Term term = new Term(suggestion.field(), BytesRef.deepCopyOf(termRef));
                result.add(new Token(term, offsetAtt.startOffset(), offsetAtt.endOffset()));
            }
            ts.end();
            return result;
        } finally {
            ts.close();
        }
    }

    private static final Comparator<SuggestWord> LUCENE_FREQUENCY = new SuggestWordFrequencyComparator();
    public static final Comparator<Suggestion.Term.SuggestedTerm> SCORE = new Score();
    public static final Comparator<Suggestion.Term.SuggestedTerm> FREQUENCY = new Frequency();

    // Same behaviour as comparators in suggest module, but for SuggestedWord
    // Highest score first, then highest freq first, then lowest term first
    public static class Score implements Comparator<Suggestion.Term.SuggestedTerm> {

        @Override
        public int compare(Suggestion.Term.SuggestedTerm first, Suggestion.Term.SuggestedTerm second) {
            // first criteria: the distance
            int cmp = Float.compare(second.getScore(), first.getScore());
            if (cmp != 0) {
                return cmp;
            }

            // second criteria (if first criteria is equal): the popularity
            // subtraction is safe here: frequencies are non-negative doc counts
            cmp = second.getFrequency() - first.getFrequency();
            if (cmp != 0) {
                return cmp;
            }
            // third criteria: term text
            return first.getTerm().compareTo(second.getTerm());
        }

    }

    // Same behaviour as comparators in suggest module, but for SuggestedWord
    // Highest freq first, then highest score first, then lowest term first
    public static class Frequency implements Comparator<Suggestion.Term.SuggestedTerm> {

        @Override
        public int compare(Suggestion.Term.SuggestedTerm first, Suggestion.Term.SuggestedTerm second) {
            // first criteria: the popularity
            // subtraction is safe here: frequencies are non-negative doc counts
            int cmp = second.getFrequency() - first.getFrequency();
            if (cmp != 0) {
                return cmp;
            }

            // second criteria (if first criteria is equal): the distance
            cmp = Float.compare(second.getScore(), first.getScore());
            if (cmp != 0) {
                return cmp;
            }

            // third criteria: term text
            return first.getTerm().compareTo(second.getTerm());
        }

    }

    /** Analyzed token of the suggest text: the indexed term plus its original offsets. */
    private static class Token {

        public final Term term;
        public final int startOffset;
        public final int endOffset;

        private Token(Term term, int startOffset, int endOffset) {
            this.term = term;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

    }

}
package org.elasticsearch.search.suggest;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.SuggestMode;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchIllegalArgumentException;

import java.util.LinkedHashMap;
import java.util.Map;

/**
 * Shard-level holder for the parsed {@code suggest} section of a search request.
 * Suggestions are keyed by their request-supplied name; insertion order is
 * preserved so responses come back in request order.
 */
public class SuggestionSearchContext {

    private final Map<String, Suggestion> suggestions = new LinkedHashMap<String, Suggestion>(4);

    public void addSuggestion(String name, Suggestion suggestion) {
        suggestions.put(name, suggestion);
    }

    public Map<String, Suggestion> suggestions() {
        return suggestions;
    }

    /**
     * Options of a single named suggestion. Fields use boxed types so that
     * "not specified" is distinguishable from an explicit value.
     */
    public static class Suggestion {

        private String suggester;          // suggester type, e.g. "fuzzy"
        private BytesRef text;             // raw UTF-8 suggest text
        private String field;              // field whose terms are suggested
        private Analyzer analyzer;         // analyzer applied to the suggest text
        private SuggestMode suggestMode;
        private Float accuracy;
        private Integer size;              // suggestions returned per term; must be > 0
        private Suggest.Suggestion.Sort sort;
        private StringDistance stringDistance;
        private Boolean lowerCaseTerms;
        private Integer maxEdits;
        private Integer factor;            // maps to DirectSpellChecker maxInspections
        private Float maxTermFreq;
        private Integer prefixLength;
        private Integer minWordLength;
        private Float minDocFreq;
        private Integer shardSize;         // candidates gathered per shard before reduce

        public String suggester() {
            return suggester;
        }

        public void suggester(String suggester) {
            this.suggester = suggester;
        }

        public BytesRef text() {
            return text;
        }

        public void text(BytesRef text) {
            this.text = text;
        }

        public Analyzer analyzer() {
            return analyzer;
        }

        public void analyzer(Analyzer analyzer) {
            this.analyzer = analyzer;
        }

        public String field() {
            return field;
        }

        // NOTE(review): setter name breaks the fluent naming of the other mutators;
        // kept as setField for compatibility with existing callers.
        public void setField(String field) {
            this.field = field;
        }

        public SuggestMode suggestMode() {
            return suggestMode;
        }

        public void suggestMode(SuggestMode suggestMode) {
            this.suggestMode = suggestMode;
        }

        public Float accuracy() {
            return accuracy;
        }

        public void accuracy(float accuracy) {
            this.accuracy = accuracy;
        }

        public Integer size() {
            return size;
        }

        /**
         * @throws ElasticSearchIllegalArgumentException if size is not positive
         */
        public void size(int size) {
            if (size <= 0) {
                throw new ElasticSearchIllegalArgumentException("Size must be positive");
            }

            this.size = size;
        }

        public Suggest.Suggestion.Sort sort() {
            return sort;
        }

        public void sort(Suggest.Suggestion.Sort sort) {
            this.sort = sort;
        }

        public StringDistance stringDistance() {
            return stringDistance;
        }

        public void stringDistance(StringDistance distance) {
            this.stringDistance = distance;
        }

        public Boolean lowerCaseTerms() {
            return lowerCaseTerms;
        }

        public void lowerCaseTerms(boolean lowerCaseTerms) {
            this.lowerCaseTerms = lowerCaseTerms;
        }

        public Integer maxEdits() {
            return maxEdits;
        }

        public void maxEdits(int maxEdits) {
            this.maxEdits = maxEdits;
        }

        public Integer factor() {
            return factor;
        }

        public void factor(int factor) {
            this.factor = factor;
        }

        public Float maxTermFreq() {
            return maxTermFreq;
        }

        public void maxTermFreq(float maxTermFreq) {
            this.maxTermFreq = maxTermFreq;
        }

        public Integer prefixLength() {
            return prefixLength;
        }

        public void prefixLength(int prefixLength) {
            this.prefixLength = prefixLength;
        }

        public Integer minWordLength() {
            return minWordLength;
        }

        // NOTE(review): setter named minQueryLength while the getter/field is
        // minWordLength; kept for compatibility with existing callers.
        public void minQueryLength(int minQueryLength) {
            this.minWordLength = minQueryLength;
        }

        public Float minDocFreq() {
            return minDocFreq;
        }

        public void minDocFreq(float minDocFreq) {
            this.minDocFreq = minDocFreq;
        }

        public Integer shardSize() {
            return shardSize;
        }

        public void shardSize(Integer shardSize) {
            this.shardSize = shardSize;
        }
    }

}
package org.elasticsearch.benchmark.search;

import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.SizeValue;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.node.Node;
import org.elasticsearch.search.suggest.Suggest;
import org.elasticsearch.search.suggest.SuggestBuilder;

import java.io.IOException;
import java.util.List;

import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.elasticsearch.index.query.QueryBuilders.*;
import static org.elasticsearch.node.NodeBuilder.nodeBuilder;

/**
 * Benchmark for the fuzzy suggest feature: indexes ~10m single-term docs,
 * warms up, then measures the average took-time of searches that carry a
 * suggestion.
 */
public class SuggestSearchBenchMark {

    public static void main(String[] args) throws Exception {
        int SEARCH_ITERS = 200;

        Settings settings = settingsBuilder()
                .put(SETTING_NUMBER_OF_SHARDS, 1)
                .put(SETTING_NUMBER_OF_REPLICAS, 0)
                .build();

        Node[] nodes = new Node[1];
        for (int i = 0; i < nodes.length; i++) {
            nodes[i] = nodeBuilder().settings(settingsBuilder().put(settings).put("name", "node" + i)).node();
        }

        Client client = nodes[0].client();
        try {
            client.admin().indices().prepareCreate("test").setSettings(settings).addMapping("type1", XContentFactory.jsonBuilder().startObject().startObject("type1")
                    .startObject("_source").field("enabled", false).endObject()
                    .startObject("_all").field("enabled", false).endObject()
                    .startObject("_type").field("index", "no").endObject()
                    .startObject("_id").field("index", "no").endObject()
                    .startObject("properties")
                    .startObject("field").field("type", "string").field("index", "not_analyzed").field("omit_norms", true).endObject()
                    .endObject()
                    .endObject().endObject()).execute().actionGet();
            ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth("test").setWaitForGreenStatus().execute().actionGet();
            if (clusterHealthResponse.timedOut()) {
                System.err.println("--> Timed out waiting for cluster health");
            }

            StopWatch stopWatch = new StopWatch().start();
            long COUNT = SizeValue.parseSizeValue("10m").singles();
            int BATCH = 100;
            System.out.println("Indexing [" + COUNT + "] ...");
            long ITERS = COUNT / BATCH;
            long i = 1;
            char character = 'a';
            int idCounter = 0;
            for (; i <= ITERS; i++) {
                int termCounter = 0;
                BulkRequestBuilder request = client.prepareBulk();
                for (int j = 0; j < BATCH; j++) {
                    request.add(Requests.indexRequest("test").type("type1").id(Integer.toString(idCounter++)).source(source("prefix" + character + termCounter++)));
                }
                character++;
                BulkResponse response = request.execute().actionGet();
                if (response.hasFailures()) {
                    System.err.println("failures...");
                }
            }
            System.out.println("Indexing took " + stopWatch.totalTime());

            client.admin().indices().prepareRefresh().execute().actionGet();
            System.out.println("Count: " + client.prepareCount().setQuery(matchAllQuery()).execute().actionGet().count());
        } catch (Exception e) {
            // NOTE(review): any failure during indexing lands here, not only
            // "index already exists" — the message is a best guess.
            System.out.println("--> Index already exists, ignoring indexing phase, waiting for green");
            ClusterHealthResponse clusterHealthResponse = client.admin().cluster().prepareHealth().setWaitForGreenStatus().setTimeout("10m").execute().actionGet();
            if (clusterHealthResponse.timedOut()) {
                System.err.println("--> Timed out waiting for cluster health");
            }
            client.admin().indices().prepareRefresh().execute().actionGet();
            System.out.println("Count: " + client.prepareCount().setQuery(matchAllQuery()).execute().actionGet().count());
        }


        System.out.println("Warming up...");
        char startChar = 'a';
        for (int i = 0; i <= 20; i++) {
            String term = "prefix" + startChar;
            SearchResponse response = client.prepareSearch()
                    .setQuery(prefixQuery("field", term))
                    .addSuggestion(new SuggestBuilder.FuzzySuggestion("field").setField("field").setText(term).setSuggestMode("always"))
                    .execute().actionGet();
            if (response.hits().totalHits() == 0) {
                System.err.println("No hits");
                continue;
            }
            startChar++;
        }


        System.out.println("Starting benchmarking suggestions.");
        startChar = 'a';
        long timeTaken = 0;
        // FIX: was `i <= SEARCH_ITERS`, which ran 201 iterations while the
        // average below divides by 200.
        for (int i = 0; i < SEARCH_ITERS; i++) {
            String term = "prefix" + startChar;
            SearchResponse response = client.prepareSearch()
                    .setQuery(matchQuery("field", term))
                    .addSuggestion(new SuggestBuilder.FuzzySuggestion("field").setText(term).setField("field").setSuggestMode("always"))
                    .execute().actionGet();
            timeTaken += response.tookInMillis();
            if (response.suggest() == null) {
                System.err.println("No suggestions");
                continue;
            }
            List<Suggest.Suggestion.Term.SuggestedTerm> suggestedTerms = response.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested();
            if (suggestedTerms == null || suggestedTerms.isEmpty()) {
                System.err.println("No suggestions");
            }
            startChar++;
        }

        System.out.println("Avg time taken without filter " + (timeTaken / SEARCH_ITERS));

        client.close();
        for (Node node : nodes) {
            node.close();
        }
    }

    /** Builds a one-field document: {"field": nameValue}. */
    private static XContentBuilder source(String nameValue) throws IOException {
        return jsonBuilder().startObject()
                .field("field", nameValue)
                .endObject();
    }

}
package org.elasticsearch.test.integration.search.suggest;

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.test.integration.AbstractNodesTests;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

import static org.elasticsearch.index.query.QueryBuilders.matchQuery;
import static org.elasticsearch.search.suggest.SuggestBuilder.fuzzySuggestion;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.notNullValue;

/**
 * Integration tests for the fuzzy suggest feature: basic suggestions,
 * empty-index behaviour, multiple suggestions per request, and size/sort options.
 */
public class SuggestSearchTests extends AbstractNodesTests {

    private Client client;

    @BeforeClass
    public void createNodes() throws Exception {
        startNode("server1");
        startNode("server2");
        client = getClient();
    }

    @AfterClass
    public void closeNodes() {
        client.close();
        closeAllNodes();
    }

    protected Client getClient() {
        return client("server1");
    }

    @Test
    public void testSimple() throws Exception {
        try {
            client.admin().indices().prepareDelete("test").execute().actionGet();
        } catch (Exception e) {
            // ignore — index may not exist yet
        }
        client.admin().indices().prepareCreate("test").execute().actionGet();

        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("text", "abcd")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("text", "aacd")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("text", "abbd")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("text", "abcc")
                        .endObject()
                )
                .execute().actionGet();
        client.admin().indices().prepareRefresh().execute().actionGet();

        SearchResponse search = client.prepareSearch()
                .setQuery(matchQuery("text", "spellcecker"))
                .addSuggestion(
                        fuzzySuggestion("test").setSuggestMode("always") // Always, otherwise the results can vary between requests.
                                .setText("abcd")
                                .setField("text"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("test"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getTerm(), equalTo("abcd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("aacd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("abbd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("abcc"));

        // FIX: the response was previously discarded and the assertions below
        // re-checked the stale first response; assign it so they verify the
        // suggest-only (no query) request.
        search = client.prepareSearch()
                .addSuggestion(
                        fuzzySuggestion("test").setSuggestMode("always") // Always, otherwise the results can vary between requests.
                                .setText("abcd")
                                .setField("text"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("test"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("aacd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("abbd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("abcc"));
    }

    @Test
    public void testEmpty() throws Exception {
        try {
            client.admin().indices().prepareDelete("test").execute().actionGet();
        } catch (Exception e) {
            // ignore — index may not exist yet
        }
        client.admin().indices().prepareCreate("test").execute().actionGet();

        SearchResponse search = client.prepareSearch()
                .setQuery(matchQuery("text", "spellcecker"))
                .addSuggestion(
                        fuzzySuggestion("test").setSuggestMode("always") // Always, otherwise the results can vary between requests.
                                .setText("abcd")
                                .setField("text"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("test"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getTerm(), equalTo("abcd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(0));

        // FIX: assign the response so the assertions below check the
        // suggest-only request instead of the stale first response.
        search = client.prepareSearch()
                .addSuggestion(
                        fuzzySuggestion("test").setSuggestMode("always") // Always, otherwise the results can vary between requests.
                                .setText("abcd")
                                .setField("text"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("test"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(0));
    }

    @Test
    public void testWithMultipleCommands() throws Exception {
        try {
            client.admin().indices().prepareDelete("test").execute().actionGet();
        } catch (Exception e) {
            // ignore — index may not exist yet
        }
        client.admin().indices().prepareCreate("test").execute().actionGet();

        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("field1", "prefix_abcd")
                        .field("field2", "prefix_efgh")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("field1", "prefix_aacd")
                        .field("field2", "prefix_eeeh")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("field1", "prefix_abbd")
                        .field("field2", "prefix_efff")
                        .endObject()
                )
                .execute().actionGet();
        client.prepareIndex("test", "type1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                        .field("field1", "prefix_abcc")
                        .field("field2", "prefix_eggg")
                        .endObject()
                )
                .execute().actionGet();
        client.admin().indices().prepareRefresh().execute().actionGet();

        SearchResponse search = client.prepareSearch()
                .addSuggestion(fuzzySuggestion("size1")
                        .setSize(1).setText("prefix_abcd").setMaxTermFreq(10).setMinDocFreq(0)
                        .setField("field1").setSuggestMode("always"))
                .addSuggestion(fuzzySuggestion("field2")
                        .setField("field2").setText("prefix_eeeh prefix_efgh")
                        .setMaxTermFreq(10).setMinDocFreq(0).setSuggestMode("always"))
                .addSuggestion(fuzzySuggestion("accuracy")
                        .setField("field2").setText("prefix_efgh").setAccuracy(1f)
                        .setMaxTermFreq(10).setMinDocFreq(0).setSuggestMode("always"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("size1"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("prefix_aacd"));
        assertThat(search.suggest().getSuggestions().get(1).getName(), equalTo("field2"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().size(), equalTo(2));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(0).getSuggested().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(1).getSuggested().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(1).getSuggested().get(0).getTerm().string(), equalTo("prefix_eeeh"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(1).getSuggested().get(1).getTerm().string(), equalTo("prefix_efff"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(1).getSuggested().get(2).getTerm().string(), equalTo("prefix_eggg"));
        assertThat(search.suggest().getSuggestions().get(2).getName(), equalTo("accuracy"));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().get(0).getSuggested().isEmpty(), equalTo(true));
    }

    @Test
    public void testSizeAndSort() throws Exception {
        try {
            client.admin().indices().prepareDelete("test").execute().actionGet();
        } catch (Exception e) {
            // ignore — index may not exist yet
        }
        client.admin().indices().prepareCreate("test").execute().actionGet();

        Map<String, Integer> termsAndDocCount = new HashMap<String, Integer>();
        termsAndDocCount.put("prefix_aaad", 20);
        termsAndDocCount.put("prefix_abbb", 18);
        termsAndDocCount.put("prefix_aaca", 16);
        termsAndDocCount.put("prefix_abba", 14);
        termsAndDocCount.put("prefix_accc", 12);
        termsAndDocCount.put("prefix_addd", 10);
        termsAndDocCount.put("prefix_abaa", 8);
        termsAndDocCount.put("prefix_dbca", 6);
        termsAndDocCount.put("prefix_cbad", 4);

        termsAndDocCount.put("prefix_aacd", 1);
        termsAndDocCount.put("prefix_abcc", 1);
        termsAndDocCount.put("prefix_accd", 1);

        for (Map.Entry<String, Integer> entry : termsAndDocCount.entrySet()) {
            for (int i = 0; i < entry.getValue(); i++) {
                client.prepareIndex("test", "type1")
                        .setSource(XContentFactory.jsonBuilder()
                                .startObject()
                                .field("field1", entry.getKey())
                                .endObject()
                        )
                        .execute().actionGet();
            }
        }
        client.admin().indices().prepareRefresh().execute().actionGet();

        SearchResponse search = client.prepareSearch()
                .setSuggestText("prefix_abcd")
                .addSuggestion(fuzzySuggestion("size3SortScoreFirst")
                        .setSize(3).setMinDocFreq(0).setField("field1").setSuggestMode("always"))
                .addSuggestion(fuzzySuggestion("size10SortScoreFirst")
                        .setSize(10).setMinDocFreq(0).setField("field1").setSuggestMode("always"))
                .addSuggestion(fuzzySuggestion("size3SortScoreFirstMaxEdits1")
                        .setMaxEdits(1)
                        .setSize(10).setMinDocFreq(0).setField("field1").setSuggestMode("always"))
                .addSuggestion(fuzzySuggestion("size10SortFrequencyFirst")
                        .setSize(10).setSort("frequency").setShardSize(1000)
                        .setMinDocFreq(0).setField("field1").setSuggestMode("always"))
                .execute().actionGet();

        assertThat(Arrays.toString(search.shardFailures()), search.failedShards(), equalTo(0));
        assertThat(search.suggest(), notNullValue());
        assertThat(search.suggest().getSuggestions().size(), equalTo(4));
        assertThat(search.suggest().getSuggestions().get(0).getName(), equalTo("size3SortScoreFirst"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("prefix_aacd"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("prefix_abcc"));
        assertThat(search.suggest().getSuggestions().get(0).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("prefix_accd"));

        assertThat(search.suggest().getSuggestions().get(1).getName(), equalTo("size10SortScoreFirst"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(0).getSuggested().size(), equalTo(10));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("prefix_aacd"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("prefix_abcc"));
        assertThat(search.suggest().getSuggestions().get(1).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("prefix_accd"));
        // This fails sometimes. Depending on how the docs are sharded. The suggested suggest corrections get the df on shard level, which
        // isn't correct comparing it to the index level.
//        assertThat(search.suggest().suggestions().get(1).getSuggestedWords().get("prefix_abcd").get(3).getTerm(), equalTo("prefix_aaad"));

        assertThat(search.suggest().getSuggestions().get(2).getName(), equalTo("size3SortScoreFirstMaxEdits1"));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().get(0).getSuggested().size(), equalTo(3));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("prefix_aacd"));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("prefix_abcc"));
        assertThat(search.suggest().getSuggestions().get(2).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("prefix_accd"));

        assertThat(search.suggest().getSuggestions().get(3).getName(), equalTo("size10SortFrequencyFirst"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().size(), equalTo(1));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().size(), equalTo(10));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(0).getTerm().string(), equalTo("prefix_aaad"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(1).getTerm().string(), equalTo("prefix_abbb"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(2).getTerm().string(), equalTo("prefix_aaca"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(3).getTerm().string(), equalTo("prefix_abba"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(4).getTerm().string(), equalTo("prefix_accc"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(5).getTerm().string(), equalTo("prefix_addd"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(6).getTerm().string(), equalTo("prefix_abaa"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(7).getTerm().string(), equalTo("prefix_dbca"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(8).getTerm().string(), equalTo("prefix_cbad"));
        assertThat(search.suggest().getSuggestions().get(3).getTerms().get(0).getSuggested().get(9).getTerm().string(), equalTo("prefix_aacd"));
//        assertThat(search.suggest().suggestions().get(3).getSuggestedWords().get("prefix_abcd").get(4).getTerm(), equalTo("prefix_abcc"));
//        assertThat(search.suggest().suggestions().get(3).getSuggestedWords().get("prefix_abcd").get(4).getTerm(), equalTo("prefix_accd"));
    }


}