From 2bc624806d3cb1ea2bdf2318c2730d594a4ce5bf Mon Sep 17 00:00:00 2001 From: Shay Banon Date: Thu, 28 Feb 2013 16:02:38 +0100 Subject: [PATCH 01/97] not bytes... --- .../fieldcomparator/DoubleValuesComparatorBase.java | 6 +++--- .../fielddata/fieldcomparator/LongValuesComparatorBase.java | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/DoubleValuesComparatorBase.java b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/DoubleValuesComparatorBase.java index 46843594e79d0..3fb38f03caf65 100644 --- a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/DoubleValuesComparatorBase.java +++ b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/DoubleValuesComparatorBase.java @@ -56,7 +56,7 @@ public final int compareDocToValue(int doc, T valueObj) throws IOException { public final FieldComparator setNextReader(AtomicReaderContext context) throws IOException { readerValues = indexFieldData.load(context).getDoubleValues(); if (readerValues.isMultiValued()) { - readerValues = new MultiValuedBytesWrapper(readerValues, sortMode); + readerValues = new MultiValueWrapper(readerValues, sortMode); } return this; } @@ -71,11 +71,11 @@ static final int compare(double left, double right) { } } - static final class MultiValuedBytesWrapper extends DoubleValues.FilteredDoubleValues { + static final class MultiValueWrapper extends DoubleValues.FilteredDoubleValues { private final SortMode sortMode; - public MultiValuedBytesWrapper(DoubleValues delegate, SortMode sortMode) { + public MultiValueWrapper(DoubleValues delegate, SortMode sortMode) { super(delegate); this.sortMode = sortMode; } diff --git a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/LongValuesComparatorBase.java b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/LongValuesComparatorBase.java index e9da18ef4a50d..257f5d6ec2e60 100644 --- 
a/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/LongValuesComparatorBase.java +++ b/src/main/java/org/elasticsearch/index/fielddata/fieldcomparator/LongValuesComparatorBase.java @@ -67,16 +67,16 @@ static final int compare(long left, long right) { public final FieldComparator setNextReader(AtomicReaderContext context) throws IOException { readerValues = indexFieldData.load(context).getLongValues(); if (readerValues.isMultiValued()) { - readerValues = new MultiValuedBytesWrapper(readerValues, sortMode); + readerValues = new MultiValueWrapper(readerValues, sortMode); } return this; } - private static final class MultiValuedBytesWrapper extends LongValues.FilteredLongValues { + private static final class MultiValueWrapper extends LongValues.FilteredLongValues { private final SortMode sortMode; - public MultiValuedBytesWrapper(LongValues delegate, SortMode sortMode) { + public MultiValueWrapper(LongValues delegate, SortMode sortMode) { super(delegate); this.sortMode = sortMode; } From d4ec03ed76673e14c5e5566a78fc3785b8a73f1e Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 11 Feb 2013 18:19:50 +0100 Subject: [PATCH 02/97] # Phrase Suggester The `term` suggester provides a very convenient API to access word alternatives on token basis within a certain string distance. The API allows accessing each token in the stream individually while suggest-selection is left to the API consumer. Yet, often already ranked / selected suggestions are required in order to present to the end-user. Inside ElasticSearch we have the ability to access way more statistics and information quickly to make better decision which token alternative to pick or if to pick an alternative at all. This `phrase` suggester adds some logic on top of the `term` suggester to select entire corrected phrases instead of individual tokens weighted based on a *ngram-langugage models*. 
In practice it will be able to make better decisions about which tokens to pick based on co-occurrence and frequencies. The current implementation is kept quite general and leaves room for future improvements. # API Example The `phrase` request is defined alongside the query part in the json request: ```json curl -s -XPOST 'localhost:9200/_search' -d { "suggest" : { "text" : "Xor the Got-Jewel", "simple_phrase" : { "phrase" : { "analyzer" : "body", "field" : "bigram", "size" : 1, "real_word_error_likelihood" : 0.95, "max_errors" : 0.5, "gram_size" : 2, "direct_generator" : [ { "field" : "body", "suggest_mode" : "always", "min_word_len" : 1 } ] } } } } ``` The response contains suggestions sorted by the most likely spell correction first. In this case we got the expected correction `xorr the god jewel` first while the second correction is less conservative where only one of the errors is corrected. Note, the request is executed with `max_errors` set to `0.5` so 50% of the terms can contain misspellings (See parameter descriptions below). ```json { "took" : 37, "timed_out" : false, "_shards" : { "total" : 5, "successful" : 5, "failed" : 0 }, "hits" : { "total" : 2938, "max_score" : 0.0, "hits" : [ ] }, "suggest" : { "simple_phrase" : [ { "text" : "Xor the Got-Jewel", "offset" : 0, "length" : 17, "options" : [ { "text" : "xorr the god jewel", "score" : 0.17877324 }, { "text" : "xor the god jewel", "score" : 0.14231323 } ] } ] } } ``` # Phrase suggest API ## Basic parameters * `field` - the name of the field used to do n-gram lookups for the language model, the suggester will use this field to gain statistics to score corrections. * `gram_size` - sets max size of the n-grams (shingles) in the `field`. If the field doesn't contain n-grams (shingles) this should be omitted or set to `1`. * `real_word_error_likelihood` - the likelihood of a term being misspelled even if the term exists in the dictionary. 
The default is `0.95`, corresponding to 5% of the real words being misspelled. * `confidence` - The confidence level defines a factor applied to the input phrase's score which is used as a threshold for other suggest candidates. Only candidates that score higher than the threshold will be included in the result. For instance a confidence level of `1.0` will only return suggestions that score higher than the input phrase. If set to `0.0` the top N candidates are returned. The default is `1.0`. * `max_errors` - the maximum percentage of the terms that are at most considered to be misspellings in order to form a correction. This method accepts a float value in the range `[0..1)` as a fraction of the actual query terms or a number `>=1` as an absolute number of query terms. The default is set to `1.0` which means that only corrections with at most 1 misspelled term are returned. * `separator` - the separator that is used to separate terms in the bigram field. If not set the whitespace character is used as a separator. * `size` - the number of candidates that are generated for each individual query term. Low numbers like `3` or `5` typically produce good results. Raising this can bring up terms with higher edit distances. The default is `5`. * `analyzer` - Sets the analyzer to analyse the suggest text with. Defaults to the search analyzer of the suggest field passed via `field`. * `shard_size` - Sets the maximum number of suggested terms to be retrieved from each individual shard. During the reduce phase only the top N suggestions are returned based on the `size` option. Defaults to `5`. * `text` - Sets the text / query to provide suggestions for. ## Smoothing Models The `phrase` suggester supports multiple smoothing models to balance weight between infrequent grams (grams (shingles) that do not exist in the index) and frequent grams (grams that appear at least once in the index). 
* `laplace` - the default model that uses an additive smoothing model where a constant (typically `1.0` or smaller) is added to all counts to balance weights. The default `alpha` is `0.5`. * `stupid_backoff` - a simple backoff model that backs off to lower order n-gram models if the higher order count is `0` and discounts the lower order n-gram model by a constant factor. The default `discount` is `0.4`. * `linear_interpolation` - a smoothing model that takes the weighted mean of the unigrams, bigrams and trigrams based on user supplied weights (lambdas). Linear Interpolation doesn't have any default values. All parameters (`trigram_lambda`, `bigram_lambda`, `unigram_lambda`) must be supplied. ## Candidate Generators The `phrase` suggester uses candidate generators to produce a list of possible terms per term in the given text. A single candidate generator is similar to a `term` suggester called for each individual term in the text. The output of the generators is subsequently scored in combination with the candidates from the other terms to form suggestion candidates. Currently only one type of candidate generator is supported, the `direct_generator`. The Phrase suggest API accepts a list of generators under the key `direct_generator`; each of the generators in the list is called per term in the original text. ## Direct Generators The direct generators support the following parameters: * `field` - The field to fetch the candidate suggestions from. This is a required option that either needs to be set globally or per suggestion. * `analyzer` - The analyzer to analyse the suggest text with. Defaults to the search analyzer of the suggest field. * `size` - The maximum corrections to be returned per suggest text token. * `suggest_mode` - The suggest mode controls what suggestions are included or controls for what suggest text terms, suggestions should be suggested. 
Three possible values can be specified: * `missing` - Only suggest terms in the suggest text that aren't in the index. This is the default. * `popular` - Only suggest suggestions that occur in more docs than the original suggest text term. * `always` - Suggest any matching suggestions based on terms in the suggest text. * `max_edits` - The maximum edit distance candidate suggestions can have in order to be considered as a suggestion. Can only be a value between 1 and 2. Any other value results in a bad request error being thrown. Defaults to 2. * `min_prefix` - The number of minimal prefix characters that must match in order to be a candidate suggestion. Defaults to 1. Increasing this number improves spellcheck performance. Usually misspellings don't occur in the beginning of terms. * `min_query_length` - The minimum length a suggest text term must have in order to be included. Defaults to 4. * `max_inspections` - A factor that is used to multiply with the `shard_size` in order to inspect more candidate spell corrections on the shard level. Can improve accuracy at the cost of performance. Defaults to 5. * `threshold_frequency` - The minimal threshold in number of documents a suggestion should appear in. This can be specified as an absolute number or as a relative percentage of number of documents. This can improve quality by only suggesting high frequency terms. Defaults to 0f and is not enabled. If a value higher than 1 is specified then the number cannot be fractional. The shard level document frequencies are used for this option. * `max_query_frequency` - The maximum threshold in number of documents a suggest text token can exist in order to be included. Can be a relative percentage number (e.g. 0.4) or an absolute number to represent document frequencies. If a value higher than 1 is specified then the number cannot be fractional. Defaults to 0.01f. This can be used to exclude high frequency terms from being spellchecked. 
High frequency terms are usually spelled correctly; on top of this, it also improves the spellcheck performance. The shard level document frequencies are used for this option. * `pre_filter` - a filter (analyzer) that is applied to each of the tokens passed to this candidate generator. This filter is applied to the original token before candidates are generated. (optional) * `post_filter` - a filter (analyzer) that is applied to each of the generated tokens before they are passed to the actual phrase scorer. (optional) The following example shows a `phrase` suggest call with two generators, the first one is using a field containing ordinary indexed terms and the second one uses a field that uses terms indexed with a `reverse` filter (tokens are indexed in reverse order). This is used to overcome the limitation of the direct generators to require a constant prefix to provide high-performance suggestions. The `pre_filter` and `post_filter` options accept ordinary analyzer names. ```json curl -s -XPOST 'localhost:9200/_search' -d { "suggest" : { "text" : "Xor the Got-Jewel", "simple_phrase" : { "phrase" : { "analyzer" : "body", "field" : "bigram", "size" : 4, "real_word_error_likelihood" : 0.95, "confidence" : 2.0, "gram_size" : 2, "direct_generator" : [ { "field" : "body", "suggest_mode" : "always", "min_word_len" : 1 }, { "field" : "reverse", "suggest_mode" : "always", "min_word_len" : 1, "pre_filter" : "reverse", "post_filter" : "reverse" } ] } } } } ``` `pre_filter` and `post_filter` can also be used to inject synonyms after candidates are generated. For instance for the query `captain usq` we might generate a candidate `usa` for term `usq` which is a synonym for `america` which allows us to present `captain america` to the user if this phrase scores high enough. 
Closes #2709 --- .../action/search/SearchRequestBuilder.java | 4 +- .../analysis/ShingleTokenFilterFactory.java | 18 +- .../index/mapper/MapperService.java | 9 + .../rest/action/search/RestSearchAction.java | 6 +- .../action/SearchServiceTransportAction.java | 2 +- .../controller/SearchPhaseController.java | 37 +- .../suggest/DirectSpellcheckerSettings.java | 119 +++ .../elasticsearch/search/suggest/Suggest.java | 322 ++++---- .../search/suggest/SuggestBuilder.java | 314 ++------ .../search/suggest/SuggestContextParser.java | 29 + .../search/suggest/SuggestParseElement.java | 191 +---- .../search/suggest/SuggestPhase.java | 181 +---- .../search/suggest/SuggestUtils.java | 293 +++++++ .../search/suggest/Suggester.java | 31 + .../suggest/SuggestionSearchContext.java | 155 +--- .../suggest/phrase/CandidateGenerator.java | 46 ++ .../suggest/phrase/CandidateScorer.java | 114 +++ .../search/suggest/phrase/Correction.java | 57 ++ .../phrase/DirectCandidateGenerator.java | 242 ++++++ .../search/suggest/phrase/LaplaceScorer.java | 65 ++ .../phrase/LinearInterpoatingScorer.java | 64 ++ .../MultiCandidateGeneratorWrapper.java | 77 ++ .../phrase/NoisyChannelSpellChecker.java | 138 ++++ .../suggest/phrase/PhraseSuggestParser.java | 285 +++++++ .../suggest/phrase/PhraseSuggester.java | 86 +++ .../phrase/PhraseSuggestionBuilder.java | 578 ++++++++++++++ .../phrase/PhraseSuggestionContext.java | 157 ++++ .../suggest/phrase/StupidBackoffScorer.java | 67 ++ .../search/suggest/phrase/WordScorer.java | 98 +++ .../suggest/term/TermSuggestParser.java | 60 ++ .../search/suggest/term/TermSuggester.java | 95 +++ .../search/suggest/term/TermSuggestion.java | 201 +++++ .../suggest/term/TermSuggestionBuilder.java | 224 ++++++ .../suggest/term/TermSuggestionContext.java | 37 + .../search/SuggestSearchBenchMark.java | 8 +- .../search/suggest/SuggestSearchTests.java | 718 +++++++++++++++--- .../phrase/NoisyChannelSpellCheckerTests.java | 381 ++++++++++ 37 files changed, 4532 insertions(+), 977 
deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestContextParser.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/SuggestUtils.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/Suggester.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/CandidateGenerator.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/CandidateScorer.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/Correction.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/DirectCandidateGenerator.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/LaplaceScorer.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/LinearInterpoatingScorer.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/MultiCandidateGeneratorWrapper.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/NoisyChannelSpellChecker.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestParser.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggester.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionBuilder.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/PhraseSuggestionContext.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/StupidBackoffScorer.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/phrase/WordScorer.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/term/TermSuggestParser.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/term/TermSuggester.java create mode 100644 
src/main/java/org/elasticsearch/search/suggest/term/TermSuggestion.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionBuilder.java create mode 100644 src/main/java/org/elasticsearch/search/suggest/term/TermSuggestionContext.java create mode 100644 src/test/java/org/elasticsearch/test/unit/search/suggest/phrase/NoisyChannelSpellCheckerTests.java diff --git a/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java b/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java index 7f740b1dad691..40c49b2698489 100644 --- a/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java +++ b/src/main/java/org/elasticsearch/action/search/SearchRequestBuilder.java @@ -657,9 +657,9 @@ public SearchRequestBuilder setSuggestText(String globalText) { } /** - * Delegates to {@link org.elasticsearch.search.suggest.SuggestBuilder#addSuggestion(org.elasticsearch.search.suggest.SuggestBuilder.Suggestion)}. + * Delegates to {@link org.elasticsearch.search.suggest.SuggestBuilder#addSuggestion(org.elasticsearch.search.suggest.SuggestBuilder.SuggestionBuilder)}. 
*/ - public SearchRequestBuilder addSuggestion(SuggestBuilder.Suggestion suggestion) { + public SearchRequestBuilder addSuggestion(SuggestBuilder.SuggestionBuilder suggestion) { suggestBuilder().addSuggestion(suggestion); return this; } diff --git a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java index 3a61bf2dc6c15..66e6564b0c1c0 100644 --- a/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java +++ b/src/main/java/org/elasticsearch/index/analysis/ShingleTokenFilterFactory.java @@ -36,7 +36,7 @@ public class ShingleTokenFilterFactory extends AbstractTokenFilterFactory { private final boolean outputUnigrams; - private Boolean outputUnigramsIfNoShingles; + private final boolean outputUnigramsIfNoShingles; private String tokenSeparator; @@ -60,4 +60,20 @@ public TokenStream create(TokenStream tokenStream) { filter.setTokenSeparator(tokenSeparator); return filter; } + + public int getMaxShingleSize() { + return maxShingleSize; + } + + public int getMinShingleSize() { + return minShingleSize; + } + + public boolean getOutputUnigrams() { + return outputUnigrams; + } + + public boolean getOutputUnigramsIfNoShingles() { + return outputUnigramsIfNoShingles; + } } diff --git a/src/main/java/org/elasticsearch/index/mapper/MapperService.java b/src/main/java/org/elasticsearch/index/mapper/MapperService.java index ea188d9c12b2c..ff99f708e3cc8 100644 --- a/src/main/java/org/elasticsearch/index/mapper/MapperService.java +++ b/src/main/java/org/elasticsearch/index/mapper/MapperService.java @@ -762,6 +762,15 @@ public Analyzer searchAnalyzer() { public Analyzer searchQuoteAnalyzer() { return this.searchQuoteAnalyzer; } + + public Analyzer fieldSearchAnalyzer(String field) { + return this.searchAnalyzer.getWrappedAnalyzer(field); + } + + public Analyzer fieldSearchQuoteAnalyzer(String field) { + return 
this.searchQuoteAnalyzer.getWrappedAnalyzer(field); + } + /** * Resolves the closest inherited {@link ObjectMapper} that is nested. diff --git a/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java b/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java index b636e6eedb986..96aa70c668a26 100644 --- a/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java +++ b/src/main/java/org/elasticsearch/rest/action/search/RestSearchAction.java @@ -45,7 +45,7 @@ import static org.elasticsearch.rest.RestRequest.Method.POST; import static org.elasticsearch.rest.RestStatus.BAD_REQUEST; import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder; -import static org.elasticsearch.search.suggest.SuggestBuilder.fuzzySuggestion; +import static org.elasticsearch.search.suggest.SuggestBuilder.termSuggestion; /** * @@ -286,8 +286,8 @@ private SearchSourceBuilder parseSearchSource(RestRequest request) { } String suggestMode = request.param("suggest_mode"); searchSourceBuilder.suggest().addSuggestion( - fuzzySuggestion(suggestField).setField(suggestField).setText(suggestText).setSize(suggestSize) - .setSuggestMode(suggestMode) + termSuggestion(suggestField).field(suggestField).text(suggestText).size(suggestSize) + .suggestMode(suggestMode) ); } diff --git a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java index c118893ec5ad0..f3720ec541310 100644 --- a/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java +++ b/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java @@ -140,7 +140,7 @@ public void sendExecuteQuery(DiscoveryNode node, final ShardSearchRequest reques try { QuerySearchResult result = searchService.executeQueryPhase(request); listener.onResult(result); - } catch (Exception e) { + } catch (Throwable e) { listener.onFailure(e); } } 
else { diff --git a/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java b/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java index 55db09af5e54e..43c91f7f0e688 100644 --- a/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java +++ b/src/main/java/org/elasticsearch/search/controller/SearchPhaseController.java @@ -49,9 +49,13 @@ import org.elasticsearch.search.query.QuerySearchResult; import org.elasticsearch.search.query.QuerySearchResultProvider; import org.elasticsearch.search.suggest.Suggest; +import org.elasticsearch.search.suggest.Suggest.Suggestion; +import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; +import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -376,32 +380,31 @@ public InternalSearchResponse merge(ShardDoc[] sortedDocs, Map mergedSuggestions = null; + Map> groupedSuggestions = new HashMap>(); for (QuerySearchResultProvider resultProvider : queryResults.values()) { Suggest shardResult = resultProvider.queryResult().suggest(); if (shardResult == null) { continue; } - - if (mergedSuggestions == null) { - mergedSuggestions = shardResult.getSuggestions(); - continue; - } - - for (Suggest.Suggestion shardCommand : shardResult.getSuggestions()) { - for (Suggest.Suggestion mergedSuggestion : mergedSuggestions) { - if (mergedSuggestion.getName().equals(shardCommand.getName())) { - mergedSuggestion.reduce(shardCommand); - } + for (Suggestion> suggestion : shardResult) { + List list = groupedSuggestions.get(suggestion.getName()); + if (list == null) { + list = new ArrayList(); + groupedSuggestions.put(suggestion.getName(), list); } + list.add(suggestion); } + } - if (mergedSuggestions != null) { - suggest = new Suggest(mergedSuggestions); - for (Suggest.Suggestion suggestion : mergedSuggestions) { - suggestion.trim(); - } + 
List>> reduced = new ArrayList>>(); + for (java.util.Map.Entry> unmergedResults : groupedSuggestions.entrySet()) { + List value = unmergedResults.getValue(); + Suggestion reduce = value.get(0).reduce(value); + reduce.trim(); + reduced.add(reduce); + } + suggest = new Suggest(reduced); } InternalSearchHits searchHits = new InternalSearchHits(hits.toArray(new InternalSearchHit[hits.size()]), totalHits, maxScore); diff --git a/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java b/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java new file mode 100644 index 0000000000000..2d95116354050 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/suggest/DirectSpellcheckerSettings.java @@ -0,0 +1,119 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.elasticsearch.search.suggest; + +import org.apache.lucene.search.spell.DirectSpellChecker; +import org.apache.lucene.search.spell.StringDistance; +import org.apache.lucene.search.spell.SuggestMode; +import org.apache.lucene.util.automaton.LevenshteinAutomata; + +public class DirectSpellcheckerSettings { + + private SuggestMode suggestMode = SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX; + private float accuracy = 0.5f; + private Suggest.Suggestion.Sort sort = Suggest.Suggestion.Sort.SCORE; + private StringDistance stringDistance = DirectSpellChecker.INTERNAL_LEVENSHTEIN; + private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE; + private int maxInspections = 5; + private float maxTermFreq = 0.01f; + private int prefixLength = 1; + private int minWordLength = 4; + private float minDocFreq = 0f; + + public SuggestMode suggestMode() { + return suggestMode; + } + + public void suggestMode(SuggestMode suggestMode) { + this.suggestMode = suggestMode; + } + + public float accuracy() { + return accuracy; + } + + public void accuracy(float accuracy) { + this.accuracy = accuracy; + } + + public Suggest.Suggestion.Sort sort() { + return sort; + } + + public void sort(Suggest.Suggestion.Sort sort) { + this.sort = sort; + } + + public StringDistance stringDistance() { + return stringDistance; + } + + public void stringDistance(StringDistance distance) { + this.stringDistance = distance; + } + + public int maxEdits() { + return maxEdits; + } + + public void maxEdits(int maxEdits) { + this.maxEdits = maxEdits; + } + + public int maxInspections() { + return maxInspections; + } + + public void maxInspections(int maxInspections) { + this.maxInspections = maxInspections; + } + + public float maxTermFreq() { + return maxTermFreq; + } + + public void maxTermFreq(float maxTermFreq) { + this.maxTermFreq = maxTermFreq; + } + + public int prefixLength() { + return prefixLength; + } + + public void prefixLength(int prefixLength) { + this.prefixLength = prefixLength; 
+ } + + public int minWordLength() { + return minWordLength; + } + + public void minQueryLength(int minQueryLength) { + this.minWordLength = minQueryLength; + } + + public float minDocFreq() { + return minDocFreq; + } + + public void minDocFreq(float minDocFreq) { + this.minDocFreq = minDocFreq; + } + +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/search/suggest/Suggest.java b/src/main/java/org/elasticsearch/search/suggest/Suggest.java index 08e37e77f98e8..72160a15dd1b6 100644 --- a/src/main/java/org/elasticsearch/search/suggest/Suggest.java +++ b/src/main/java/org/elasticsearch/search/suggest/Suggest.java @@ -16,9 +16,17 @@ * specific language governing permissions and limitations * under the License. */ - package org.elasticsearch.search.suggest; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + import org.elasticsearch.ElasticSearchException; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -27,48 +35,83 @@ import org.elasticsearch.common.xcontent.ToXContent; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilderString; - -import java.io.IOException; -import java.util.*; +import org.elasticsearch.search.suggest.Suggest.Suggestion; +import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry; +import org.elasticsearch.search.suggest.Suggest.Suggestion.Entry.Option; +import org.elasticsearch.search.suggest.term.TermSuggestion; /** * Top level suggest result, containing the result for each suggestion. 
*/ -public class Suggest implements Iterable, Streamable, ToXContent { +public class Suggest implements Iterable>>, Streamable, ToXContent { static class Fields { - static final XContentBuilderString SUGGEST = new XContentBuilderString("suggest"); - } - private List suggestions; + private static final Comparator