Rest Layer and Async Search Cleanup Management #9

Merged: 33 commits, Dec 23, 2020. Changes shown from 29 of the 33 commits.

Commits (33)
9c921d2 - rest actions (eirsep, Dec 14, 2020)
591d8ad - stats api. async search clean up. (eirsep, Dec 14, 2020)
28d39b7 - register transport action and rest handler for stats. (eirsep, Dec 14, 2020)
93ecef5 - wire async search stats listener to active context (eirsep, Dec 14, 2020)
959439a - status field in async search response. async search cleanup refactor (eirsep, Dec 15, 2020)
1bbf255 - state field in async search response. submit async search rest tests (eirsep, Dec 16, 2020)
9fc6a89 - submit async search rest tests. added wait for completion timeout max… (eirsep, Dec 17, 2020)
0678622 - Merge branch 'master' of github.com:opendistro-for-elasticsearch/asyn… (eirsep, Dec 17, 2020)
c830645 - validate keep alive change (eirsep, Dec 17, 2020)
7cbcbf8 - more submit api tests (eirsep, Dec 17, 2020)
97d65c4 - api param validation tests. (eirsep, Dec 17, 2020)
ceecbac - Merge branch 'rest-layer' of github.com:opendistro-for-elasticsearch/… (eirsep, Dec 17, 2020)
903b107 - Management layer changes (Bukhtawar, Dec 17, 2020)
85fed8f - async search request routing tests (eirsep, Dec 18, 2020)
6f24bf0 - async search settings tests and added more api tests (eirsep, Dec 18, 2020)
a62f6cb - Merge branch 'rest-layer' of github.com:opendistro-for-elasticsearch/… (eirsep, Dec 18, 2020)
c7b2df1 - Management layer IT (Bukhtawar, Dec 19, 2020)
d072d6d - Disabling transport clients for test (Bukhtawar, Dec 19, 2020)
fa5cd67 - fix failing tests (eirsep, Dec 19, 2020)
bc64fc3 - Management layers tests (Bukhtawar, Dec 20, 2020)
3350aa8 - Minor fixups around permits and timeouts (Bukhtawar, Dec 20, 2020)
d504524 - Update expiration strengthen tests (Bukhtawar, Dec 20, 2020)
c164993 - async search stats multi nodes tests. (eirsep, Dec 21, 2020)
b71e57f - Merge branch 'rest-layer' of github.com:opendistro-for-elasticsearch/… (eirsep, Dec 21, 2020)
362ab77 - async search throttling count stats (eirsep, Dec 21, 2020)
99aacde - Management layer changes for PR feedback (Bukhtawar, Dec 21, 2020)
cfac793 - request routing tests. coordinator node drop scenarios (eirsep, Dec 21, 2020)
30c3a66 - Merge branch 'rest-layer' of github.com:opendistro-for-elasticsearch/… (eirsep, Dec 21, 2020)
34865ca - Async search post processor tests (Bukhtawar, Dec 22, 2020)
0878892 - onContextPersistFailed stat. onRunningContextClosed hook to decremen… (eirsep, Dec 23, 2020)
8173ba0 - remove persisting to closed transition. catch IOException instead of … (eirsep, Dec 23, 2020)
150ad90 - equals method fix in async search response (eirsep, Dec 23, 2020)
f90d6be - revert removing PERSISTING->CLOSED transition. (eirsep, Dec 23, 2020)
build.gradle: 1 change (1 addition, 0 deletions)

@@ -54,6 +54,7 @@ dependencies {
exclude group: 'org.hamcrest'
}
testCompile "org.elasticsearch.plugin:reindex-client:${es_version}"
testCompile "org.elasticsearch.test:framework:${es_version}"
compileOnly "org.elasticsearch.plugin:transport-netty4-client:${es_version}"
compileOnly "org.elasticsearch:elasticsearch:${es_version}"
}
AsyncSearchStatsAction.java (new file, 36 additions)

@@ -0,0 +1,36 @@
/*
* Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License").
* You may not use this file except in compliance with the License.
* A copy of the License is located at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* or in the "license" file accompanying this file. This file is distributed
* on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*/

package com.amazon.opendistroforelasticsearch.search.async.action;

import com.amazon.opendistroforelasticsearch.search.async.response.AsyncSearchStatsResponse;
import org.elasticsearch.action.ActionType;
import org.elasticsearch.common.io.stream.Writeable;

public class AsyncSearchStatsAction extends ActionType<AsyncSearchStatsResponse> {


public static final AsyncSearchStatsAction INSTANCE = new AsyncSearchStatsAction();
public static final String NAME = "cluster:admin/async_search/stats";

private AsyncSearchStatsAction() {
super(NAME, AsyncSearchStatsResponse::new);
}

@Override
public Writeable.Reader<AsyncSearchStatsResponse> getResponseReader() {
return AsyncSearchStatsResponse::new;
}
}
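For context, a hedged usage sketch of the new action, not part of this diff: the request and response classes are wired elsewhere in the PR, so AsyncSearchStatsRequest is assumed here. A node-level client would invoke the action like any other stats action:

```java
// Sketch only: AsyncSearchStatsRequest is assumed from the rest of the PR,
// not shown in this hunk.
client.execute(AsyncSearchStatsAction.INSTANCE, new AsyncSearchStatsRequest(),
        ActionListener.wrap(
                response -> logger.info("async search stats: {}", response),
                e -> logger.error("async search stats call failed", e)));
```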
@@ -22,10 +22,7 @@
import com.amazon.opendistroforelasticsearch.search.async.response.AsyncSearchResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.common.Nullable;
- import org.elasticsearch.common.util.set.Sets;

- import java.util.Collections;
- import java.util.Set;
import java.util.function.LongSupplier;


@@ -87,13 +84,8 @@ public boolean isExpired() {
return getExpirationTimeMillis() < currentTimeSupplier.getAsLong();
}

-    public Set<AsyncSearchState> retainedStages() {
-        return Collections.unmodifiableSet(Sets.newHashSet(AsyncSearchState.INIT, AsyncSearchState.RUNNING, AsyncSearchState.SUCCEEDED,
-            AsyncSearchState.FAILED, AsyncSearchState.PERSISTING));
-    }

public AsyncSearchResponse getAsyncSearchResponse() {
-        return new AsyncSearchResponse(getAsyncSearchId(), isRunning(), getStartTimeMillis(),
+        return new AsyncSearchResponse(getAsyncSearchId(), getAsyncSearchState(), getStartTimeMillis(),
getExpirationTimeMillis(), getSearchResponse(), getSearchError());
}

@@ -161,11 +161,11 @@ public void acquireAllContextPermits(final ActionListener<Releasable> onPermitAc
}

(The following hunk is an indentation-only change; the old and new method bodies are otherwise identical.)

public boolean isAlive() {
    if (closed.get()) {
        assert getAsyncSearchState() == CLOSED : "State must be closed for async search id " + getAsyncSearchId();
        return false;
    }
    return true;
}

@Override
AsyncSearchActiveStore.java

@@ -15,7 +15,6 @@
package com.amazon.opendistroforelasticsearch.search.async.context.active;

import com.amazon.opendistroforelasticsearch.search.async.context.AsyncSearchContextId;
- import com.amazon.opendistroforelasticsearch.search.async.context.state.AsyncSearchStateMachine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.service.ClusterService;
@@ -26,6 +25,7 @@

import java.util.Map;
import java.util.Optional;
+ import java.util.function.Consumer;

import static org.elasticsearch.common.util.concurrent.ConcurrentCollections.newConcurrentMapLongWithAggressiveConcurrency;

@@ -34,26 +34,25 @@ public class AsyncSearchActiveStore {

private static Logger logger = LogManager.getLogger(AsyncSearchActiveStore.class);
private volatile int maxRunningContext;
- private final AsyncSearchStateMachine asyncSearchStateMachine;
public static final Setting<Integer> MAX_RUNNING_CONTEXT = Setting.intSetting(
"async_search.max_running_context", 100, 0, Setting.Property.Dynamic, Setting.Property.NodeScope);
"async_search.max_running_context", 100, 10, Setting.Property.Dynamic, Setting.Property.NodeScope);
Review comment (Member):
Let's revert the minimum to 0; this would help us turn the feature off.
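Because the setting is registered with Setting.Property.Dynamic, it can be changed at runtime. A minimal sketch of turning the feature off, assuming the minimum is reverted to 0 as requested above:

```java
// Sketch (assumes the setting minimum is reverted to 0): reject all new
// async search contexts by setting the cap to zero at runtime.
client.admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder()
                .put("async_search.max_running_context", 0))
        .get();
```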


private final ConcurrentMapLong<AsyncSearchActiveContext> activeContexts = newConcurrentMapLongWithAggressiveConcurrency();


-    public AsyncSearchActiveStore(ClusterService clusterService, AsyncSearchStateMachine stateMachine) {
+    public AsyncSearchActiveStore(ClusterService clusterService) {
Settings settings = clusterService.getSettings();
maxRunningContext = MAX_RUNNING_CONTEXT.get(settings);
-        this.asyncSearchStateMachine = stateMachine;
clusterService.getClusterSettings().addSettingsUpdateConsumer(MAX_RUNNING_CONTEXT, this::setMaxRunningContext);
}

private void setMaxRunningContext(int maxRunningContext) {
this.maxRunningContext = maxRunningContext;
}

-    public synchronized void putContext(AsyncSearchContextId asyncSearchContextId, AsyncSearchActiveContext asyncSearchContext) {
+    public synchronized void putContext(AsyncSearchContextId asyncSearchContextId, AsyncSearchActiveContext asyncSearchContext,
+                                        Consumer<AsyncSearchContextId> contextRejectionEventConsumer) {
if (activeContexts.size() >= maxRunningContext) {
contextRejectionEventConsumer.accept(asyncSearchContextId);

Review comment:
Why is this consumer needed? For cleanup?

Reply (Contributor, author):
The consumer is required for the async search throttled count stat at node level. This is harnessed in our stats API.

throw new AsyncSearchRejectedException("Trying to create too many running contexts. Must be less than or equal to: ["
+ maxRunningContext + "]. This limit can be set by changing the [" + MAX_RUNNING_CONTEXT.getKey() + "] setting.",
maxRunningContext);
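A plausible wiring of the new rejection consumer, sketched here for context; this call site is not part of the hunk, and the names are assumptions. It feeds the listener hook added at the end of this PR:

```java
// Sketch only: the creating service passes a consumer that forwards
// rejections to a context event listener for the throttled-count stat.
asyncSearchActiveStore.putContext(asyncSearchContextId, asyncSearchContext,
        contextId -> contextEventListener.onContextRejected(contextId));
```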
AsyncSearchContextPermits.java

@@ -22,6 +22,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.store.AlreadyClosedException;
+ import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.unit.TimeValue;
@@ -58,7 +59,7 @@ public AsyncSearchContextPermits(AsyncSearchContextId asyncSearchContextId, Thre
this.semaphore = new Semaphore(TOTAL_PERMITS, true);
}

-    private Releasable acquirePermits(int permits, TimeValue timeout, final String details) throws TimeoutException {
+    private Releasable acquirePermits(int permits, TimeValue timeout, final String details) {

Review comment:
Reason for making it a runtime exception?

Reply (Contributor, author):
We throw ElasticsearchTimeoutException here. That's serializable over the wire.

Review comment:
Got it. So IllegalStateException is no longer expected here?
RunOnce release = new RunOnce(() -> {});
if (closed) {
logger.debug("Trying to acquire permit for closed context [{}]", asyncSearchContextId);
@@ -71,17 +72,15 @@ private Releasable acquirePermits(int permits, TimeValue timeout, final String d
logger.warn("Releasing permit(s) [{}] with reason [{}]", permits, lockDetails);
semaphore.release(permits);});
if (closed) {
release.run();
logger.debug("Trying to acquire permit for closed context [{}]", asyncSearchContextId);
throw new AlreadyClosedException("trying to acquire permits on closed context ["+ asyncSearchContextId +"]");
}
return release::run;
} else {
throw new TimeoutException("obtaining context lock" + asyncSearchContextId + "timed out after " +
throw new ElasticsearchTimeoutException("obtaining context lock" + asyncSearchContextId + "timed out after " +
timeout.getMillis() + "ms, previous lock details: [" + lockDetails + "] trying to lock for [" + details + "]");
}
-        } catch (IllegalStateException e){
-            release.run();
-            throw new RuntimeException("Context already closed while trying to obtain context lock", e);
} catch (InterruptedException e ) {
Thread.currentThread().interrupt();
release.run();
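Reduced to a hedged sketch, the timed-acquire pattern in this class looks roughly like the following; it is simplified, not the PR's exact code. The point of the change is that ElasticsearchTimeoutException, unlike java.util.concurrent.TimeoutException, serializes over the transport layer:

```java
// Simplified sketch of the timed permit acquisition shown above; the
// semaphore and asyncSearchContextId fields mirror this class's fields.
private Releasable acquireSketch(int permits, TimeValue timeout, String details) {
    try {
        if (semaphore.tryAcquire(permits, timeout.getMillis(), TimeUnit.MILLISECONDS)) {
            return () -> semaphore.release(permits); // caller releases via Releasable
        }
        throw new ElasticsearchTimeoutException("obtaining context lock [" + asyncSearchContextId
                + "] timed out after " + timeout.getMillis() + "ms for [" + details + "]");
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt(); // restore the interrupt flag
        throw new RuntimeException("interrupted while acquiring permits", e);
    }
}
```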
AsyncSearchPersistenceService.java

@@ -24,16 +24,17 @@
import org.elasticsearch.ResourceNotFoundException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.DocWriteResponse;
import org.elasticsearch.action.NoShardAvailableActionException;
import org.elasticsearch.action.bulk.BackoffPolicy;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.get.GetRequest;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.update.UpdateRequest;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.NotSerializableExceptionWrapper;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
@@ -67,20 +68,19 @@ public class AsyncSearchPersistenceService {
public static final String ERROR = "error";

private static final Logger logger = LogManager.getLogger(AsyncSearchPersistenceService.class);
public static final String ASYNC_SEARCH_RESPONSE_INDEX = ".asynchronous_search_response";
public static final String ASYNC_SEARCH_RESPONSE_INDEX = ".opendistro_asynchronous_search_response";
private static final String MAPPING_TYPE = "_doc";
/**
-     * The backoff policy to use when saving a task result fails. The total wait
+     * The backoff policy to use when saving an async search response fails. The total wait
* time is 600000 milliseconds, ten minutes.
*/
-    private static final BackoffPolicy STORE_BACKOFF_POLICY =
-            BackoffPolicy.exponentialBackoff(timeValueMillis(50), 5);
+    public static final BackoffPolicy STORE_BACKOFF_POLICY =
+            BackoffPolicy.exponentialBackoff(timeValueMillis(250), 14);
Review comment on lines +77 to +78:
Is the reason to increase the delay and retries based on some test run data?

Reply (eirsep, Dec 22, 2020):
This is the backoff policy to use when saving a search response fails. The total wait time intended is 600000 milliseconds, i.e. ten minutes. We had this value earlier, but it got changed inadvertently; reverted it back to what it was, i.e. BackoffPolicy.exponentialBackoff(timeValueMillis(250), 14).

Review comment:
Yeah, I meant why ten minutes. Do we think it is just a good default to start with, or does it have backing from code/tests?

Review comment:
Is there a reason for not adding jitter?
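On the jitter question: the BackoffPolicy.exponentialBackoff schedule used above does not jitter its delays. A minimal sketch of what full jitter could look like, purely illustrative and not part of this PR; the 250 ms base and 14 attempts mirror STORE_BACKOFF_POLICY, while the capped-doubling formula is an assumption, not Elasticsearch's internal schedule:

```java
import java.util.concurrent.ThreadLocalRandom;

// Illustrative full-jitter backoff; not Elasticsearch's implementation.
// Base 250 ms and 14 attempts mirror STORE_BACKOFF_POLICY; the cap is assumed.
static void retryWithFullJitter(Runnable attemptFn) throws InterruptedException {
    long base = 250;    // ms
    long cap = 60_000;  // ms, assumed per-attempt ceiling
    for (int attempt = 0; attempt < 14; attempt++) {
        try {
            attemptFn.run();
            return; // success, stop retrying
        } catch (RuntimeException e) {
            long ceiling = Math.min(cap, base << attempt);                       // capped doubling
            Thread.sleep(ThreadLocalRandom.current().nextLong(ceiling + 1));     // full jitter
        }
    }
}
```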


private final Client client;
private final ClusterService clusterService;
private final ThreadPool threadPool;

@Inject
public AsyncSearchPersistenceService(Client client, ClusterService clusterService, ThreadPool threadPool) {
this.client = client;
this.clusterService = clusterService;
@@ -291,7 +291,9 @@ public void onResponse(IndexResponse indexResponse) {

@Override
public void onFailure(Exception e) {
-    if (!(e instanceof EsRejectedExecutionException) || !backoff.hasNext()) {
+    if (((e instanceof EsRejectedExecutionException || e instanceof ClusterBlockException
+            || e instanceof NoShardAvailableActionException) == false) || backoff.hasNext() == false) {
Review comment on lines +294 to +295:
Is this the complete list of retriable exceptions? For instance, why ClusterBlockException here but not CircuitBreakingException? Could we not use ElasticsearchException directly, to save on the type checks and cover all runtime exceptions?

Reply (Member):
Good point; we don't have an exhaustive list of retryable exceptions. If we run into CircuitBreakingException, it's ideal that we back out and clean up the response. We will revisit this later.

logger.warn(() -> new ParameterizedMessage("failed to store async search response, not retrying"), e);
listener.onFailure(e);
} else {
TimeValue wait = backoff.next();
@@ -304,7 +306,7 @@ public void onFailure(Exception e) {

private Settings indexSettings() {
return Settings.builder()
-            .put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1)
+            .put(IndexMetadata.INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 5)
.put(IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING.getKey(), "0-1")
.put(IndexMetadata.SETTING_PRIORITY, Integer.MAX_VALUE)
.build();
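A readability-only restatement of the new retry condition above, a sketch rather than how the PR writes it:

```java
// Sketch: the retry classification implied by the condition above.
// Retry only transient failures while backoff attempts remain.
private static boolean isRetryable(Exception e) {
    return e instanceof EsRejectedExecutionException
            || e instanceof ClusterBlockException
            || e instanceof NoShardAvailableActionException;
}

// in onFailure: fail fast unless the error is retryable and attempts remain
// if (isRetryable(e) == false || backoff.hasNext() == false) { listener.onFailure(e); }
```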
@@ -64,4 +64,11 @@ default void onContextClosed(AsyncSearchContextId contextId) {
default void onContextRunning(AsyncSearchContextId contextId) {

}

/**
 * Executed when async search context creation is rejected.
 *
 * @param contextId the id of the rejected context
 */
default void onContextRejected(AsyncSearchContextId contextId) {

}
}
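To tie the new hook back to the throttling stat discussed earlier, a hypothetical listener implementation; the interface and class names here are assumptions, while CounterMetric is the usual Elasticsearch counter:

```java
// Hypothetical sketch: a listener that backs the node-level throttled count
// surfaced by the stats API. Interface name assumed from this diff's hooks.
public class ThrottlingStatsListener implements AsyncSearchContextEventListener {

    private final CounterMetric throttledCount = new CounterMetric();

    @Override
    public void onContextRejected(AsyncSearchContextId contextId) {
        throttledCount.inc(); // one rejection == one throttled submission
    }

    public long getThrottledCount() {
        return throttledCount.count();
    }
}
```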