Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

instrument retrying channels scheduled executor service #485

Merged
merged 3 commits into from
Mar 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelog/@unreleased/pr-485.v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
type: improvement
improvement:
description: Produce metrics about requests that are going to be retried
links:
- https://github.com/palantir/dialogue/pull/485
1 change: 1 addition & 0 deletions dialogue-core/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ dependencies {
compile 'com.palantir.tracing:tracing'
compile 'io.dropwizard.metrics:metrics-core'
compile 'com.palantir.safethreadlocalrandom:safe-thread-local-random'
implementation 'com.palantir.tritium:tritium-metrics'

testImplementation 'com.palantir.tracing:tracing-test-utils'
testImplementation 'com.palantir.safe-logging:preconditions-assertj'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import com.github.benmanes.caffeine.cache.Ticker;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.ListeningScheduledExecutorService;
import com.google.errorprone.annotations.CheckReturnValue;
import com.palantir.conjure.java.client.config.ClientConfiguration;
import com.palantir.dialogue.Channel;
Expand All @@ -32,6 +31,7 @@
import java.util.Collection;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ScheduledExecutorService;
import java.util.function.Supplier;
import javax.annotation.Nullable;

Expand All @@ -53,7 +53,7 @@ public static final class Builder {
private final List<Channel> channels = new ArrayList<>();
private Ticker clock = Ticker.systemTicker();
private Random random = SafeThreadLocalRandom.get();
private Supplier<ListeningScheduledExecutorService> scheduler = RetryingChannel.sharedScheduler;
private Supplier<ScheduledExecutorService> scheduler = RetryingChannel.sharedScheduler;

@Nullable
private ClientConfiguration config;
Expand Down Expand Up @@ -82,7 +82,7 @@ Builder random(Random value) {
}

@VisibleForTesting
Builder scheduler(ListeningScheduledExecutorService value) {
Builder scheduler(ScheduledExecutorService value) {
this.scheduler = () -> value;
return this;
}
Expand Down Expand Up @@ -132,16 +132,14 @@ private static void preconditions(Collection<? extends Channel> channels, Client
}

private static Channel retryingChannel(
ClientConfiguration conf,
Channel channel,
Supplier<ListeningScheduledExecutorService> scheduler,
Random random) {
ClientConfiguration conf, Channel channel, Supplier<ScheduledExecutorService> scheduler, Random random) {
if (conf.maxNumRetries() == 0) {
return channel;
}

return new RetryingChannel(
channel,
conf.taggedMetricRegistry(),
conf.maxNumRetries(),
conf.backoffSlotSize(),
conf.serverQoS(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.palantir.dialogue.core;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Suppliers;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
Expand All @@ -33,9 +34,13 @@
import com.palantir.logsafe.exceptions.SafeRuntimeException;
import com.palantir.tracing.DetachedSpan;
import com.palantir.tracing.Tracers;
import com.palantir.tritium.metrics.MetricRegistries;
import com.palantir.tritium.metrics.registry.DefaultTaggedMetricRegistry;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import java.net.SocketTimeoutException;
import java.time.Duration;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.DoubleSupplier;
Expand All @@ -49,20 +54,19 @@
final class RetryingChannel implements Channel {

private static final Logger log = LoggerFactory.getLogger(RetryingChannel.class);
private static final String SCHEDULER_NAME = "dialogue-RetryingChannel-scheduler";

/*
* Shared single thread executor is reused between all retrying channels. If it becomes oversaturated
* we may wait longer than expected before resuming requests, but this is an
* edge case where services are already operating in a degraded state and we should not
* spam servers.
*/
static final Supplier<ListeningScheduledExecutorService> sharedScheduler =
Suppliers.memoize(() -> MoreExecutors.listeningDecorator(Tracers.wrap(
"dialogue-RetryingChannel-scheduler",
Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
.setNameFormat("dialogue-RetryingChannel-scheduler-%d")
.setDaemon(false)
.build()))));
static final Supplier<ScheduledExecutorService> sharedScheduler =
Suppliers.memoize(() -> Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
.setNameFormat(SCHEDULER_NAME + "-%d")
.setDaemon(false)
.build()));

private final ListeningScheduledExecutorService scheduler;
private final Channel delegate;
Expand All @@ -72,6 +76,7 @@ final class RetryingChannel implements Channel {
private final Duration backoffSlotSize;
private final DoubleSupplier jitter;

@VisibleForTesting
RetryingChannel(
Channel delegate,
int maxRetries,
Expand All @@ -80,6 +85,7 @@ final class RetryingChannel implements Channel {
ClientConfiguration.RetryOnTimeout retryOnTimeout) {
this(
delegate,
new DefaultTaggedMetricRegistry(),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we need to add this to the constructor and pass the registry from our ClientConfiguration

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes we do do that, this constructor is only visible for testing. Let me make that more clear

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh doh, I misread, sorry!

maxRetries,
backoffSlotSize,
serverQoS,
Expand All @@ -90,18 +96,19 @@ final class RetryingChannel implements Channel {

RetryingChannel(
Channel delegate,
TaggedMetricRegistry metrics,
int maxRetries,
Duration backoffSlotSize,
ClientConfiguration.ServerQoS serverQoS,
ClientConfiguration.RetryOnTimeout retryOnTimeout,
ListeningScheduledExecutorService scheduler,
ScheduledExecutorService scheduler,
DoubleSupplier jitter) {
this.delegate = delegate;
this.maxRetries = maxRetries;
this.backoffSlotSize = backoffSlotSize;
this.serverQoS = serverQoS;
this.retryOnTimeout = retryOnTimeout;
this.scheduler = scheduler;
this.scheduler = instrument(scheduler, metrics);
this.jitter = jitter;
}

Expand Down Expand Up @@ -245,4 +252,10 @@ private static boolean shouldPropagateQos(ClientConfiguration.ServerQoS serverQo
throw new SafeIllegalStateException(
"Encountered unknown propagate QoS configuration", SafeArg.of("serverQoS", serverQoS));
}

private static ListeningScheduledExecutorService instrument(
ScheduledExecutorService delegate, TaggedMetricRegistry metrics) {
return MoreExecutors.listeningDecorator(
Tracers.wrap(SCHEDULER_NAME, MetricRegistries.instrument(metrics, delegate, SCHEDULER_NAME)));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice

}
}
19 changes: 12 additions & 7 deletions versions.lock
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ com.fasterxml.jackson.datatype:jackson-datatype-jdk8:2.10.2 (2 constraints: 2529
com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.10.2 (1 constraints: 7b1c92a4)
com.fasterxml.jackson.module:jackson-module-afterburner:2.10.2 (2 constraints: 2529c2cb)
com.github.ben-manes.caffeine:caffeine:2.8.1 (2 constraints: c017484f)
com.google.code.findbugs:jsr305:3.0.2 (7 constraints: b058251a)
com.google.code.findbugs:jsr305:3.0.2 (11 constraints: 6b9fbbdf)
com.google.errorprone:error_prone_annotations:2.3.4 (5 constraints: 4a4818d6)
com.google.guava:failureaccess:1.0.1 (1 constraints: 140ae1b4)
com.google.guava:guava:28.1-jre (7 constraints: 7e7647a3)
com.google.guava:guava:28.1-jre (9 constraints: d2974be9)
com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava (1 constraints: bd17c918)
com.google.j2objc:j2objc-annotations:1.3 (1 constraints: b809eda0)
com.netflix.concurrency-limits:concurrency-limits-core:0.2.0 (1 constraints: 0405f135)
Expand All @@ -23,19 +23,22 @@ com.palantir.conjure.java.runtime:client-config:4.67.0 (1 constraints: 4305643b)
com.palantir.conjure.java.runtime:conjure-java-jackson-serialization:4.67.0 (2 constraints: 6316b80f)
com.palantir.conjure.java.runtime:keystores:4.67.0 (2 constraints: 651921de)
com.palantir.ri:resource-identifier:1.1.0 (2 constraints: ed14f4b5)
com.palantir.safe-logging:preconditions:1.13.0 (11 constraints: d9b7864c)
com.palantir.safe-logging:safe-logging:1.13.0 (7 constraints: 81793449)
com.palantir.safe-logging:preconditions:1.13.0 (13 constraints: 03d73cdd)
com.palantir.safe-logging:safe-logging:1.13.0 (9 constraints: ab98172c)
com.palantir.safethreadlocalrandom:safe-thread-local-random:0.1.0 (1 constraints: 0305ee35)
com.palantir.tokens:auth-tokens:3.6.1 (3 constraints: 90270b2e)
com.palantir.tracing:tracing:4.4.0 (2 constraints: f41532e9)
com.palantir.tracing:tracing-api:4.4.0 (1 constraints: a00c4509)
com.palantir.tritium:tritium-registry:0.16.6 (1 constraints: 1f14d376)
com.palantir.tritium:tritium-api:0.16.6 (2 constraints: 3b1fb8bd)
com.palantir.tritium:tritium-core:0.16.6 (1 constraints: 451044a2)
com.palantir.tritium:tritium-metrics:0.16.6 (1 constraints: 3f053b3b)
com.palantir.tritium:tritium-registry:0.16.6 (3 constraints: a129110e)
com.squareup.okhttp3:mockwebserver:3.13.1 (1 constraints: 3a053f3b)
com.squareup.okhttp3:okhttp:3.13.1 (2 constraints: a014ba9d)
com.squareup.okio:okio:1.17.2 (1 constraints: 850cc309)
commons-codec:commons-codec:1.11 (1 constraints: f20f8881)
commons-logging:commons-logging:1.2 (3 constraints: 43254b24)
io.dropwizard.metrics:metrics-core:3.2.6 (2 constraints: a115c8c3)
io.dropwizard.metrics:metrics-core:3.2.6 (3 constraints: b325741c)
jakarta.annotation:jakarta.annotation-api:1.3.5 (1 constraints: f10f7399)
jakarta.ws.rs:jakarta.ws.rs-api:2.1.6 (2 constraints: fb144cb7)
junit:junit:4.12 (2 constraints: e213d85a)
Expand All @@ -48,10 +51,12 @@ org.checkerframework:checker-qual:3.1.0 (2 constraints: 161ab343)
org.codehaus.mojo:animal-sniffer-annotations:1.18 (1 constraints: ee09d9aa)
org.hamcrest:hamcrest:2.2 (3 constraints: f41da570)
org.hamcrest:hamcrest-core:2.2 (3 constraints: 2f17d637)
org.hdrhistogram:HdrHistogram:2.1.12 (1 constraints: 3e103aa2)
org.immutables:value:2.8.3 (1 constraints: 0f051036)
org.mockito:mockito-core:3.3.1 (2 constraints: ca13e664)
org.mpierce.metrics.reservoir:hdrhistogram-metrics-reservoir:1.1.2 (1 constraints: 0c10f891)
org.objenesis:objenesis:2.6 (1 constraints: b40a14bd)
org.slf4j:slf4j-api:1.7.30 (7 constraints: 9d63de0d)
org.slf4j:slf4j-api:1.7.30 (9 constraints: d382e5e7)

[Test dependencies]
com.google.code.findbugs:annotations:3.0.1 (1 constraints: 9e0aafc3)
Expand Down
6 changes: 3 additions & 3 deletions versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@ com.github.ben-manes.caffeine:caffeine = 2.8.1
com.google.code.findbugs:jsr305 = 3.0.2
com.google.guava:guava = 27.0.1-jre
com.netflix.concurrency-limits:* = 0.2.0
com.palantir.baseline:* = 0.39.1
com.palantir.conjure.java:* = 5.9.0
com.palantir.conjure.java.api:* = 2.11.1
com.palantir.conjure.java.runtime:* = 4.67.0
com.palantir.conjure.java:* = 5.9.0
com.palantir.conjure:conjure = 4.10.1
com.palantir.ri:resource-identifier = 1.1.0
com.palantir.safe-logging:* = 1.13.0
com.palantir.safethreadlocalrandom:safe-thread-local-random = 0.1.0
com.palantir.tokens:auth-tokens = 3.6.1
com.palantir.tracing:* = 4.4.0
com.palantir.tritium:* = 0.16.6
com.uber.nullaway:nullaway = 0.7.9
io.dropwizard.metrics:metrics-core = 3.2.6
org.immutables:* = 2.8.3
org.slf4j:* = 1.7.30
com.palantir.conjure:conjure = 4.10.1

# test deps
junit:junit = 4.12
Expand Down