diff --git a/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/InitialEndpointSupplier.java b/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/InitialEndpointSupplier.java index ffa46d60cfe..b37b532c5dd 100644 --- a/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/InitialEndpointSupplier.java +++ b/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/InitialEndpointSupplier.java @@ -15,13 +15,11 @@ import com.linecorp.armeria.client.Endpoint; import com.linecorp.armeria.client.endpoint.EndpointGroup; -import com.linecorp.armeria.client.endpoint.dns.DnsAddressEndpointGroup; import com.linecorp.armeria.client.endpoint.dns.DnsAddressEndpointGroupBuilder; import com.linecorp.armeria.common.SessionProtocol; import java.net.URI; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -64,25 +62,13 @@ final class InitialEndpointSupplier implements Supplier { // A host that isn't an IP may resolve to multiple IP addresses, so we use a endpoint // group to round-robin over them. Users can mix addresses that resolve to multiple IPs // with single IPs freely, they'll all get used. - endpointGroups.add(resolveDnsAddresses(host, port)); + endpointGroups.add(new DnsAddressEndpointGroupBuilder(host).port(port).build()); } } return EndpointGroup.of(endpointGroups); } - // Rather than result in an empty group. Await DNS resolution as this call is deferred anyway - // TODO: delete the await when https://github.com/line/armeria/issues/2071 is complete - DnsAddressEndpointGroup resolveDnsAddresses(String host, int port) { - DnsAddressEndpointGroup result = new DnsAddressEndpointGroupBuilder(host).port(port).build(); - try { - result.awaitInitialEndpoints(1, TimeUnit.SECONDS); - } catch (Exception e) { - // let it fail later - } - return result; - } - int getPort(URI url) { int port = url.getPort(); if (port == -1) port = sessionProtocol.defaultPort(); diff --git a/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/LazyHttpClientImpl.java b/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/LazyHttpClientImpl.java index 7663c1c1563..30749c3d0f8 100644 --- a/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/LazyHttpClientImpl.java +++ b/zipkin-server/src/main/java/zipkin2/server/internal/elasticsearch/LazyHttpClientImpl.java @@ -15,7 +15,6 @@ import com.linecorp.armeria.client.Endpoint; import com.linecorp.armeria.client.HttpClient; -import com.linecorp.armeria.client.endpoint.DynamicEndpointGroup; import com.linecorp.armeria.client.endpoint.EndpointGroup; import com.linecorp.armeria.client.endpoint.EndpointGroupRegistry; import com.linecorp.armeria.client.endpoint.healthcheck.HealthCheckedEndpointGroup; @@ -71,31 +70,37 @@ final class LazyHttpClientImpl implements LazyHttpClient { } Endpoint getEndpoint() { - EndpointGroup endpointGroup = initialEndpoints.get(); - if (endpointGroup instanceof Endpoint) return (Endpoint) endpointGroup; + EndpointGroup initial = initialEndpoints.get(); + // Only health-check when there are alternative endpoints. There aren't when instanceof Endpoint + if (initial instanceof Endpoint) return (Endpoint) initial; - // https://github.com/line/armeria/issues/2071 Composite endpoint groups don't extend dynamic - // once this is done, we can delete special-cased code in InitialEndpointSupplier about DNS, as - // when endpointGroup.isStatic() is false, we'd here. - if (endpointGroup instanceof DynamicEndpointGroup) { - try { - // Since we aren't holding up server startup, or sitting on the event loop, it is ok to - // block. The alternative is round-robin, which could be unlucky and hit a bad node first. - // - // We are blocking up to the connection timeout which should be enough time for any DNS - // resolution that hasn't happened yet to finish. - endpointGroup.awaitInitialEndpoints(timeoutMillis, TimeUnit.MILLISECONDS); - } catch (Exception e) { - // We'll try again next time around. - throw new IllegalStateException("couldn't connect any of " + endpointGroup.endpoints(), e); - } + // Wrap the result when health checking is enabled. + EndpointGroup result = initial; + if (healthCheck.isEnabled()) result = decorateHealthCheck(initial); + + boolean empty = true; + Exception thrown = null; + try { + // Since we aren't holding up server startup, or sitting on the event loop, it is ok to + // block. The alternative is round-robin, which could be unlucky and hit a bad node first. + // + // We are blocking up to the connection timeout which should be enough time for any DNS + // resolution that hasn't happened yet to finish. + empty = result.awaitInitialEndpoints(timeoutMillis, TimeUnit.MILLISECONDS).isEmpty(); + } catch (Exception e) { + thrown = e; } - if (healthCheck.isEnabled()) endpointGroup = decorateHealthCheck(endpointGroup); + // If health-checking is enabled, we can end up with no endpoints after waiting + // TODO: test this results in no cause following https://github.com/line/armeria/pull/2074 + if (empty) { + result.close(); // no-op when not health checked + throw new IllegalStateException("couldn't connect any of " + initial.endpoints(), thrown); + } - // TODO: why must we do this instead of using direct type references. - // The static factory is concerning. https://github.com/line/armeria/issues/1084 - EndpointGroupRegistry.register("elasticsearch", endpointGroup, ROUND_ROBIN); + // Currently, there's no alternative to static registries when using Endpoint.ofGroup + // https://github.com/line/armeria/issues/1084 + EndpointGroupRegistry.register("elasticsearch", result, ROUND_ROBIN); return Endpoint.ofGroup("elasticsearch"); } @@ -120,18 +125,6 @@ HealthCheckedEndpointGroup decorateHealthCheck(EndpointGroup endpointGroup) { .retryInterval(healthCheck.getInterval()) .build(); healthChecked.newMeterBinder("elasticsearch").bindTo(meterRegistry); - - // Since we aren't holding up server startup, or sitting on the event loop, it is ok to block. - // The alternative is round-robin, which could be unlucky and hit a bad node first. - // - // We are blocking up to the connection timeout which should be enough time for a health check - // to respond. - try { - healthChecked.awaitInitialEndpoints(timeoutMillis, TimeUnit.MILLISECONDS); - } catch (Exception e) { - healthChecked.close(); // we'll recreate it the next time around. - throw new IllegalStateException("couldn't connect any of " + endpointGroup.endpoints(), e); - } return healthChecked; }