Permalink
Browse files

Kicks out instead of correcting skew on single-host spans (#1483)

An instrumentation glitch can look like clock skew. This kicks out when
RPC annotations all belong to the same host.

This also backfills tests.
  • Loading branch information...
1 parent ad90718 commit 483f78ed3546b3d28e2555c50381eebf67bbf82d @adriancole adriancole committed on GitHub Jan 11, 2017
@@ -148,30 +148,34 @@ static boolean ipsMatch(Endpoint skew, Endpoint that) {
static ClockSkew getClockSkew(Span span) {
Map<String, Annotation> annotations = asMap(span.annotations);
- Long clientSend = getTimestamp(annotations, Constants.CLIENT_SEND);
- Long clientRecv = getTimestamp(annotations, Constants.CLIENT_RECV);
- Long serverRecv = getTimestamp(annotations, Constants.SERVER_RECV);
- Long serverSend = getTimestamp(annotations, Constants.SERVER_SEND);
+ Annotation clientSend = annotations.get(Constants.CLIENT_SEND);
+ Annotation clientRecv = annotations.get(Constants.CLIENT_RECV);
+ Annotation serverRecv = annotations.get(Constants.SERVER_RECV);
+ Annotation serverSend = annotations.get(Constants.SERVER_SEND);
if (clientSend == null || clientRecv == null || serverRecv == null || serverSend == null) {
return null;
}
- Endpoint server = annotations.get(Constants.SERVER_RECV).endpoint;
- server = server == null ? annotations.get(Constants.SERVER_SEND).endpoint : server;
+ Endpoint server = serverRecv.endpoint != null ? serverRecv.endpoint: serverSend.endpoint;
if (server == null) return null;
+ Endpoint client = clientSend.endpoint != null ? clientSend.endpoint: clientRecv.endpoint;
+ if (client == null) return null;
- long clientDuration = clientRecv - clientSend;
- long serverDuration = serverSend - serverRecv;
+ // There's no skew if the RPC is going to itself
+ if (ipsMatch(server, client)) return null;
+
+ long clientDuration = clientRecv.timestamp - clientSend.timestamp;
+ long serverDuration = serverSend.timestamp - serverRecv.timestamp;
+ // We assume latency is half the difference between the client and server duration.
+ // This breaks if client duration is smaller than server (due to async return for example).
+ if (clientDuration < serverDuration) return null;
- // There is only clock skew if CS is after SR or CR is before SS
- boolean csAhead = clientSend < serverRecv;
- boolean crAhead = clientRecv > serverSend;
- if (serverDuration > clientDuration || (csAhead && crAhead)) {
- return null;
- }
long latency = (clientDuration - serverDuration) / 2;
- long skew = serverRecv - latency - clientSend;
+ // We can't see skew when send happens before receive
+ if (latency < 0) return null;
+
+ long skew = serverRecv.timestamp - latency - clientSend.timestamp;
if (skew != 0L) {
return new ClockSkew(server, skew);
}
@@ -187,12 +191,6 @@ static ClockSkew getClockSkew(Span span) {
return result;
}
- @Nullable
- static Long getTimestamp(Map<String, Annotation> annotations, String value) {
- Annotation result = annotations.get(value);
- return result != null ? result.timestamp : null;
- }
-
private CorrectForClockSkew() {
}
}
@@ -37,6 +37,7 @@
import static zipkin.TestObjects.APP_ENDPOINT;
import static zipkin.TestObjects.DB_ENDPOINT;
import static zipkin.TestObjects.WEB_ENDPOINT;
+import static zipkin.internal.CorrectForClockSkew.getClockSkew;
import static zipkin.internal.CorrectForClockSkew.ipsMatch;
import static zipkin.internal.CorrectForClockSkew.isLocalSpan;
@@ -66,6 +67,94 @@ public void ipsMatch_falseWhenNoIp() {
assertFalse(ipsMatch(ipv6, noIp));
}
+ /**
+ * Instrumentation bugs might result in spans that look like clock skew is at play. When skew
+ * appears on the same host, we assume it is an instrumentation bug (rather than make it worse
+ * by adjusting it!)
+ */
+ @Test
+ public void getClockSkew_mustBeOnDifferentHosts() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(40, CLIENT_RECV, WEB_ENDPOINT))
+ .build();
+
+ assertThat(getClockSkew(span)).isNull();
+ }
+
+ @Test
+ public void getClockSkew_endpointIsServer() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(40, CLIENT_RECV, WEB_ENDPOINT))
+ .build();
+
+ assertThat(getClockSkew(span).endpoint).isEqualTo(APP_ENDPOINT);
+ }
+
+ /**
+ * Skew is relative to the server receive and centered by the difference between the server
+ * duration and the client duration.
+ */
+ @Test
+ public void getClockSkew_includesSplitTheLatency() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(40, CLIENT_RECV, WEB_ENDPOINT))
+ .build();
+
+ assertThat(getClockSkew(span).skew).isEqualTo(-15);
+ }
+
+ /** We can't currently correct async spans, where the server lets go early. */
+ @Test
+ public void getClockSkew_onlyWhenClientDurationIsLongerThanServer() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(25, CLIENT_RECV, WEB_ENDPOINT))
+ .build();
+
+ assertThat(getClockSkew(span)).isNull();
+ }
+
+ @Test
+ public void getClockSkew_basedOnServer() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, WEB_ENDPOINT))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, APP_ENDPOINT))
+ .addAnnotation(Annotation.create(40, CLIENT_RECV, WEB_ENDPOINT))
+ .build();
+
+ assertThat(getClockSkew(span).endpoint).isEqualTo(APP_ENDPOINT);
+ }
+
+ @Test
+ public void getClockSkew_requiresCoreAnnotationsToHaveEndpoints() {
+ Span span = Span.builder()
+ .traceId(1L).parentId(2L).id(3L).name("")
+ .addAnnotation(Annotation.create(20, CLIENT_SEND, null))
+ .addAnnotation(Annotation.create(10 /* skew */, SERVER_RECV, null))
+ .addAnnotation(Annotation.create(20, SERVER_SEND, null))
+ .addAnnotation(Annotation.create(40, CLIENT_RECV, null))
+ .build();
+
+ assertThat(getClockSkew(span)).isNull();
+ }
+
@Test
public void ipsMatch_falseWhenIpv4Different() {
Endpoint different = ipv4.toBuilder()

0 comments on commit 483f78e

Please sign in to comment.