From 0ca0d7e36dde74b0ae753c68d0c6d6f9a20a3131 Mon Sep 17 00:00:00 2001 From: Pavol Loffay Date: Tue, 9 Jan 2024 16:51:30 +0100 Subject: [PATCH] Add span RED alerting docs Signed-off-by: Pavol Loffay --- ...istr-tracing-tempo-config-spanmetrics.adoc | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/modules/distr-tracing-tempo-config-spanmetrics.adoc b/modules/distr-tracing-tempo-config-spanmetrics.adoc index 9988bbc09d6a..d2b0d867ffff 100644 --- a/modules/distr-tracing-tempo-config-spanmetrics.adoc +++ b/modules/distr-tracing-tempo-config-spanmetrics.adoc @@ -92,3 +92,47 @@ spec: ---- <1> Enables the monitoring tab in the Jaeger console. <2> The service name for Thanos Querier from user-workload monitoring. + +== Enable alerting on span RED metrics + +The metrics generated by the `spanmetrics` connector can be used in alerting rules, for example to alert on a slow service or to define service level objectives (SLOs). +The connector creates a `duration_bucket` histogram metric and a `calls` counter metric. These metrics have labels that identify the service, API name, operation type, and other attributes. + +.Labels present on the metrics created by the `spanmetrics` connector. +[options="header"] +[cols="l, a, a"] +|=== +|Label |Description |Values +|service_name +| Service name set by the `OTEL_SERVICE_NAME` environment variable. +|`frontend` + +|span_name +| Name of the operation. +|`/`, `/customer` + +|span_kind +| Identifies a server, client, messaging, or internal operation. +|`SPAN_KIND_SERVER`, `SPAN_KIND_CLIENT`, `SPAN_KIND_PRODUCER`, `SPAN_KIND_CONSUMER`, `SPAN_KIND_INTERNAL` +|=== + +.PrometheusRule custom resource that defines an alert for an SLO to serve 95% of requests within 2000 ms on the frontend service. 
+[source,yaml] +---- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: span-red +spec: + groups: + - name: server-side-latency + rules: + - alert: SpanREDFrontendAPIRequestLatency + expr: histogram_quantile(0.95, sum(rate(duration_bucket{service_name="frontend", span_kind="SPAN_KIND_SERVER"}[5m])) by (le, service_name, span_name)) > 2000 <1> + labels: + severity: Warning + annotations: + summary: "High request latency on {{$labels.service_name}} and {{$labels.span_name}}" + description: "{{$labels.service_name}} has 95th percentile request latency above 2000 ms (current value: {{$value}} ms)" +---- +<1> The expression that triggers the alert when the 95th percentile of the frontend server response time exceeds 2000 ms. The time range (`[5m]`) should be at least four times the scrape interval and long enough to accommodate a change in the metric.