From 0ca0d7e36dde74b0ae753c68d0c6d6f9a20a3131 Mon Sep 17 00:00:00 2001
From: Pavol Loffay <p.loffay@gmail.com>
Date: Tue, 9 Jan 2024 16:51:30 +0100
Subject: [PATCH] Add span RED alerting docs

Signed-off-by: Pavol Loffay <p.loffay@gmail.com>
---
 ...istr-tracing-tempo-config-spanmetrics.adoc | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/modules/distr-tracing-tempo-config-spanmetrics.adoc b/modules/distr-tracing-tempo-config-spanmetrics.adoc
index 9988bbc09d6a..d2b0d867ffff 100644
--- a/modules/distr-tracing-tempo-config-spanmetrics.adoc
+++ b/modules/distr-tracing-tempo-config-spanmetrics.adoc
@@ -92,3 +92,47 @@ spec:
 ----
 <1> Enables the monitoring tab in the Jaeger console.
 <2> The service name for Thanos Querier from user-workload monitoring.
+
+== Enable alerting on span RED metrics
+
+You can use the metrics generated by the `spanmetrics` connector in alerting rules, for example, to alert on a slow service or to define service level objectives (SLOs).
+The connector creates a `duration_bucket` histogram and a `calls` counter metric. These metrics have labels that identify the service, API name, operation type, and other attributes.
+
+.Labels present on the metrics created by the `spanmetrics` connector.
+[options="header"]
+[cols="l, a, a"]
+|===
+|Label |Description |Values
+|service_name
+| Service name set by the `OTEL_SERVICE_NAME` environment variable.
+|`frontend`
+
+|span_name
+| Name of the operation.
+|`/`, `/customer`
+
+|span_kind
+| Identifies whether the span is a server, client, messaging, or internal operation.
+|`SPAN_KIND_SERVER`, `SPAN_KIND_CLIENT`, `SPAN_KIND_PRODUCER`, `SPAN_KIND_CONSUMER`, `SPAN_KIND_INTERNAL`
+|===
+
+.`PrometheusRule` custom resource that defines an alert for an SLO of serving 95% of requests within 2000 ms on the frontend service.
+[source,yaml]
+----
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: span-red
+spec:
+  groups:
+  - name: server-side-latency
+    rules:
+    - alert: SpanREDFrontendAPIRequestLatency
+      expr: histogram_quantile(0.95, sum(rate(duration_bucket{service_name="frontend", span_kind="SPAN_KIND_SERVER"}[5m])) by (le, service_name, span_name)) > 2000 <1>
+      labels:
+        severity: warning
+      annotations:
+        summary: "High request latency on {{$labels.service_name}} and {{$labels.span_name}}"
+        description: "{{$labels.service_name}} {{$labels.span_name}} has 95th percentile request latency above 2s (current value: {{$value}}ms)"
+----
+<1> The expression that checks whether 95% of the frontend server requests are served within 2000 ms. The alert fires when the 95th percentile latency exceeds this threshold. The time range (`[5m]`) must be at least four times the scrape interval and long enough to accommodate a change in the metric.