From 7feb7f067f578138c326e3bb503789331eea9ad0 Mon Sep 17 00:00:00 2001
From: Pavol Loffay
Date: Tue, 13 May 2025 15:53:54 +0200
Subject: [PATCH] TRACING-5364: RHOSDT 3.6 add tailsampling
Signed-off-by: Pavol Loffay
---
.../otel-collector-processors.adoc | 192 ++++++++++++++++++
1 file changed, 192 insertions(+)
diff --git a/observability/otel/otel-collector/otel-collector-processors.adoc b/observability/otel/otel-collector/otel-collector-processors.adoc
index cdc2c1fcb25a..a84f6e10b90d 100644
--- a/observability/otel/otel-collector/otel-collector-processors.adoc
+++ b/observability/otel/otel-collector/otel-collector-processors.adoc
@@ -22,6 +22,7 @@ Currently, the following General Availability and Technology Preview processors
- xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#cumulativetodelta-processor_otel-collector-processors[Cumulative-to-Delta Processor]
- xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#groupbyattrsprocessor-processor_otel-collector-processors[Group-by-Attributes Processor]
- xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#transform-processor_otel-collector-processors[Transform Processor]
+- xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#tail-sampling-processor_otel-collector-processors[Tail Sampling Processor]
[id="batch-processor_{context}"]
== Batch Processor
@@ -578,3 +579,194 @@ config:
[id="additional-resources_{context}"]
== Additional resources
* link:https://opentelemetry.io/docs/specs/otlp/[OpenTelemetry Protocol (OTLP)] (OpenTelemetry Documentation)
+
+[id="tail-sampling-processor_{context}"]
+== Tail Sampling Processor
+
+The Tail Sampling Processor samples traces based on a set of defined policies.
+All spans for a given trace must be received by the same collector instance for effective sampling decisions.
+
+This processor must be placed in pipelines after any processors that rely on context, such as `k8sattributes`, because it reassembles spans into new batches, which causes them to lose their original context.
+
+:FeatureName: The Tail Sampling Processor
+include::snippets/technology-preview.adoc[]
+
+.Configuration summary
+[source,yaml]
+----
+# ...
+config:
+ processors:
+ tail_sampling:
+ decision_wait: 30s # <1>
+ num_traces: 50000 # <2>
+ expected_new_traces_per_sec: 10 # <3>
+ policies:
+ [
+ {
+ name: always-sample-policy,
+ type: always_sample # <4>
+ },
+ {
+ name: latency-policy,
+ type: latency, # <5>
+ latency: {threshold_ms: 5000, upper_threshold_ms: 10000}
+ },
+ {
+ name: numeric-attribute-policy,
+ type: numeric_attribute, # <6>
+ numeric_attribute: {key: key1, min_value: 50, max_value: 100}
+ },
+ {
+ name: probabilistic-policy,
+ type: probabilistic, # <7>
+ probabilistic: {sampling_percentage: 10}
+ },
+ {
+ name: status-code-policy,
+ type: status_code, # <8>
+ status_code: {status_codes: [ERROR, UNSET]}
+ },
+ {
+ name: string-attribute-policy,
+ type: string_attribute, # <9>
+ string_attribute: {key: key2, values: [value1, val*], enabled_regex_matching: true, cache_max_size: 10}
+ },
+ {
+ name: rate-limiting-policy,
+ type: rate_limiting, # <10>
+ rate_limiting: {spans_per_second: 35}
+ },
+ {
+ name: span-count-policy,
+ type: span_count, # <11>
+ span_count: {min_spans: 2, max_spans: 20}
+ },
+ {
+ name: trace-state-policy,
+ type: trace_state, # <12>
+ trace_state: { key: key3, values: [value1, value2] }
+ },
+ {
+ name: bool-attribute-policy,
+ type: boolean_attribute, # <13>
+ boolean_attribute: {key: key4, value: true}
+ },
+ {
+ name: ottl-policy,
+ type: ottl_condition, # <14>
+ ottl_condition: {
+ error_mode: ignore,
+ span: [
+ "attributes[\"test_attr_key_1\"] == \"test_attr_val_1\"",
+ "attributes[\"test_attr_key_2\"] != \"test_attr_val_1\"",
+ ],
+ spanevent: [
+ "name != \"test_span_event_name\"",
+ "attributes[\"test_event_attr_key_2\"] != \"test_event_attr_val_1\"",
+ ]
+ }
+ },
+ {
+ name: and-policy,
+ type: and, # <15>
+ and: {
+ and_sub_policy:
+ [
+ {
+ name: and-policy-1,
+ type: numeric_attribute,
+ numeric_attribute: { key: key1, min_value: 50, max_value: 100 }
+ },
+ {
+ name: and-policy-2,
+ type: string_attribute,
+ string_attribute: { key: key2, values: [ value1, value2 ] }
+ },
+ ]
+ }
+ },
+ {
+ name: drop-policy,
+ type: drop, # <16>
+ drop: {
+ drop_sub_policy:
+ [
+ {
+ name: drop-policy-1,
+ type: string_attribute,
+ string_attribute: {key: url.path, values: [\/health, \/metrics], enabled_regex_matching: true}
+ }
+ ]
+ }
+ },
+ {
+ name: composite-policy,
+ type: composite, # <17>
+ composite:
+ {
+ max_total_spans_per_second: 1000,
+ policy_order: [composite-policy-1, composite-policy-2, composite-policy-3],
+ composite_sub_policy:
+ [
+ {
+ name: composite-policy-1,
+ type: numeric_attribute,
+ numeric_attribute: {key: key1, min_value: 50}
+ },
+ {
+ name: composite-policy-2,
+ type: string_attribute,
+ string_attribute: {key: key2, values: [value1, value2]}
+ },
+ {
+ name: composite-policy-3,
+ type: always_sample
+ }
+ ],
+ rate_allocation:
+ [
+ {
+ policy: composite-policy-1,
+ percent: 50
+ },
+ {
+ policy: composite-policy-2,
+ percent: 25
+ }
+ ]
+ }
+ },
+ ]
+# ...
+----
+<1> Wait time since the first span of a trace before making a sampling decision. Default is 30s.
+<2> Number of traces kept in memory. Default is 50000.
+<3> Expected number of new traces (helps in allocating data structures). Default is 0.
+<4> Policy to sample all traces.
+<5> Policy to sample based on the duration of a trace. The duration is determined by looking at the earliest start time and latest end time, without taking into consideration what happened in between. Supplying no upper bound will result in a policy sampling anything greater than `threshold_ms`.
+<6> Policy to sample based on numeric attributes (resource and record).
+<7> Policy to sample a percentage of traces.
+<8> Policy to sample based upon the status code (OK, ERROR or UNSET).
+<9> Policy to sample based on string attribute (resource and record) value matches. Both exact and regular expression value matches are supported.
+<10> Policy to sample based on the rate of spans per second.
+<11> Policy to sample based on the minimum and/or maximum number of spans, inclusive. If the sum of all spans in the trace is outside the range threshold, the trace will not be sampled.
+<12> Policy to sample based on TraceState value matches.
+<13> Policy to sample based on boolean attribute (resource and record).
+<14> Policy to sample based on given boolean OTTL condition (span and span event).
+<15> Policy to sample based on multiple policies by creating an `AND` policy.
+<16> Policy to drop (not sample) based on multiple policies by creating a `DROP` policy.
+<17> Policy to sample based on a combination of the above samplers, with ordering and rate allocation per sampler. Rate allocation assigns a percentage of the total span rate to each policy, following the policy order. For example, if `max_total_spans_per_second` is set to `100`, you can set `rate_allocation` as follows: `composite-policy-1` = 50% of `max_total_spans_per_second` = 50 spans per second, and `composite-policy-2` = 25% of `max_total_spans_per_second` = 25 spans per second. To ensure that the remaining capacity is filled, use `always_sample` as one of the policies.
+
+
+Each policy will result in a decision, and the processor will evaluate them to make a final decision:
+
+* When there is a "drop" decision, the trace is not sampled.
+* When there is a "sample" decision, the trace is sampled.
+* In all other cases, the trace is not sampled.
+
+=== Scaling collectors with the tail sampling processor
+
+This processor requires all spans for a given trace to be sent to the same collector instance for the correct sampling decision to be derived.
+When scaling the collector, you must ensure that all spans for the same trace reach the same collector instance.
+You can achieve this by having two layers of collectors: one with the load balancing exporter, and one with the tail sampling processor.
\ No newline at end of file