diff --git a/observability/otel/otel-collector/otel-collector-processors.adoc b/observability/otel/otel-collector/otel-collector-processors.adoc index cdc2c1fcb25a..a84f6e10b90d 100644 --- a/observability/otel/otel-collector/otel-collector-processors.adoc +++ b/observability/otel/otel-collector/otel-collector-processors.adoc @@ -22,6 +22,7 @@ Currently, the following General Availability and Technology Preview processors - xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#cumulativetodelta-processor_otel-collector-processors[Cumulative-to-Delta Processor] - xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#groupbyattrsprocessor-processor_otel-collector-processors[Group-by-Attributes Processor] - xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#transform-processor_otel-collector-processors[Transform Processor] +- xref:../../../observability/otel/otel-collector/otel-collector-processors.adoc#tail-sampling-processor_otel-collector-processors[Tail Sampling Processor] [id="batch-processor_{context}"] == Batch Processor @@ -578,3 +579,194 @@ config: [id="additional-resources_{context}"] == Additional resources * link:https://opentelemetry.io/docs/specs/otlp/[OpenTelemetry Protocol (OTLP)] (OpenTelemetry Documentation) + +[id="tail-sampling-processor_{context}"] +== Tail Sampling Processor + +The Tail Sampling Processor samples traces based on a set of defined policies. +All spans for a given trace must be received by the same collector instance for effective sampling decisions. + +This processor must be placed in pipelines after any processors that rely on context, for example, `k8sattributes`. It reassembles spans into new batches, causing them to lose their original context. + +:FeatureName: The Tail Sampling Processor +include::snippets/technology-preview.adoc[] + +.Configuration summary +[source,yaml] +---- +# ...
+config: + processors: + tail_sampling: + decision_wait: 30s # <1> + num_traces: 50000 # <2> + expected_new_traces_per_sec: 10 # <3> + policies: + [ + { + name: always-sample-policy, + type: always_sample # <4> + }, + { + name: latency-policy, + type: latency, # <5> + latency: {threshold_ms: 5000, upper_threshold_ms: 10000} + }, + { + name: numeric-attribute-policy, + type: numeric_attribute, # <6> + numeric_attribute: {key: key1, min_value: 50, max_value: 100} + }, + { + name: probabilistic-policy, + type: probabilistic, # <7> + probabilistic: {sampling_percentage: 10} + }, + { + name: status-code-policy, + type: status_code, # <8> + status_code: {status_codes: [ERROR, UNSET]} + }, + { + name: string-attribute-policy, + type: string_attribute, # <9> + string_attribute: {key: key2, values: [value1, val*], enabled_regex_matching: true, cache_max_size: 10} + }, + { + name: rate-limiting-policy, + type: rate_limiting, # <10> + rate_limiting: {spans_per_second: 35} + }, + { + name: span-count-policy, + type: span_count, # <11> + span_count: {min_spans: 2, max_spans: 20} + }, + { + name: trace-state-policy, + type: trace_state, # <12> + trace_state: { key: key3, values: [value1, value2] } + }, + { + name: bool-attribute-policy, + type: boolean_attribute, # <13> + boolean_attribute: {key: key4, value: true} + }, + { + name: ottl-policy, + type: ottl_condition, # <14> + ottl_condition: { + error_mode: ignore, + span: [ + "attributes[\"test_attr_key_1\"] == \"test_attr_val_1\"", + "attributes[\"test_attr_key_2\"] != \"test_attr_val_1\"", + ], + spanevent: [ + "name != \"test_span_event_name\"", + "attributes[\"test_event_attr_key_2\"] != \"test_event_attr_val_1\"", + ] + } + }, + { + name: and-policy, + type: and, # <15> + and: { + and_sub_policy: + [ + { + name: and-policy-1, + type: numeric_attribute, + numeric_attribute: { key: key1, min_value: 50, max_value: 100 } + }, + { + name: and-policy-2, + type: string_attribute, + string_attribute: { key: key2, values: [ 
value1, value2 ] } + }, + ] + } + }, + { + name: drop-policy, + type: drop, # <16> + drop: { + drop_sub_policy: + [ + { + name: drop-policy-1, + type: string_attribute, + string_attribute: {key: url.path, values: [\/health, \/metrics], enabled_regex_matching: true} + } + ] + } + }, + { + name: composite-policy, + type: composite, # <17> + composite: + { + max_total_spans_per_second: 1000, + policy_order: [test-composite-policy-1, test-composite-policy-2, test-composite-policy-3], + composite_sub_policy: + [ + { + name: test-composite-policy-1, + type: numeric_attribute, + numeric_attribute: {key: key1, min_value: 50} + }, + { + name: test-composite-policy-2, + type: string_attribute, + string_attribute: {key: key2, values: [value1, value2]} + }, + { + name: test-composite-policy-3, + type: always_sample + } + ], + rate_allocation: + [ + { + policy: test-composite-policy-1, + percent: 50 + }, + { + policy: test-composite-policy-2, + percent: 25 + } + ] + } + }, + ] +# ... +---- +<1> Wait time since the first span of a trace before making a sampling decision. Default is 30s. +<2> Number of traces kept in memory. Default is 50000. +<3> Expected number of new traces (helps in allocating data structures). Default is 0. +<4> Policy to sample all traces. +<5> Policy to sample based on the duration of a trace. The duration is determined by looking at the earliest start time and latest end time, without taking into consideration what happened in between. Supplying no upper bound will result in a policy sampling anything greater than `threshold_ms`. +<6> Policy to sample based on numeric attributes (resource and record). +<7> Policy to sample a percentage of traces. +<8> Policy to sample based upon the status code (OK, ERROR or UNSET). +<9> Policy to sample based on string attributes (resource and record) value matches. Both exact and regex value matches are supported. +<10> Policy to sample based on the rate of spans per second. 
+<11> Policy to sample based on the minimum and/or maximum number of spans, inclusive. If the sum of all spans in the trace is outside the range threshold, the trace will not be sampled. +<12> Policy to sample based on TraceState value matches. +<13> Policy to sample based on a boolean attribute (resource and record). +<14> Policy to sample based on a given boolean OTTL condition (span and span event). +<15> Policy to sample based on multiple policies. Creates an AND policy. +<16> Policy to drop (not sample) based on multiple policies. Creates a DROP policy. +<17> Policy to sample based on a combination of the above samplers, with ordering and rate allocation per sampler. Rate allocation allocates certain percentages of spans per policy order. For example, if `max_total_spans_per_second` is set to `100`, you can set `rate_allocation` as follows: `test-composite-policy-1` = 50% of `max_total_spans_per_second` = 50 `spans_per_second`, and `test-composite-policy-2` = 25% of `max_total_spans_per_second` = 25 `spans_per_second`. To ensure that the remaining capacity is filled, use `always_sample` as one of the policies. + + +Each policy results in a decision, and the processor evaluates these decisions to make a final decision: + +* When there is a "drop" decision, the trace is not sampled. +* When there is a "sample" decision, the trace is sampled. +* In all other cases, the trace is not sampled. + +=== Scaling collectors with the tail sampling processor + +This processor requires all spans for a given trace to be sent to the same collector instance for the correct sampling decision to be derived. +When scaling the collector, you must ensure that all spans for the same trace reach the same collector. +You can achieve this by having two layers of collectors: one with the load balancing exporter, and one with the tail sampling processor. \ No newline at end of file