diff --git a/src/OpenTelemetry/CHANGELOG.md b/src/OpenTelemetry/CHANGELOG.md index a15bdb9275..2b20ac5927 100644 --- a/src/OpenTelemetry/CHANGELOG.md +++ b/src/OpenTelemetry/CHANGELOG.md @@ -7,6 +7,11 @@ function when configuring a view (applies to individual metrics). ([#5542](https://github.com/open-telemetry/opentelemetry-dotnet/pull/5542)) +* Fixed a race condition for the experimental MetricPoint reclaim scenario + (enabled via `OTEL_DOTNET_EXPERIMENTAL_METRICS_RECLAIM_UNUSED_METRIC_POINTS`) + which could have led to a measurement being dropped. + ([#5546](https://github.com/open-telemetry/opentelemetry-dotnet/pull/5546)) + ## 1.8.1 Released 2024-Apr-17 diff --git a/src/OpenTelemetry/Metrics/AggregatorStore.cs b/src/OpenTelemetry/Metrics/AggregatorStore.cs index bb806a0149..34a4bb5f2c 100644 --- a/src/OpenTelemetry/Metrics/AggregatorStore.cs +++ b/src/OpenTelemetry/Metrics/AggregatorStore.cs @@ -265,12 +265,41 @@ internal void SnapshotDeltaWithMetricPointReclaim() if (metricPoint.MetricPointStatus == MetricPointStatus.NoCollectPending) { + // Reclaim the MetricPoint if it was marked for it in the previous collect cycle + if (metricPoint.LookupData != null && metricPoint.LookupData.DeferredReclaim == true) + { + this.ReclaimMetricPoint(ref metricPoint, i); + continue; + } + + // Check if the MetricPoint could be reclaimed in the current Collect cycle. // If metricPoint.LookupData is `null` then the MetricPoint is already reclaimed and in the queue. // If the Collect thread is successfully able to compare and swap the reference count from zero to int.MinValue, it means that // the MetricPoint can be reused for other tags. if (metricPoint.LookupData != null && Interlocked.CompareExchange(ref metricPoint.ReferenceCount, int.MinValue, 0) == 0) { - this.ReclaimMetricPoint(ref metricPoint, i); + // This is similar to double-checked locking. For some rare case, the Collect thread might read the status as `NoCollectPending`, + // and then get switched out before it could set the ReferenceCount to `int.MinValue`. In the meantime, an Update thread could come in + // and update the MetricPoint, thereby, setting its status to `CollectPending`. Note that the ReferenceCount would be 0 after the update. + // If the Collect thread now wakes up, it would be able to set the ReferenceCount to `int.MinValue`, thereby, marking the MetricPoint + // invalid for newer updates. In such cases, the MetricPoint, should not be reclaimed before taking its Snapshot. + + if (metricPoint.MetricPointStatus == MetricPointStatus.NoCollectPending) + { + this.ReclaimMetricPoint(ref metricPoint, i); + } + else + { + // MetricPoint's ReferenceCount is `int.MinValue` but it still has a collect pending. Take the MetricPoint's Snapshot + // and mark it to be reclaimed in the next Collect cycle. + + metricPoint.LookupData.DeferredReclaim = true; + + this.TakeMetricPointSnapshot(ref metricPoint, outputDelta: true); + + this.currentMetricPointBatch[this.batchSize] = i; + this.batchSize++; + } } continue; diff --git a/src/OpenTelemetry/Metrics/LookupData.cs b/src/OpenTelemetry/Metrics/LookupData.cs index a77cf4d18b..99ed37e4de 100644 --- a/src/OpenTelemetry/Metrics/LookupData.cs +++ b/src/OpenTelemetry/Metrics/LookupData.cs @@ -5,12 +5,14 @@ namespace OpenTelemetry.Metrics; internal sealed class LookupData { + public bool DeferredReclaim; public int Index; public Tags SortedTags; public Tags GivenTags; public LookupData(int index, in Tags sortedTags, in Tags givenTags) { + this.DeferredReclaim = false; this.Index = index; this.SortedTags = sortedTags; this.GivenTags = givenTags;