From 4266b88b88aa5740605ec801cf89bf4c2541e133 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20=C5=9Awi=C4=85tek?=
Date: Thu, 4 May 2023 23:10:44 +0200
Subject: [PATCH] [exporterhelper]: reduce default queue size to 1000 (#7592)

Reduce the default queue size to 1000. Combined with the defaults of the
batchprocessor, the current default of 5000 is too high and will lead to an
OOMKill in most environments if the queue actually fills up. 1000 is probably
still too high, but we don't want to make too drastic a change at once. See
the linked issue for specifics.
---
 .../exporterhelper_default-queue-size.yaml    | 18 ++++++++++++++++++
 exporter/exporterhelper/README.md             |  4 ++--
 .../internal/persistent_queue_test.go         |  2 +-
 .../internal/persistent_storage_test.go       |  2 +-
 exporter/exporterhelper/queued_retry.go       | 11 ++++++-----
 exporter/exporterhelper/queued_retry_test.go  |  2 +-
 exporter/otlpexporter/cfg-schema.yaml         |  2 +-
 7 files changed, 30 insertions(+), 11 deletions(-)
 create mode 100755 .chloggen/exporterhelper_default-queue-size.yaml

diff --git a/.chloggen/exporterhelper_default-queue-size.yaml b/.chloggen/exporterhelper_default-queue-size.yaml
new file mode 100755
index 00000000000..84945a4ed52
--- /dev/null
+++ b/.chloggen/exporterhelper_default-queue-size.yaml
@@ -0,0 +1,18 @@
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: breaking
+
+# The name of the component, or a single word describing the area of concern, (e.g. otlpreceiver)
+component: exporterhelper
+
+# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Reduce the default queue size from 5000 to 1000
+
+# One or more tracking issues or pull requests related to the change
+issues: [7359]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: |
+  Affects any exporter which enables the queue by default and doesn't set its own default size.
+  For example: otlphttp.
diff --git a/exporter/exporterhelper/README.md b/exporter/exporterhelper/README.md
index 3cf08456cdb..5af822618cb 100644
--- a/exporter/exporterhelper/README.md
+++ b/exporter/exporterhelper/README.md
@@ -16,7 +16,7 @@ The following configuration options can be modified:
 - `sending_queue`
   - `enabled` (default = true)
   - `num_consumers` (default = 10): Number of consumers that dequeue batches; ignored if `enabled` is `false`
-  - `queue_size` (default = 5000): Maximum number of batches kept in memory before dropping; ignored if `enabled` is `false`
+  - `queue_size` (default = 1000): Maximum number of batches kept in memory before dropping; ignored if `enabled` is `false`
    Users should calculate this as `num_seconds * requests_per_second / requests_per_batch` where:
     - `num_seconds` is the number of seconds to buffer in case of a backend outage
     - `requests_per_second` is the average number of requests per second
@@ -37,7 +37,7 @@ To use the persistent queue, the following setting needs to be set:
 - `storage` (default = none): When set, enables persistence and uses the component specified as a storage extension for the persistent queue
 
 The maximum number of batches stored to disk can be controlled using the `sending_queue.queue_size` parameter (which,
-similarly as for in-memory buffering, defaults to 5000 batches).
+similarly as for in-memory buffering, defaults to 1000 batches).
 When the persistent queue is enabled, the batches are buffered using the provided storage extension - [filestorage]
 is a popular and safe choice. If the collector instance is killed while items remain in the persistent queue, they
 will be picked up on restart and exporting will continue.
diff --git a/exporter/exporterhelper/internal/persistent_queue_test.go b/exporter/exporterhelper/internal/persistent_queue_test.go
index 4e13841670c..987eb5fafe8 100644
--- a/exporter/exporterhelper/internal/persistent_queue_test.go
+++ b/exporter/exporterhelper/internal/persistent_queue_test.go
@@ -171,7 +171,7 @@ func TestPersistentQueue_ConsumersProducers(t *testing.T) {
 	req := newFakeTracesRequest(traces)
 
 	ext := createStorageExtension(path)
-	tq := createTestQueue(ext, 5000)
+	tq := createTestQueue(ext, 1000)
 	defer tq.Stop()
 
 	t.Cleanup(func() { assert.NoError(t, ext.Shutdown(context.Background())) })
diff --git a/exporter/exporterhelper/internal/persistent_storage_test.go b/exporter/exporterhelper/internal/persistent_storage_test.go
index 76977df3332..80473663875 100644
--- a/exporter/exporterhelper/internal/persistent_storage_test.go
+++ b/exporter/exporterhelper/internal/persistent_storage_test.go
@@ -357,7 +357,7 @@ func TestPersistentStorage_RepeatPutCloseReadClose(t *testing.T) {
 
 	// No more items
 	ext := createStorageExtension(path)
-	wq := createTestQueue(ext, 5000)
+	wq := createTestQueue(ext, 1000)
 	require.Equal(t, 0, wq.Size())
 	require.NoError(t, ext.Shutdown(context.Background()))
 }
diff --git a/exporter/exporterhelper/queued_retry.go b/exporter/exporterhelper/queued_retry.go
index 16344ea0205..e0b5a8ac0eb 100644
--- a/exporter/exporterhelper/queued_retry.go
+++ b/exporter/exporterhelper/queued_retry.go
@@ -34,6 +34,8 @@ import (
 	"go.opentelemetry.io/collector/internal/obsreportconfig/obsmetrics"
 )
 
+const defaultQueueSize = 1000
+
 var (
 	errSendingQueueIsFull = errors.New("sending_queue is full")
 	errNoStorageClient    = errors.New("no storage client extension found")
@@ -58,11 +60,10 @@ func NewDefaultQueueSettings() QueueSettings {
 	return QueueSettings{
 		Enabled:      true,
 		NumConsumers: 10,
-		// For 5000 queue elements at 100 requests/sec gives about 50 sec of survival of destination outage.
-		// This is a pretty decent value for production.
-		// User should calculate this from the perspective of how many seconds to buffer in case of a backend outage,
-		// multiply that by the number of requests per seconds.
-		QueueSize: 5000,
+		// By default, batches contain up to 8192 spans, for a total of up to ~8 million spans in the queue.
+		// This can be estimated at 1-4 GB of maximum memory usage.
+		// This default is probably still too high and may be adjusted further down in a future release.
+		QueueSize: defaultQueueSize,
 	}
 }
 
diff --git a/exporter/exporterhelper/queued_retry_test.go b/exporter/exporterhelper/queued_retry_test.go
index a2ac8f8e7e3..e20fdd3d62e 100644
--- a/exporter/exporterhelper/queued_retry_test.go
+++ b/exporter/exporterhelper/queued_retry_test.go
@@ -359,7 +359,7 @@ func TestQueuedRetry_QueueMetricsReported(t *testing.T) {
 	require.NoError(t, err)
 	require.NoError(t, be.Start(context.Background(), componenttest.NewNopHost()))
 
-	checkValueForGlobalManager(t, defaultExporterTags, int64(5000), "exporter/queue_capacity")
+	checkValueForGlobalManager(t, defaultExporterTags, int64(defaultQueueSize), "exporter/queue_capacity")
 	for i := 0; i < 7; i++ {
 		require.NoError(t, be.sender.send(newErrorRequest(context.Background())))
 	}
diff --git a/exporter/otlpexporter/cfg-schema.yaml b/exporter/otlpexporter/cfg-schema.yaml
index 2db95c4d283..073ec5707ee 100644
--- a/exporter/otlpexporter/cfg-schema.yaml
+++ b/exporter/otlpexporter/cfg-schema.yaml
@@ -22,7 +22,7 @@ fields:
       NumConsumers is the number of consumers from the queue.
   - name: queue_size
     kind: int
-    default: 5000
+    default: 1000
     doc: |
       QueueSize is the maximum number of batches allowed in queue at a given time.
 - name: retry_on_failure
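
As an illustration of the sizing guidance in the README hunk above, here is a minimal configuration sketch that is not part of this patch: the otlphttp exporter appears only because the changelog entry names it as affected, the endpoint is a placeholder, and the queue_size value is derived from the documented formula rather than taken from this change.

```yaml
exporters:
  otlphttp:
    # Placeholder endpoint, for illustration only.
    endpoint: https://backend.example.com:4318
    sending_queue:
      enabled: true
      num_consumers: 10
      # Sized with the README formula num_seconds * requests_per_second / requests_per_batch:
      # buffering 60 s of a backend outage at 50 requests/s, one request per batch,
      # gives 60 * 50 / 1 = 3000 batches.
      queue_size: 3000
```

Leaving queue_size unset now falls back to the new default of 1000 batches; with the batch processor's default of 8192 spans per batch, that still allows roughly 8 million buffered spans, which is the memory ceiling this change is meant to rein in.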
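
Along the same lines, a sketch of the persistent queue setup described in the README; it assumes the contrib filestorage extension is available in the distribution, and the directory path is a placeholder.

```yaml
extensions:
  file_storage:
    # Placeholder path; the directory must exist and be writable by the collector.
    directory: /var/lib/otelcol/file_storage

exporters:
  otlphttp:
    endpoint: https://backend.example.com:4318
    sending_queue:
      enabled: true
      # Referencing a storage extension makes the queue persistent; its capacity is
      # still bounded by queue_size, which now defaults to 1000 batches.
      storage: file_storage
```

The extension also needs to be listed under service::extensions for the exporter to be able to use it.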