From 0e08a1c405d84c8973c6e1d79d4fa91ded2335e5 Mon Sep 17 00:00:00 2001 From: Paulo Janotti Date: Mon, 30 Oct 2023 21:54:35 -0700 Subject: [PATCH] [receiver/hostmetrics] Fix panic in load_scraper_windows when stopping (#28678) **Description:** Fix a panic when the load scraper for Windows is stopped before being started. This can happen when the collector fails at startup. In this case the components are shutdown even if they were not started. This was encountered in real world usage. ```terminal 2023-10-23T13:13:23.137-0500 info service@v0.86.0/service.go:170 Starting shutdown... 2023-10-23T13:13:23.138-0500 info healthcheck/handler.go:132 Health Check state change {"kind": "extension", "name": "health_check", "status": "unavailable"} panic: runtime error: invalid memory address or nil pointer dereference [signal 0xc0000005 code=0x0 addr=0x0 pc=0x30c4028] goroutine 1 [running]: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/loadscraper.stopSampling({0x0?, 0x6000103?}) github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver@v0.86.0/internal/scraper/loadscraper/load_scraper_windows.go:145 +0xc8 github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/scraper/loadscraper.(*scraper).shutdown(...) github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver@v0.86.0/internal/scraper/loadscraper/load_scraper.go:78 go.opentelemetry.io/collector/component.ShutdownFunc.Shutdown(...) go.opentelemetry.io/collector/component@v0.86.0/component.go:84 go.opentelemetry.io/collector/receiver/scraperhelper.(*controller).Shutdown(0xc0000c3a40, {0x71b0d50, 0xc00006c0e0}) go.opentelemetry.io/collector/receiver@v0.86.0/scraperhelper/scrapercontroller.go:149 +0x97 go.opentelemetry.io/collector/service/internal/graph.(*Graph).ShutdownAll(0x0?, {0x71b0d50, 0xc00006c0e0}) go.opentelemetry.io/collector/service@v0.86.0/internal/graph/graph.go:358 +0xc9 go.opentelemetry.io/collector/service.(*Service).Shutdown(0xc0008373b0, {0x71b0d50, 0xc00006c0e0}) go.opentelemetry.io/collector/service@v0.86.0/service.go:176 +0xd4 go.opentelemetry.io/collector/otelcol.(*Collector).setupConfigurationComponents(0xc000dc6180, {0x71b0d50, 0xc00006c0e0}) go.opentelemetry.io/collector/otelcol@v0.86.0/collector.go:187 +0x708 go.opentelemetry.io/collector/otelcol.(*Collector).Run(0xc000dc6180, {0x71b0d50, 0xc00006c0e0}) go.opentelemetry.io/collector/otelcol@v0.86.0/collector.go:221 +0x65 go.opentelemetry.io/collector/otelcol.NewCommand.func1(0xc00229cf00, {0x6636c91?, 0x0?, 0x3?}) go.opentelemetry.io/collector/otelcol@v0.86.0/command.go:27 +0x96 github.com/spf13/cobra.(*Command).execute(0xc00229cf00, {0xc0000ac050, 0x0, 0x3}) github.com/spf13/cobra@v1.7.0/command.go:940 +0x862 github.com/spf13/cobra.(*Command).ExecuteC(0xc00229cf00) github.com/spf13/cobra@v1.7.0/command.go:1068 +0x3bd github.com/spf13/cobra.(*Command).Execute(0xc0022dd860?) github.com/spf13/cobra@v1.7.0/command.go:992 +0x19 main.runInteractive({{0xc0022dd860, 0xc0022ddad0, 0xc0022dda10, 0xc0022dd5f0, 0xc0022ddb00}, {{0x663630d, 0x7}, {0x0, 0x0}, {0x713fdf8, ...}}, ...}) github.com/signalfx/splunk-otel-collector/cmd/otelcol/main.go:100 +0x5d main.run({{0xc0022dd860, 0xc0022ddad0, 0xc0022dda10, 0xc0022dd5f0, 0xc0022ddb00}, {{0x663630d, 0x7}, {0x0, 0x0}, {0x713fdf8, ...}}, ...}) github.com/signalfx/splunk-otel-collector/cmd/otelcol/main_windows.go:33 +0x58 main.main() github.com/signalfx/splunk-otel-collector/cmd/otelcol/main.go:93 +0xcba ``` **Link to tracking Issue:** N/A **Testing:** Local test runs. **Documentation:** N/A --- ...anic-on-load_scraper_windows-shutdown.yaml | 27 +++++++++++++++++++ .../loadscraper/load_scraper_windows.go | 6 +++++ .../loadscraper/load_scraper_windows_test.go | 7 +++++ 3 files changed, 40 insertions(+) create mode 100644 .chloggen/fix-panic-on-load_scraper_windows-shutdown.yaml diff --git a/.chloggen/fix-panic-on-load_scraper_windows-shutdown.yaml b/.chloggen/fix-panic-on-load_scraper_windows-shutdown.yaml new file mode 100644 index 0000000000000..eb74cb7dc862e --- /dev/null +++ b/.chloggen/fix-panic-on-load_scraper_windows-shutdown.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: receiver/hostmetrics + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Fix panic on load_scraper_windows shutdown + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [28678] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows.go b/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows.go index c7f972f786726..be28feb29e1da 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows.go +++ b/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows.go @@ -136,12 +136,18 @@ func stopSampling(_ context.Context) error { startupLock.Lock() defer startupLock.Unlock() + if scraperCount == 0 { + // no load scraper is running nothing to do + return nil + } + // only stop sampling if all load scrapers have been closed scraperCount-- if scraperCount > 0 { return nil } + // no more load scrapers are running, stop the sampler close(samplerInstance.done) return nil } diff --git a/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows_test.go b/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows_test.go index 577690df8a17e..55c3bdd53fec0 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows_test.go +++ b/receiver/hostmetricsreceiver/internal/scraper/loadscraper/load_scraper_windows_test.go @@ -18,6 +18,13 @@ import ( "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver/internal/perfcounters" ) +func TestStopSamplingWithoutStart(t *testing.T) { + // When the collector fails to start it is possible that stopSampling is called + // before startSampling. This test ensures that stopSampling does not panic in + // this scenario. + require.NoError(t, stopSampling(context.Background())) +} + func TestStartSampling(t *testing.T) { t.Skip(t, "Test is causing race conditions, see https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/10143.") // override sampling frequency to 2ms