From 3174b433c18712761a74f8e9c95b0d02eb0ae422 Mon Sep 17 00:00:00 2001 From: Anders Swanson Date: Fri, 26 Sep 2025 12:15:53 -0700 Subject: [PATCH] Hot reload to override individual metrics Signed-off-by: Anders Swanson --- collector/cache.go | 2 +- collector/collector.go | 6 ++-- collector/data_loader.go | 19 +++++++----- collector/database.go | 2 +- collector/default_metrics.go | 6 ++-- collector/metrics.go | 8 +++++ collector/types.go | 2 +- site/docs/configuration/custom-metrics.md | 38 +++++++++++++++++++++++ site/docs/releases/changelog.md | 1 + 9 files changed, 67 insertions(+), 17 deletions(-) diff --git a/collector/cache.go b/collector/cache.go index 741ed62b..226d05ca 100644 --- a/collector/cache.go +++ b/collector/cache.go @@ -8,7 +8,7 @@ import ( "time" ) -func NewMetricsCache(metrics []*Metric) *MetricsCache { +func NewMetricsCache(metrics map[string]*Metric) *MetricsCache { c := map[*Metric]*MetricCacheRecord{} for _, metric := range metrics { diff --git a/collector/collector.go b/collector/collector.go index 5767e975..8aae30ab 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -252,7 +252,7 @@ func (e *Exporter) scrapeDatabase(ch chan<- prometheus.Metric, errChan chan<- er e.logger.Debug("Successfully pinged Oracle database: "+maskDsn(d.Config.URL), "database", d.Name) metricsToScrape := 0 - for _, metric := range e.metricsToScrape.Metric { + for _, metric := range e.metricsToScrape { metric := metric //https://golang.org/doc/faq#closures_and_goroutines isScrapeMetric := e.isScrapeMetric(tick, metric, d) metricsToScrape++ @@ -324,7 +324,7 @@ func (e *Exporter) scrapeDatabase(ch chan<- prometheus.Metric, errChan chan<- er func (e *Exporter) scrape(ch chan<- prometheus.Metric, tick *time.Time) { e.totalScrapes.Inc() - errChan := make(chan error, len(e.metricsToScrape.Metric)*len(e.databases)) + errChan := make(chan error, len(e.metricsToScrape)*len(e.databases)) begun := time.Now() if e.checkIfMetricsChanged() { e.reloadMetrics() @@ 
-529,7 +529,7 @@ func (e *Exporter) generatePrometheusMetrics(d *Database, parse func(row map[str func (e *Exporter) initCache() { for _, d := range e.databases { - d.initCache(e.metricsToScrape.Metric) + d.initCache(e.metricsToScrape) } } diff --git a/collector/data_loader.go b/collector/data_loader.go index cc0fe51f..6574ad7c 100644 --- a/collector/data_loader.go +++ b/collector/data_loader.go @@ -14,12 +14,8 @@ import ( ) func (e *Exporter) reloadMetrics() { - // Truncate metricsToScrape - e.metricsToScrape.Metric = []*Metric{} - - // Load default metrics - defaultMetrics := e.DefaultMetrics() - e.metricsToScrape.Metric = defaultMetrics.Metric + // reload default metrics + e.metricsToScrape = e.DefaultMetrics() // If custom metrics, load it if len(e.CustomMetricsFiles()) > 0 { @@ -32,8 +28,9 @@ func (e *Exporter) reloadMetrics() { } else { e.logger.Info("Successfully loaded custom metrics from " + _customMetrics) } - - e.metricsToScrape.Metric = append(e.metricsToScrape.Metric, metrics.Metric...) + // Merge custom metrics into default metrics. + // Any collisions (by ID) will overwrite the old metric value. + e.merge(metrics) } } else { e.logger.Debug("No custom metrics defined.") @@ -41,6 +38,12 @@ func (e *Exporter) reloadMetrics() { e.initCache() } +func (e *Exporter) merge(metrics *Metrics) { + for _, metric := range metrics.Metric { + e.metricsToScrape[metric.ID()] = metric + } +} + func loadYamlMetricsConfig(_metricsFileName string, metrics *Metrics) error { yamlBytes, err := os.ReadFile(_metricsFileName) if err != nil { diff --git a/collector/database.go b/collector/database.go index dbc4bbfb..52b25ece 100644 --- a/collector/database.go +++ b/collector/database.go @@ -98,7 +98,7 @@ func NewDatabase(logger *slog.Logger, dbname string, dbconfig DatabaseConfig) *D } // initCache resets the metrics cached. Used on startup and when metrics are reloaded. 
-func (d *Database) initCache(metrics []*Metric) { +func (d *Database) initCache(metrics map[string]*Metric) { d.MetricsCache = NewMetricsCache(metrics) } diff --git a/collector/default_metrics.go b/collector/default_metrics.go index 5e5111b2..da02d221 100644 --- a/collector/default_metrics.go +++ b/collector/default_metrics.go @@ -17,19 +17,19 @@ import ( var defaultMetricsToml string // DefaultMetrics is a somewhat hacky way to load the default metrics -func (e *Exporter) DefaultMetrics() Metrics { +func (e *Exporter) DefaultMetrics() map[string]*Metric { var metricsToScrape Metrics if e.Metrics.Default != "" { if err := loadMetricsConfig(filepath.Clean(e.Metrics.Default), &metricsToScrape); err != nil { e.logger.Error(fmt.Sprintf("there was an issue while loading specified default metrics file at: "+e.Metrics.Default+", proceeding to run with default metrics."), "error", err) } - return metricsToScrape + return metricsToScrape.toMap() } if _, err := toml.Decode(defaultMetricsToml, &metricsToScrape); err != nil { e.logger.Error("failed to load default metrics", "error", err) panic(errors.New("Error while loading " + defaultMetricsToml)) } - return metricsToScrape + return metricsToScrape.toMap() } diff --git a/collector/metrics.go b/collector/metrics.go index 0271c696..55aae25c 100644 --- a/collector/metrics.go +++ b/collector/metrics.go @@ -112,3 +112,11 @@ func (m *Metric) IsEnabledForDatabase(d *Database) bool { } return false } + +func (metrics Metrics) toMap() map[string]*Metric { + m := map[string]*Metric{} + for _, metric := range metrics.Metric { + m[metric.ID()] = metric + } + return m +} diff --git a/collector/types.go b/collector/types.go index 003149bc..721a040c 100644 --- a/collector/types.go +++ b/collector/types.go @@ -15,7 +15,7 @@ import ( type Exporter struct { *MetricsConfiguration mu *sync.Mutex - metricsToScrape Metrics + metricsToScrape map[string]*Metric duration, error prometheus.Gauge totalScrapes prometheus.Counter scrapeErrors 
*prometheus.CounterVec diff --git a/site/docs/configuration/custom-metrics.md b/site/docs/configuration/custom-metrics.md index b33d6355..6e202eb1 100644 --- a/site/docs/configuration/custom-metrics.md +++ b/site/docs/configuration/custom-metrics.md @@ -23,6 +23,10 @@ metrics: You may also use `--custom.metrics` flag followed by a comma separated list of TOML or YAML files, or export `CUSTOM_METRICS` variable environment (`export CUSTOM_METRICS=my-custom-metrics.toml,my-other-custom-metrics.toml`) +### Metric Hot Reload + +The exporter watches the configured custom metrics files for changes. When any of these files change, the exporter hot reloads the metric definitions and serves the new metrics on the next scrape. + ### Metric Schema Metrics files must contain a series of `[[metric]]` definitions, in TOML, or the equivalent definition in a YAML file. Each metric definition must follow the exporter's metric schema: @@ -123,6 +127,40 @@ oracledb_test_value_2 2 You can find [working examples](https://github.com/oracle/oracle-db-appdev-monitoring/blob/main/custom-metrics-example/custom-metrics.toml) of custom metrics for slow queries, big queries and top 100 tables. An example of [custom metrics for Transacational Event Queues](https://github.com/oracle/oracle-db-appdev-monitoring/blob/main/custom-metrics-example/txeventq-metrics.toml) is also provided. +#### Override Existing, Individual Metrics + +You may override properties for existing metrics by supplying a new, custom metric definition with the same `context` and `metricsdesc` values. For example, if you have an existing metric like so: + +```toml +[[metric]] +context = "my_default_metric" +metricsdesc = { value_1 = "Simple example returning always 1.", value_2 = "Same but returning always 2." } +request = "SELECT 1 as value_1, 2 as value_2 FROM DUAL" +``` + +You can redefine this metric in a custom metrics file to change any properties other than `context` or `metricsdesc`. 
For example, overriding the previous metric with `labels`, `scrapeinterval`, and `querytimeout` properties: + +```toml +[[metric]] +context = "my_default_metric" +metricsdesc = { value_1 = "Simple example returning always 1.", value_2 = "Same but returning always 2." } +labels = [ "label_1", "label_2" ] +request = "SELECT 1 as value_1, 2 as value_2 FROM DUAL" +scrapeinterval = "30s" +querytimeout = "10s" +``` + +Then, provide any metrics overrides as custom metrics files in the [exporter configuration file](config-file.md): + +```yaml +metrics: + ## Paths to any custom metrics files + custom: + - my-custom-metrics.toml +``` + +If any metric appears more than once in the custom metrics file list, the metric definition in the last file provided takes precedence. + ### YAML Metrics Metrics may be defined with YAML instead of TOML. YAML metric field names correspond to TOML metric field names. diff --git a/site/docs/releases/changelog.md b/site/docs/releases/changelog.md index 45f8973f..b9ddf3ff 100644 --- a/site/docs/releases/changelog.md +++ b/site/docs/releases/changelog.md @@ -13,6 +13,7 @@ Our current priorities to support metrics for advanced database features and use - Updated project dependencies. - Standardize multi-arch builds and document supported database versions. +- The metrics override capability is extended, allowing users to redefine individual existing metrics in custom metrics files. This allows users to modify individual default metrics without wholly replacing the default metrics file. - If the exporter fails to connect to a database due to invalid or locked credentials (ORA-01017 or ORA-28000 errors), that database configuration will be invalidated and the exporter will not attempt to re-establish the database connection. Other databases will continue to be scraped. - Metrics with an empty databases array (`databases = []`) are now considered disabled, and will not be scraped. 
- Increased the default query timeout for the `top_sql` metric to 10 seconds (previously 5 seconds).