diff --git a/alertlog/alertlog.go b/alertlog/alertlog.go index 6b887d40..d201072f 100644 --- a/alertlog/alertlog.go +++ b/alertlog/alertlog.go @@ -24,6 +24,10 @@ type LogRecord struct { var databaseFailures map[string]int = map[string]int{} func UpdateLog(logDestination string, logger *slog.Logger, d *collector.Database) { + // Do not try to query the alert log if the database configuration is invalid. + if !d.IsValid() { + return + } queryFailures := databaseFailures[d.Name] if queryFailures == 3 { diff --git a/collector/collector.go b/collector/collector.go index c7c34e28..5767e975 100644 --- a/collector/collector.go +++ b/collector/collector.go @@ -237,6 +237,12 @@ func (e *Exporter) scheduledScrape(tick *time.Time) { } func (e *Exporter) scrapeDatabase(ch chan<- prometheus.Metric, errChan chan<- error, d *Database, tick *time.Time) int { + // If the database configuration is invalid, do not attempt to ping or reestablish the database connection. + if !d.IsValid() { + e.logger.Warn("Invalid database configuration, will not attempt reconnection", "database", d.Name) + errChan <- fmt.Errorf("database %s is invalid, will not be scraped", d.Name) + return 1 + } // If ping fails, we will try again on the next iteration of metrics scraping if err := d.ping(e.logger); err != nil { e.logger.Error("Error pinging database", "error", err, "database", d.Name) diff --git a/collector/database.go b/collector/database.go index 90cdc2f8..5ef3894d 100644 --- a/collector/database.go +++ b/collector/database.go @@ -6,6 +6,7 @@ package collector import ( "context" "database/sql" + "errors" "fmt" "github.com/godror/godror" "github.com/godror/godror/dsn" @@ -15,6 +16,10 @@ import ( "time" ) +const ( + ora01017code = 1017 +) + func (d *Database) UpMetric(exporterLabels map[string]string) prometheus.Metric { desc := prometheus.NewDesc( prometheus.BuildFQName(namespace, "", "up"), @@ -49,6 +54,10 @@ func (d *Database) ping(logger *slog.Logger) error { err := d.Session.PingContext(ctx) if 
err != nil { d.Up = 0 + if isInvalidCredentialsError(err) { + d.invalidate() + return err + } // If database is closed, try to reconnect if strings.Contains(err.Error(), "sql: database is closed") { db, dbtype := connect(logger, d.Name, d.Config) @@ -83,6 +92,7 @@ func NewDatabase(logger *slog.Logger, dbname string, dbconfig DatabaseConfig) *D Session: db, Type: dbtype, Config: dbconfig, + Valid: true, } } @@ -127,6 +137,26 @@ func (d *Database) WarmupConnectionPool(logger *slog.Logger) { } } +func (d *Database) IsValid() bool { + return d.Valid +} + +func (d *Database) invalidate() { + d.Valid = false +} + +func isInvalidCredentialsError(err error) bool { + err = errors.Unwrap(err) + if err == nil { + return false + } + oraErr, ok := err.(*godror.OraErr) + if !ok { + return false + } + return oraErr.Code() == ora01017code +} + func connect(logger *slog.Logger, dbname string, dbconfig DatabaseConfig) (*sql.DB, float64) { logger.Debug("Launching connection to "+maskDsn(dbconfig.URL), "database", dbname) diff --git a/collector/types.go b/collector/types.go index 12f2a85b..003149bc 100644 --- a/collector/types.go +++ b/collector/types.go @@ -34,6 +34,8 @@ type Database struct { // MetricsCache holds computed metrics for a database, so these metrics are available on each scrape. // Given a metric's scrape configuration, it may not be computed on the same interval as other metrics. MetricsCache *MetricsCache + + Valid bool } type MetricsCache struct { diff --git a/site/docs/releases/changelog.md b/site/docs/releases/changelog.md index 33cd41b8..e3641f04 100644 --- a/site/docs/releases/changelog.md +++ b/site/docs/releases/changelog.md @@ -13,6 +13,7 @@ Our current priorities to support metrics for advanced database features and use - Updated project dependencies. - Standardize multi-arch builds and document supported database versions. 
+- If the exporter fails to connect to a database due to invalid credentials (ORA-01017 error), that database configuration will be invalidated and the exporter will not attempt to re-establish the database connection. Other databases will continue to be scraped. - Metrics with an empty databases array (`databases = []`) are now considered disabled, and will not be scraped. - Increased the default query timeout for the `top_sql` metric to 10 seconds (previously 5 seconds). - Metrics using the `scrapeinterval` property will no longer be scraped on every request if they have a cached value. This only applies when the metrics exporter is configured to scrape metrics _on request_, rather than on a global interval. diff --git a/site/docs/releases/roadmap.md b/site/docs/releases/roadmap.md index b19aa0ec..a1b89425 100644 --- a/site/docs/releases/roadmap.md +++ b/site/docs/releases/roadmap.md @@ -12,10 +12,10 @@ We welcome input on community-driven features you'd like to see supported. Pleas Currently, we plan to address the following key features: - Provide default Oracle Exadata metrics -- Implement connection storm protection: prevent the exporter from repeatedly connecting when the credentials fail, to prevent a storm of connections causing accounts to be locked across a large number of databases +- Provide default GoldenGate metrics +- Enhance database alert logging and alert log metrics - Provide the option to have the Oracle client outside of the container image, e.g., on a shared volume, - Implement the ability to update the configuration dynamically, i.e., without a restart - Implement support for tracing within the database, e.g., using an execution context ID provide by an external caller - Provide additional pre-built Grafana dashboards, - Integration with Spring Observability, e.g., Micrometer -- Provide additional documentation and samples