From 14dc849df4a3218e39a98a5557f67d67eb972d39 Mon Sep 17 00:00:00 2001 From: Alex Chvatal Date: Tue, 24 Nov 2020 12:52:31 -0500 Subject: [PATCH 1/4] only alert for ES being red if the csv succeeded During the rollout of a new logging-operator version, ES can take some time before it goes green again. This patch only causes the alert for ES being red to fire if the elasticsearch-operator csv succeeded. If the csv is actively rolling out, ES being red should be expected and we have other monitoring for when a csv is abnormal. --- files/prometheus_alerts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/prometheus_alerts.yml b/files/prometheus_alerts.yml index dc444a684..185fd18f4 100644 --- a/files/prometheus_alerts.yml +++ b/files/prometheus_alerts.yml @@ -7,7 +7,7 @@ "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." "summary": "Cluster health status is RED" "expr": | - sum by (cluster) (es_cluster_status == 2) + count((sum by (cluster) (es_cluster_status == 2))) + count(csv_succeeded{name=~"elasticsearch-operator.*"} == 1) >= 2 "for": "2m" "labels": "severity": "critical" From 9f58c25f953a8932f9d8ea4b254d75944b78e603 Mon Sep 17 00:00:00 2001 From: Alex Chvatal Date: Wed, 25 Nov 2020 09:34:37 -0500 Subject: [PATCH 2/4] clarify the ElasticsearchClusterNotHealthy alerting rule --- files/prometheus_alerts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/files/prometheus_alerts.yml b/files/prometheus_alerts.yml index 185fd18f4..7a8b1d96f 100644 --- a/files/prometheus_alerts.yml +++ b/files/prometheus_alerts.yml @@ -7,7 +7,7 @@ "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
"summary": "Cluster health status is RED" "expr": | - count((sum by (cluster) (es_cluster_status == 2))) + count(csv_succeeded{name=~"elasticsearch-operator.*"} == 1) >= 2 + (count((sum by (cluster) (es_cluster_status) == 2)) and count(csv_succeeded{name=~"elasticsearch-operator.*"} == 1)) == 1 "for": "2m" "labels": "severity": "critical" From 6bafdeb2857ec6b19550cb52c3293c28da6da9bc Mon Sep 17 00:00:00 2001 From: Alex Chvatal Date: Wed, 25 Nov 2020 15:09:00 -0500 Subject: [PATCH 3/4] update unit tests for when ElasticsearchClusterNotHealthy is red --- files/prometheus_alerts.yml | 2 +- test/files/prometheus-unit-tests/test.yml | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/files/prometheus_alerts.yml b/files/prometheus_alerts.yml index 7a8b1d96f..e25949f56 100644 --- a/files/prometheus_alerts.yml +++ b/files/prometheus_alerts.yml @@ -4,7 +4,7 @@ "rules": - "alert": "ElasticsearchClusterNotHealthy" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." + "message": "Cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
"summary": "Cluster health status is RED" "expr": | (count((sum by (cluster) (es_cluster_status) == 2)) and count(csv_succeeded{name=~"elasticsearch-operator.*"} == 1)) == 1 diff --git a/test/files/prometheus-unit-tests/test.yml b/test/files/prometheus-unit-tests/test.yml index aebc8d388..5186d94a1 100644 --- a/test/files/prometheus-unit-tests/test.yml +++ b/test/files/prometheus-unit-tests/test.yml @@ -15,6 +15,9 @@ tests: - series: 'es_process_cpu_percent{cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}' values: '10+10x8 95+0x100' # 10 20 30 40 50 60 70 80 90 -- 95 (100x) + - series: 'csv_succeeded{name="elasticsearch-operator.currentversion-builddate"}' + values: '1+0x99' # flag as successful for the whole run + # Rejected indexing requests simulation (note: this simulation also verifies all recording rules) # Number of rejected write requests grows at constant pace for 10 minutes # and then we repeat this patterns again. This gives us two 10m segments of the series to test on. @@ -64,11 +67,10 @@ tests: alertname: ElasticsearchClusterNotHealthy exp_alerts: - exp_labels: - cluster: elasticsearch severity: critical exp_annotations: summary: "Cluster health status is RED" - message: "Cluster elasticsearch health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." + message: "Cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." 
# --------- ElasticsearchWriteRequestsRejectionJumps --------- # Within the first 10m the percent of rejected requests is = 5% (the alert require > 5%) From aa06a5498239d09f00f25c40986f0fe15315ff1e Mon Sep 17 00:00:00 2001 From: Alex Chvatal Date: Wed, 9 Dec 2020 15:58:56 -0500 Subject: [PATCH 4/4] Update test/files/prometheus-unit-tests/test.yml Co-authored-by: Rick Rackow --- test/files/prometheus-unit-tests/test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/files/prometheus-unit-tests/test.yml b/test/files/prometheus-unit-tests/test.yml index 5186d94a1..6c5a30366 100644 --- a/test/files/prometheus-unit-tests/test.yml +++ b/test/files/prometheus-unit-tests/test.yml @@ -70,7 +70,7 @@ tests: severity: critical exp_annotations: summary: "Cluster health status is RED" - message: "Cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." + message: "Elasticsearch cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." # --------- ElasticsearchWriteRequestsRejectionJumps --------- # Within the first 10m the percent of rejected requests is = 5% (the alert require > 5%) @@ -143,4 +143,3 @@ tests: - eval_time: 5m alertname: ElasticsearchProcessCPUHigh exp_alerts: -