From 7ca1656d436e07843855b56a8bdce681e8872dea Mon Sep 17 00:00:00 2001 From: John Roche Date: Thu, 17 Dec 2020 13:31:18 +1000 Subject: [PATCH 1/2] Change ElasticsearchClusterNotHealthy to 7 minutes --- files/prometheus_alerts.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/files/prometheus_alerts.yml b/files/prometheus_alerts.yml index 5dc0164fe..5cf7d8394 100644 --- a/files/prometheus_alerts.yml +++ b/files/prometheus_alerts.yml @@ -4,11 +4,11 @@ "rules": - "alert": "ElasticsearchClusterNotHealthy" "annotations": - "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red" + "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red" "summary": "Cluster health status is RED" "expr": | sum by (cluster) (es_cluster_status == 2) - "for": "2m" + "for": "7m" "labels": "severity": "critical" From 40a68d8ba45224a8c89ef26067a6c7f09500943e Mon Sep 17 00:00:00 2001 From: John Roche Date: Thu, 17 Dec 2020 14:03:58 +1000 Subject: [PATCH 2/2] Fix test --- test/files/prometheus-unit-tests/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/files/prometheus-unit-tests/test.yml b/test/files/prometheus-unit-tests/test.yml index fa881dc88..9729440ad 100644 --- a/test/files/prometheus-unit-tests/test.yml +++ b/test/files/prometheus-unit-tests/test.yml @@ -60,7 +60,7 @@ tests: message: "Cluster elasticsearch health status has been YELLOW for at least 20m. Some shard replicas are not allocated. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Healthy-is-Yellow" # --------- ElasticsearchClusterNotHealthy (red) --------- - - eval_time: 33m + - eval_time: 38m alertname: ElasticsearchClusterNotHealthy exp_alerts: - exp_labels: @@ -68,7 +68,7 @@ tests: severity: critical exp_annotations: summary: "Cluster health status is RED" - message: "Cluster elasticsearch health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red" + message: "Cluster elasticsearch health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red" # --------- ElasticsearchWriteRequestsRejectionJumps --------- # Within the first 10m the percent of rejected requests is = 5% (the alert require > 5%)