From 247d705843f46b5de8da08dcc034e5eeeadb4da6 Mon Sep 17 00:00:00 2001
From: John Roche
Date: Thu, 17 Dec 2020 13:31:18 +1000
Subject: [PATCH] Bug 1908959: Change ElasticsearchClusterNotHealthy to 7 minutes

(cherry picked from commit 7ca1656d436e07843855b56a8bdce681e8872dea)
---
 files/prometheus_alerts.yml               |   4 +-
 test/files/prometheus-unit-tests/test.yml | 144 ++++++++++++++++++++++
 2 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 test/files/prometheus-unit-tests/test.yml

diff --git a/files/prometheus_alerts.yml b/files/prometheus_alerts.yml
index 3f1788227..47a340191 100644
--- a/files/prometheus_alerts.yml
+++ b/files/prometheus_alerts.yml
@@ -4,11 +4,11 @@ "rules":
   - "alert": "ElasticsearchClusterNotHealthy"
     "annotations":
-      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
+      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red"
       "summary": "Cluster health status is RED"
     "expr": |
       sum by (cluster) (es_cluster_status == 2)
-    "for": "2m"
+    "for": "7m"
     "labels":
       "severity": "critical"
diff --git a/test/files/prometheus-unit-tests/test.yml b/test/files/prometheus-unit-tests/test.yml
new file mode 100644
index 000000000..9729440ad
--- /dev/null
+++ b/test/files/prometheus-unit-tests/test.yml
@@ -0,0 +1,144 @@
+rule_files:
+  - ../../../files/prometheus_recording_rules.yml
+  - ../../../files/prometheus_alerts.yml
+
+evaluation_interval: 1m
+
+tests:
+  - interval: 1m
+
+    input_series:
+      - series: 'es_cluster_status{cluster="elasticsearch"}'
+        values: '0+0x9 1+0x20 2+0x10'
+      - series: 'es_os_cpu_percent{cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}'
+        values: '10+10x8 95+0x100' # 10 20 30 40 50 60 70 80 90 -- 95 (100x)
+      - series: 'es_process_cpu_percent{cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}'
+        values: '10+10x8 95+0x100' # 10 20 30 40 50 60 70 80 90 -- 95 (100x)
+
+      # Rejected indexing requests simulation (note: this simulation also verifies all recording rules)
+      # The number of rejected write requests grows at a constant pace for 10 minutes
+      # and then the pattern repeats. This gives us two 10m segments of the series to test on.
+      # Interestingly, when the segments were exactly 10m long (constructed like: 1+0x9) the test was
+      # non-deterministic and failed randomly.
+      - series: 'es_threadpool_threads_count{name="write", type="rejected", cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}'
+        values: '1+1x10 1+1x10'
+      # The number of completed write requests grows in two segments. The first one grows fast enough
+      # not to trigger the alert. The second segment grows more slowly and makes the alert fire.
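+      # Concretely, assuming the recording rules compare the rejected rate against the completed rate:
+      # the first segment adds roughly 1 rejected per 20 completed requests each minute (a 5% ratio,
+      # not above the 5% threshold), while the second adds 1 rejected per 10 completed (10%), which is
+      # the value the ElasticsearchWriteRequestsRejectionJumps expectations below check for.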
+      - series: 'es_threadpool_threads_count{name="write", type="completed", cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}'
+        values: '1+20x10 1+10x10'
+
+      # Disk Watermark levels simulation
+      # We set the low watermark level to 85%
+      - series: 'es_cluster_routing_allocation_disk_watermark_low_pct{instance="localhost:9090",pod="pod-1"}'
+        values: '85+0x9 85+0x9'
+      # The total disk space is constant (10)
+      - series: 'es_fs_path_total_bytes{instance="localhost:9090",pod="pod-1"}'
+        values: '10+0x9 10+0x9'
+      # The remaining space on the disk drops from 10 to 1 within the first 10 minutes
+      # and then stays at 1 for another 10 minutes.
+      - series: 'es_fs_path_available_bytes{instance="localhost:9090",pod="pod-1"}'
+        values: '10-1x9 1+0x10'
+
+
+    # Unit tests for alerting rules.
+    alert_rule_test:
+
+      # For the first 10m the cluster is green
+      - eval_time: 5m
+        alertname: ElasticsearchClusterNotHealthy
+        exp_alerts:
+
+      # --------- ElasticsearchClusterNotHealthy (yellow) ---------
+      - eval_time: 30m
+        alertname: ElasticsearchClusterNotHealthy
+        exp_alerts:
+          - exp_labels:
+              cluster: elasticsearch
+              severity: warning
+            exp_annotations:
+              summary: "Cluster health status is YELLOW"
+              message: "Cluster elasticsearch health status has been YELLOW for at least 20m. Some shard replicas are not allocated. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Healthy-is-Yellow"
+
+      # --------- ElasticsearchClusterNotHealthy (red) ---------
+      - eval_time: 38m
+        alertname: ElasticsearchClusterNotHealthy
+        exp_alerts:
+          - exp_labels:
+              cluster: elasticsearch
+              severity: critical
+            exp_annotations:
+              summary: "Cluster health status is RED"
+              message: "Cluster elasticsearch health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Cluster-Health-is-Red"
+
+      # --------- ElasticsearchWriteRequestsRejectionJumps ---------
+      # Within the first 10m the percentage of rejected requests is exactly 5% (the alert requires > 5%)
+      - eval_time: 11m
+        alertname: ElasticsearchWriteRequestsRejectionJumps
+        exp_alerts:
+
+      - eval_time: 22m
+        alertname: ElasticsearchWriteRequestsRejectionJumps
+        exp_alerts:
+          - exp_labels:
+              cluster: elasticsearch
+              instance: localhost:9090
+              node: elasticsearch-cdm-1
+              severity: warning
+            exp_annotations:
+              summary: "High Write Rejection Ratio - 10%"
+              message: "High Write Rejection Ratio at elasticsearch-cdm-1 node in elasticsearch cluster. This node may not be keeping up with the indexing speed. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Write-Requests-Rejection-Jumps"
+
+      # --------- ElasticsearchNodeDiskWatermarkReached ---------
+      # By the end of the 10th minute we do not expect the low watermark to have been active for more than 5 minutes.
+      - eval_time: 10m
+        alertname: ElasticsearchNodeDiskWatermarkReached
+        exp_alerts:
+
+      # By the end of the 15th minute we do expect the low watermark to have been active for more than 5 minutes.
+      - eval_time: 15m
+        alertname: ElasticsearchNodeDiskWatermarkReached
+        exp_alerts:
+          - exp_labels:
+              instance: localhost:9090
+              pod: pod-1
+              severity: info
+            exp_annotations:
+              summary: "Disk Low Watermark Reached - disk saturation is 90%"
+              message: "Disk Low Watermark Reached at pod-1 pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Node-Disk-Low-Watermark-Reached"
+
+      # --------- AggregatedLoggingSystemCPUHigh ---------
+      - eval_time: 15m
+        alertname: AggregatedLoggingSystemCPUHigh
+        exp_alerts:
+          - exp_labels:
+              cluster: elasticsearch
+              instance: localhost:9090
+              node: elasticsearch-cdm-1
+              severity: alert
+            exp_annotations:
+              summary: "System CPU usage is high"
+              message: "System CPU usage on the node elasticsearch-cdm-1 in elasticsearch cluster is 95%. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Aggregated-Logging-System-CPU-is-High"
+
+      # Critical value not reached - no alert is fired
+      - eval_time: 5m
+        alertname: AggregatedLoggingSystemCPUHigh
+        exp_alerts:
+
+      # --------- ElasticsearchProcessCPUHigh ---------
+      - eval_time: 15m
+        alertname: ElasticsearchProcessCPUHigh
+        exp_alerts:
+          - exp_labels:
+              cluster: elasticsearch
+              instance: localhost:9090
+              node: elasticsearch-cdm-1
+              severity: alert
+            exp_annotations:
+              summary: "ES process CPU usage is high"
+              message: "ES process CPU usage on the node elasticsearch-cdm-1 in elasticsearch cluster is 95%. For more information refer to https://github.com/openshift/elasticsearch-operator/blob/master/docs/alerts.md.md#Elasticsearch-Process-CPU-is-High"
+
+      # Critical value not reached - no alert is fired
+      - eval_time: 5m
+        alertname: ElasticsearchProcessCPUHigh
+        exp_alerts:
+
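A promtool-style rule test file such as the one added above is typically exercised with promtool's rule unit tester, for example (assuming promtool is installed and the command is run from test/files/prometheus-unit-tests/ so that the relative rule_files paths resolve):

    promtool test rules test.yml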