Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions files/prometheus_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
"rules":
- "alert": "ElasticsearchClusterNotHealthy"
"annotations":
"message": "Cluster {{ $labels.cluster }} health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"message": "Cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
"summary": "Cluster health status is RED"
"expr": |
sum by (cluster) (es_cluster_status == 2)
(count((sum by (cluster) (es_cluster_status) == 2)) and count(csv_succeeded{name=~"elasticsearch-operator.*"} == 1)) == 1
"for": "2m"
"labels":
"severity": "critical"
Expand Down
7 changes: 4 additions & 3 deletions test/files/prometheus-unit-tests/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ tests:
- series: 'es_process_cpu_percent{cluster="elasticsearch", instance="localhost:9090", node="elasticsearch-cdm-1"}'
values: '10+10x8 95+0x100' # 10 20 30 40 50 60 70 80 90 -- 95 (100x)

- series: 'csv_succeeded{name="elasticsearch-operator.currentversion-builddate"}'
values: '1+0x99' # flag as successful for the whole run

# Rejected indexing requests simulation (note: this simulation also verifies all recording rules)
# Number of rejected write requests grows at a constant pace for 10 minutes
# and then we repeat this pattern again. This gives us two 10m segments of the series to test on.
Expand Down Expand Up @@ -64,11 +67,10 @@ tests:
alertname: ElasticsearchClusterNotHealthy
exp_alerts:
- exp_labels:
cluster: elasticsearch
severity: critical
exp_annotations:
summary: "Cluster health status is RED"
message: "Cluster elasticsearch health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
message: "Elasticserch cluster health status has been RED for at least 2m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."

# --------- ElasticsearchWriteRequestsRejectionJumps ---------
# Within the first 10m the percent of rejected requests is = 5% (the alert requires > 5%)
Expand Down Expand Up @@ -141,4 +143,3 @@ tests:
- eval_time: 5m
alertname: ElasticsearchProcessCPUHigh
exp_alerts: