Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions files/prometheus_alerts.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
sum by (cluster) (es_cluster_status == 2)
"for": 7m
"labels":
"namespace": openshift-logging
"severity": critical

- "alert": ElasticsearchClusterNotHealthy
Expand All @@ -22,6 +23,7 @@
sum by (cluster) (es_cluster_status == 1)
"for": 20m
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchWriteRequestsRejectionJumps
Expand All @@ -33,6 +35,7 @@
round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5
"for": 10m
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -51,6 +54,7 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
"for": 5m
"labels":
"namespace": openshift-logging
"severity": info

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -69,6 +73,7 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
"for": 5m
"labels":
"namespace": openshift-logging
"severity": critical

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -87,6 +92,7 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
"for": 5m
"labels":
"namespace": openshift-logging
"severity": critical

- "alert": ElasticsearchJVMHeapUseHigh
Expand All @@ -98,6 +104,7 @@
sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
"for": 10m
"labels":
"namespace": openshift-logging
"severity": info

- "alert": AggregatedLoggingSystemCPUHigh
Expand All @@ -109,6 +116,7 @@
sum by (cluster, instance, node) (es_os_cpu_percent) > 90
"for": 1m
"labels":
"namespace": openshift-logging
"severity": info

- "alert": ElasticsearchProcessCPUHigh
Expand All @@ -120,6 +128,7 @@
sum by (cluster, instance, node) (es_process_cpu_percent) > 90
"for": 1m
"labels":
"namespace": openshift-logging
"severity": info

- "alert": ElasticsearchDiskSpaceRunningLow
Expand All @@ -131,6 +140,7 @@
sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0
"for": 1h
"labels":
"namespace": openshift-logging
"severity": critical

- "alert": ElasticsearchHighFileDescriptorUsage
Expand All @@ -142,6 +152,7 @@
predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0
"for": 10m
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchOperatorCSVNotSuccessful
Expand All @@ -152,6 +163,7 @@
csv_succeeded{name =~ "elasticsearch-operator.*"} == 0
"for": 10m
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -170,6 +182,7 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -188,6 +201,7 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning

- "alert": ElasticsearchNodeDiskWatermarkReached
Expand All @@ -206,4 +220,5 @@
) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
"for": 1h
"labels":
"namespace": openshift-logging
"severity": warning
8 changes: 8 additions & 0 deletions test/files/prometheus-unit-tests/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ tests:
exp_alerts:
- exp_labels:
cluster: elasticsearch
namespace: openshift-logging
severity: warning
exp_annotations:
summary: "Cluster health status is YELLOW"
Expand All @@ -79,6 +80,7 @@ tests:
exp_alerts:
- exp_labels:
cluster: elasticsearch
namespace: openshift-logging
severity: critical
exp_annotations:
summary: "Cluster health status is RED"
Expand All @@ -98,6 +100,7 @@ tests:
cluster: elasticsearch
instance: localhost:9090
node: elasticsearch-cdm-1
namespace: openshift-logging
severity: warning
exp_annotations:
summary: "High Write Rejection Ratio - 10%"
Expand All @@ -117,6 +120,7 @@ tests:
- exp_labels:
instance: localhost:9090
pod: pod-1
namespace: openshift-logging
severity: info
exp_annotations:
summary: "Disk Low Watermark Reached - disk saturation is 90%"
Expand All @@ -130,6 +134,7 @@ tests:
- exp_labels:
instance: localhost:9091
pod: pod-2
namespace: openshift-logging
severity: warning
exp_annotations:
summary: "Disk Low Watermark is predicted to be reached within next 6h."
Expand All @@ -144,6 +149,7 @@ tests:
cluster: elasticsearch
instance: localhost:9090
node: elasticsearch-cdm-1
namespace: openshift-logging
severity: info
exp_annotations:
summary: "System CPU usage is high"
Expand All @@ -163,6 +169,7 @@ tests:
cluster: elasticsearch
instance: localhost:9090
node: elasticsearch-cdm-1
namespace: openshift-logging
severity: info
exp_annotations:
summary: "ES process CPU usage is high"
Expand All @@ -180,6 +187,7 @@ tests:
exp_alerts:
- exp_labels:
name: elasticsearch-operator.currentversion-builddate
namespace: openshift-logging
severity: warning
exp_annotations:
summary: "Elasticsearch Operator CSV Not Successful"
Expand Down