apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: openshift-monitoring
spec:
  groups:
  - name: node-exporter.rules
    rules:
    - expr: |
        count without (cpu) (
          count without (mode) (
            node_cpu_seconds_total{job="node-exporter"}
          )
        )
      record: instance:node_num_cpu:sum
    - expr: |
        1 - avg without (cpu, mode) (
          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
        )
      record: instance:node_cpu_utilisation:rate1m
    - expr: |
        (
          node_load1{job="node-exporter"}
        /
          instance:node_num_cpu:sum{job="node-exporter"}
        )
      record: instance:node_load1_per_cpu:ratio
    - expr: |
        1 - (
          node_memory_MemAvailable_bytes{job="node-exporter"}
        /
          node_memory_MemTotal_bytes{job="node-exporter"}
        )
      record: instance:node_memory_utilisation:ratio
    - expr: |
        rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
      record: instance:node_vmstat_pgmajfault:rate1m
    - expr: |
        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_seconds:rate1m
    - expr: |
        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"mmcblk.p.+|nvme.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_receive_drop_excluding_lo:rate1m
    - expr: |
        sum without (device) (
          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
        )
      record: instance:node_network_transmit_drop_excluding_lo:rate1m
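  # kube-apiserver.rules: multi-window burn-rate recording rules for API server read
  # (LIST|GET) and write (POST|PUT|PATCH|DELETE) requests. Each apiserver_request:burnrate<window>
  # series adds requests that exceeded the latency threshold (scope-dependent for reads,
  # 1s for writes) to 5xx errors, divided by the total request rate over the same window.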
- name: kube-apiserver.rules | |
rules: | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate1d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate1h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate2h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate30m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate3d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate5m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) | |
- | |
( | |
( | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) | |
or | |
vector(0) | |
) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) | |
+ | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) | |
) | |
) | |
+ | |
# errors | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) | |
labels: | |
verb: read | |
record: apiserver_request:burnrate6h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate1d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate1h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate2h | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate30m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate3d | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate5m | |
- expr: | | |
( | |
( | |
# too slow | |
sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
- | |
sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) | |
) | |
+ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) | |
) | |
/ | |
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) | |
labels: | |
verb: write | |
record: apiserver_request:burnrate6h | |
- expr: | | |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) | |
labels: | |
verb: read | |
record: code_resource:apiserver_request_total:rate5m | |
- expr: | | |
sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) | |
labels: | |
verb: write | |
record: code_resource:apiserver_request_total:rate5m | |
- expr: | | |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 | |
labels: | |
quantile: "0.99" | |
verb: read | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 | |
labels: | |
quantile: "0.99" | |
verb: write | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile | |
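  # k8s.rules: joins cAdvisor container CPU and memory metrics with kube_pod_info to
  # attach the node label, aggregates per-namespace resource requests for Pending/Running
  # pods, and maps pods to their owning Deployment, DaemonSet, or StatefulSet workload.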
- name: k8s.rules | |
rules: | |
- expr: | | |
sum by (cluster, namespace, pod, container) ( | |
rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) | |
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( | |
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate | |
- expr: | | |
container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_working_set_bytes | |
- expr: | | |
container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_rss | |
- expr: | | |
container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_cache | |
- expr: | | |
container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} | |
* on (namespace, pod) group_left(node) topk by(namespace, pod) (1, | |
max by(namespace, pod, node) (kube_pod_info{node!=""}) | |
) | |
record: node_namespace_pod_container:container_memory_swap | |
- expr: | | |
sum by (namespace) ( | |
sum by (namespace, pod) ( | |
max by (namespace, pod, container) ( | |
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} | |
) * on(namespace, pod) group_left() max by (namespace, pod) ( | |
kube_pod_status_phase{phase=~"Pending|Running"} == 1 | |
) | |
) | |
) | |
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum | |
- expr: | | |
sum by (namespace) ( | |
sum by (namespace, pod) ( | |
max by (namespace, pod, container) ( | |
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} | |
) * on(namespace, pod) group_left() max by (namespace, pod) ( | |
kube_pod_status_phase{phase=~"Pending|Running"} == 1 | |
) | |
) | |
) | |
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, | |
"replicaset", "$1", "owner_name", "(.*)" | |
) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( | |
1, max by (replicaset, namespace, owner_name) ( | |
kube_replicaset_owner{job="kube-state-metrics"} | |
) | |
), | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: deployment | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: daemonset | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
- expr: | | |
max by (cluster, namespace, workload, pod) ( | |
label_replace( | |
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, | |
"workload", "$1", "owner_name", "(.*)" | |
) | |
) | |
labels: | |
workload_type: statefulset | |
record: namespace_workload_pod:kube_pod_owner:relabel | |
- name: kube-scheduler.rules | |
rules: | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.99" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.9" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile | |
- expr: | | |
histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="scheduler"}[5m])) without(instance, pod)) | |
labels: | |
quantile: "0.5" | |
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile | |
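  # node.rules: maps pods to nodes via kube_pod_info, derives the CPU count per node,
  # and records cluster-wide available memory.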
  - name: node.rules
    rules:
    - expr: |
        topk by(namespace, pod) (1,
          max by (node, namespace, pod) (
            label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
        ))
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (cluster, node) (sum by (node, cpu) (
          node_cpu_seconds_total{job="node-exporter"}
        * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        sum(
          node_memory_MemAvailable_bytes{job="node-exporter"} or
          (
            node_memory_Buffers_bytes{job="node-exporter"} +
            node_memory_Cached_bytes{job="node-exporter"} +
            node_memory_MemFree_bytes{job="node-exporter"} +
            node_memory_Slab_bytes{job="node-exporter"}
          )
        ) by (cluster)
      record: :node_memory_MemAvailable_bytes:sum
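  # kubelet.rules: 0.5/0.9/0.99 quantiles of the kubelet PLEG relist duration per node.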
  - name: kubelet.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.99"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.9"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
      labels:
        quantile: "0.5"
      record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
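  # kube-prometheus-node-recording.rules: per-instance and cluster-wide CPU and network
  # rate aggregations.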
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
        BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
        BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total)
        BY (instance, cpu))
      record: cluster:node_cpu:ratio
  - name: kube-prometheus-general.rules
    rules:
    - expr: count without(instance, pod, node) (up == 1)
      record: count:up1
    - expr: count without(instance, pod, node) (up == 0)
      record: count:up0
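  # kubernetes.rules: OpenShift-specific capacity and usage aggregations (node roles,
  # hyperthreading detection, CPU and memory capacity, workload vs. openshift-* usage,
  # PVC requests by provisioner, vSphere inventory) plus cluster-monitoring alerts.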
- name: kubernetes.rules | |
rules: | |
- expr: sum(container_memory_usage_bytes{container="",pod!=""}) BY (pod, namespace) | |
record: pod:container_memory_usage_bytes:sum | |
- expr: sum(container_spec_cpu_shares{container="",pod!=""}) BY (pod, namespace) | |
record: pod:container_spec_cpu_shares:sum | |
- expr: sum(rate(container_cpu_usage_seconds_total{container="",pod!=""}[5m])) | |
BY (pod, namespace) | |
record: pod:container_cpu_usage:sum | |
- expr: sum(container_fs_usage_bytes{pod!=""}) BY (pod, namespace) | |
record: pod:container_fs_usage_bytes:sum | |
- expr: sum(container_memory_usage_bytes{container!=""}) BY (namespace) | |
record: namespace:container_memory_usage_bytes:sum | |
- expr: sum(container_spec_cpu_shares{container!=""}) BY (namespace) | |
record: namespace:container_spec_cpu_shares:sum | |
- expr: sum(rate(container_cpu_usage_seconds_total{container!="POD",container!=""}[5m])) | |
BY (namespace) | |
record: namespace:container_cpu_usage:sum | |
- expr: sum(container_memory_usage_bytes{container="",pod!=""}) BY (cluster) / | |
sum(machine_memory_bytes) BY (cluster) | |
record: cluster:memory_usage:ratio | |
- expr: sum(container_spec_cpu_shares{container="",pod!=""}) / 1000 / sum(machine_cpu_cores) | |
record: cluster:container_spec_cpu_shares:ratio | |
- expr: sum(rate(container_cpu_usage_seconds_total{container="",pod!=""}[5m])) | |
/ sum(machine_cpu_cores) | |
record: cluster:container_cpu_usage:ratio | |
- expr: max without(endpoint, instance, job, pod, service) (kube_node_labels and | |
on(node) kube_node_role{role="master"}) | |
labels: | |
label_node_role_kubernetes_io: master | |
label_node_role_kubernetes_io_master: "true" | |
record: cluster:master_nodes | |
- expr: max without(endpoint, instance, job, pod, service) (kube_node_labels and | |
on(node) kube_node_role{role="infra"}) | |
labels: | |
label_node_role_kubernetes_io_infra: "true" | |
record: cluster:infra_nodes | |
- expr: max without(endpoint, instance, job, pod, service) (cluster:master_nodes | |
and on(node) cluster:infra_nodes) | |
labels: | |
label_node_role_kubernetes_io_infra: "true" | |
label_node_role_kubernetes_io_master: "true" | |
record: cluster:master_infra_nodes | |
- expr: cluster:master_infra_nodes or on (node) cluster:master_nodes or on (node) | |
cluster:infra_nodes or on (node) max without(endpoint, instance, job, pod, | |
service) (kube_node_labels) | |
record: cluster:nodes_roles | |
- expr: kube_node_labels and on(node) (sum(label_replace(node_cpu_info, "node", | |
"$1", "instance", "(.*)")) by (node, package, core) == 2) | |
labels: | |
label_node_hyperthread_enabled: "true" | |
record: cluster:hyperthread_enabled_nodes | |
- expr: count(sum(virt_platform) by (instance, type, system_manufacturer, system_product_name, | |
baseboard_manufacturer, baseboard_product_name)) by (type, system_manufacturer, | |
system_product_name, baseboard_manufacturer, baseboard_product_name) | |
record: cluster:virt_platform_nodes:sum | |
- expr: | | |
sum by(label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io, label_kubernetes_io_arch, label_node_openshift_io_os_id) ( | |
( | |
cluster:master_nodes | |
* on(node) group_left() max by(node) | |
( | |
kube_node_status_capacity_cpu_cores | |
) | |
) | |
or on(node) ( | |
max without(endpoint, instance, job, pod, service) | |
( | |
kube_node_labels | |
) * on(node) group_left() max by(node) | |
( | |
kube_node_status_capacity_cpu_cores | |
) | |
) | |
) | |
record: cluster:capacity_cpu_cores:sum | |
- expr: | | |
clamp_max( | |
label_replace( | |
sum by(instance, package, core) ( | |
node_cpu_info{core!="",package!=""} | |
or | |
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels. | |
label_replace(label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "") | |
) > 1, | |
"label_node_hyperthread_enabled", | |
"true", | |
"instance", | |
"(.*)" | |
) or on (instance, package) | |
label_replace( | |
sum by(instance, package, core) ( | |
label_replace(node_cpu_info{core!="",package!=""} | |
or | |
# Assume core = cpu and package = 0 for platforms that don't expose core/package labels. | |
label_join(node_cpu_info{core="",package=""}, "core", "", "cpu"), "package", "0", "package", "") | |
) <= 1, | |
"label_node_hyperthread_enabled", | |
"false", | |
"instance", | |
"(.*)" | |
), | |
1 | |
) | |
record: cluster:cpu_core_hyperthreading | |
- expr: | | |
topk by(node) (1, cluster:nodes_roles) * on (node) | |
group_right( label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io, label_node_openshift_io_os_id, label_kubernetes_io_arch, | |
label_node_role_kubernetes_io_master, label_node_role_kubernetes_io_infra) | |
label_replace( cluster:cpu_core_hyperthreading, "node", "$1", "instance", "(.*)" ) | |
record: cluster:cpu_core_node_labels | |
- expr: count(cluster:cpu_core_node_labels) by (label_beta_kubernetes_io_instance_type, | |
label_node_hyperthread_enabled) | |
record: cluster:capacity_cpu_cores_hyperthread_enabled:sum | |
- expr: | | |
sum by(label_beta_kubernetes_io_instance_type, label_node_role_kubernetes_io) | |
( | |
( | |
cluster:master_nodes | |
* on(node) group_left() max by(node) | |
( | |
kube_node_status_capacity_memory_bytes | |
) | |
) | |
or on(node) | |
( | |
max without(endpoint, instance, job, pod, service) | |
( | |
kube_node_labels | |
) | |
* on(node) group_left() max by(node) | |
( | |
kube_node_status_capacity_memory_bytes | |
) | |
) | |
) | |
record: cluster:capacity_memory_bytes:sum | |
- expr: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[2m]) * on(namespace, | |
pod) group_left(node) node_namespace_pod:kube_pod_info:{pod=~"node-exporter.+"}) | |
record: cluster:cpu_usage_cores:sum | |
- expr: sum(node_memory_MemTotal_bytes{job="node-exporter"} - node_memory_MemAvailable_bytes{job="node-exporter"}) | |
record: cluster:memory_usage_bytes:sum | |
- expr: sum(rate(container_cpu_usage_seconds_total{namespace!~"openshift-.+",pod!="",container=""}[5m])) | |
record: workload:cpu_usage_cores:sum | |
- expr: cluster:cpu_usage_cores:sum - workload:cpu_usage_cores:sum | |
record: openshift:cpu_usage_cores:sum | |
- expr: sum(container_memory_working_set_bytes{namespace!~"openshift-.+",pod!="",container=""}) | |
record: workload:memory_usage_bytes:sum | |
- expr: cluster:memory_usage_bytes:sum - workload:memory_usage_bytes:sum | |
record: openshift:memory_usage_bytes:sum | |
- expr: sum(cluster:master_nodes or on(node) kube_node_labels ) BY (label_beta_kubernetes_io_instance_type, | |
label_node_role_kubernetes_io, label_kubernetes_io_arch, label_node_openshift_io_os_id) | |
record: cluster:node_instance_type_count:sum | |
- expr: | | |
sum by(provisioner) ( | |
topk by (namespace, persistentvolumeclaim) ( | |
1, kube_persistentvolumeclaim_resource_requests_storage_bytes | |
) * on(namespace, persistentvolumeclaim) group_right() | |
topk by(namespace, persistentvolumeclaim) ( | |
1, kube_persistentvolumeclaim_info * on(storageclass) group_left(provisioner) topk by(storageclass) (1, max by(storageclass, provisioner) (kube_storageclass_info)) | |
) | |
) | |
record: cluster:kube_persistentvolumeclaim_resource_requests_storage_bytes:provisioner:sum | |
- expr: (sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_node_role_kubernetes_io_master="",label_node_role_kubernetes_io_infra=""} | |
or absent(__does_not_exist__)*0)) + ((sum(node_role_os_version_machine:cpu_capacity_cores:sum{label_node_role_kubernetes_io_master="true"} | |
or absent(__does_not_exist__)*0) * ((max(cluster_master_schedulable == 1)*0+1) | |
or (absent(cluster_master_schedulable == 1)*0)))) | |
record: workload:capacity_physical_cpu_cores:sum | |
- expr: min_over_time(workload:capacity_physical_cpu_cores:sum[5m:15s]) | |
record: cluster:usage:workload:capacity_physical_cpu_cores:min:5m | |
- expr: max_over_time(workload:capacity_physical_cpu_cores:sum[5m:15s]) | |
record: cluster:usage:workload:capacity_physical_cpu_cores:max:5m | |
- expr: | | |
sum by (provisioner) ( | |
topk by (namespace, persistentvolumeclaim) ( | |
1, kubelet_volume_stats_used_bytes | |
) * on (namespace,persistentvolumeclaim) group_right() | |
topk by (namespace, persistentvolumeclaim) ( | |
1, kube_persistentvolumeclaim_info * on(storageclass) group_left(provisioner) topk by(storageclass) (1, max by(storageclass, provisioner) (kube_storageclass_info)) | |
) | |
) | |
record: cluster:kubelet_volume_stats_used_bytes:provisioner:sum | |
- expr: sum(etcd_object_counts) BY (instance) | |
record: instance:etcd_object_counts:sum | |
- expr: topk(500, max(etcd_object_counts) by (resource)) | |
record: cluster:usage:resources:sum | |
- expr: count(count (kube_pod_restart_policy{type!="Always",namespace!~"openshift-.+"}) | |
by (namespace,pod)) | |
record: cluster:usage:pods:terminal:workload:sum | |
- expr: sum(max(kubelet_containers_per_pod_count_sum) by (instance)) | |
record: cluster:usage:containers:sum | |
- expr: count(cluster:cpu_core_node_labels) by (label_kubernetes_io_arch, label_node_hyperthread_enabled, | |
label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra) | |
record: node_role_os_version_machine:cpu_capacity_cores:sum | |
- expr: count(max(cluster:cpu_core_node_labels) by (node, package, label_beta_kubernetes_io_instance_type, | |
label_node_hyperthread_enabled, label_node_role_kubernetes_io) ) by ( label_beta_kubernetes_io_instance_type, | |
label_node_hyperthread_enabled, label_node_role_kubernetes_io) | |
record: cluster:capacity_cpu_sockets_hyperthread_enabled:sum | |
- expr: count (max(cluster:cpu_core_node_labels) by (node, package, label_kubernetes_io_arch, | |
label_node_hyperthread_enabled, label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra) | |
) by (label_kubernetes_io_arch, label_node_hyperthread_enabled, label_node_openshift_io_os_id,label_node_role_kubernetes_io_master,label_node_role_kubernetes_io_infra) | |
record: node_role_os_version_machine:cpu_capacity_sockets:sum | |
- expr: clamp_max(sum(alertmanager_integrations),1) | |
record: cluster:alertmanager_routing_enabled:max | |
- expr: sum by(plugin_name, volume_mode)(pv_collector_total_pv_count) | |
record: cluster:kube_persistentvolume_plugin_type_counts:sum | |
- expr: sum by(version)(vsphere_vcenter_info) | |
record: cluster:vsphere_vcenter_info:sum | |
- expr: sum by(version)(vsphere_esxi_version_total) | |
record: cluster:vsphere_esxi_version_total:sum | |
- expr: sum by(hw_version)(vsphere_node_hw_version_total) | |
record: cluster:vsphere_node_hw_version_total:sum | |
- expr: absent(count(max by (node) (kube_node_role{role="master"}) and (min by | |
(node) (kube_node_status_condition{condition="Ready",status="true"} == 0))) | |
> 0) | |
record: cluster:control_plane:all_nodes_ready | |
    - alert: ClusterMonitoringOperatorReconciliationErrors
      annotations:
        message: Cluster Monitoring Operator is experiencing a reconciliation error
          rate of {{ printf "%0.0f" $value }}%.
      expr: rate(cluster_monitoring_operator_reconcile_errors_total[15m]) * 100 /
        rate(cluster_monitoring_operator_reconcile_attempts_total[15m]) > 10
      for: 30m
      labels:
        severity: warning
- alert: AlertmanagerReceiversNotConfigured | |
annotations: | |
message: Alerts are not configured to be sent to a notification system, meaning | |
that you may not be notified in a timely fashion when important failures | |
occur. Check the OpenShift documentation to learn how to configure notifications | |
with Alertmanager. | |
expr: cluster:alertmanager_routing_enabled:max == 0 | |
for: 10m | |
labels: | |
severity: warning | |
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
          not matched the expected number of replicas for longer than 15 minutes.
          This indicates that cluster infrastructure is unable to start or restart
          the necessary components. This most often occurs when one or more nodes
          are down or partitioned from the cluster, or a fault occurs on the node
          that prevents the workload from starting. In rare cases this may indicate
          a new version of a cluster component cannot start due to a bug or configuration
          error. Assess the pods for this deployment to verify they are running on
          healthy nodes and then contact support.
        summary: Deployment has not matched the expected number of replicas
      expr: |
        (
          kube_deployment_spec_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
            !=
          kube_deployment_status_replicas_available{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
        ) and (
          changes(kube_deployment_status_replicas_updated{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[5m])
            ==
          0
        ) and cluster:control_plane:all_nodes_ready
      for: 15m
      labels:
        severity: warning
    - alert: MultipleContainersOOMKilled
      annotations:
        message: Multiple containers were killed by the out-of-memory (OOM) killer
          within the past 15 minutes.
      expr: sum(max by(namespace, container, pod) (increase(kube_pod_container_status_restarts_total[12m]))
        and max by(namespace, container, pod) (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"})
        == 1) > 5
      for: 15m
      labels:
        severity: info
- expr: avg_over_time((((count((max by (node) (up{job="kubelet",metrics_path="/metrics"} | |
== 1) and max by (node) (kube_node_status_condition{condition="Ready",status="true"} | |
== 1) and min by (node) (kube_node_spec_unschedulable == 0))) / scalar(count(min | |
by (node) (kube_node_spec_unschedulable == 0))))))[5m:1s]) | |
record: cluster:usage:kube_schedulable_node_ready_reachable:avg5m | |
- expr: avg_over_time((count(max by (node) (kube_node_status_condition{condition="Ready",status="true"} | |
== 1)) / scalar(count(max by (node) (kube_node_status_condition{condition="Ready",status="true"}))))[5m:1s]) | |
record: cluster:usage:kube_node_ready:avg5m | |
- expr: (max without (condition,container,endpoint,instance,job,service) (((kube_pod_status_ready{condition="false"} | |
== 1)*0 or (kube_pod_status_ready{condition="true"} == 1)) * on(pod,namespace) | |
group_left() group by (pod,namespace) (kube_pod_status_phase{phase=~"Running|Unknown|Pending"} | |
== 1))) | |
record: kube_running_pod_ready | |
- expr: avg(kube_running_pod_ready{namespace=~"openshift-.*"}) | |
record: cluster:usage:openshift:kube_running_pod_ready:avg | |
- expr: avg(kube_running_pod_ready{namespace!~"openshift-.*"}) | |
record: cluster:usage:workload:kube_running_pod_ready:avg | |
- interval: 30s | |
name: kubernetes-recurring.rules | |
rules: | |
- expr: sum_over_time(workload:capacity_physical_cpu_cores:sum[30s:1s]) + ((cluster:usage:workload:capacity_physical_cpu_core_seconds | |
offset 25s) or (absent(cluster:usage:workload:capacity_physical_cpu_core_seconds | |
offset 25s)*0)) | |
record: cluster:usage:workload:capacity_physical_cpu_core_seconds | |
- name: openshift-etcd.rules | |
rules: | |
    - alert: etcdInsufficientMembers
      annotations:
        message: etcd is reporting fewer instances are available than are needed
          ({{ $value }}). When etcd does not have a majority of instances available,
          the Kubernetes and OpenShift APIs will reject read and write requests, and
          operations that preserve the health of workloads cannot be performed. This
          can occur when multiple control plane nodes are powered off or are unable
          to connect to each other via the network. Check that all control plane nodes
          are powered on and that network connections between each machine are functional.
        summary: etcd is reporting that a majority of instances are unavailable.
      expr: sum(up{job="etcd"} == bool 1 and etcd_server_has_leader{job="etcd"} ==
        bool 1) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod)
        + 1) / 2)
      for: 3m
      labels:
        severity: critical
- name: openshift-ingress.rules | |
rules: | |
- expr: sum by (code) (rate(haproxy_server_http_responses_total[5m]) > 0) | |
record: code:cluster:ingress_http_request_count:rate5m:sum | |
- expr: sum (rate(haproxy_frontend_bytes_in_total[5m])) | |
record: cluster:usage:ingress_frontend_bytes_in:rate5m:sum | |
- expr: sum (rate(haproxy_frontend_bytes_out_total[5m])) | |
record: cluster:usage:ingress_frontend_bytes_out:rate5m:sum | |
- expr: sum (haproxy_frontend_current_sessions) | |
record: cluster:usage:ingress_frontend_connections:sum | |
- expr: sum(max without(service,endpoint,container,pod,job,namespace) (increase(haproxy_server_http_responses_total{code!~"2xx|1xx|4xx|3xx",exported_namespace!~"openshift-.*"}[5m]) | |
> 0)) / sum (max without(service,endpoint,container,pod,job,namespace) (increase(haproxy_server_http_responses_total{exported_namespace!~"openshift-.*"}[5m]))) | |
or absent(__does_not_exist__)*0 | |
record: cluster:usage:workload:ingress_request_error:fraction5m | |
- expr: sum (max without(service,endpoint,container,pod,job,namespace) (irate(haproxy_server_http_responses_total{exported_namespace!~"openshift-.*"}[5m]))) | |
or absent(__does_not_exist__)*0 | |
record: cluster:usage:workload:ingress_request_total:irate5m | |
- expr: sum(max without(service,endpoint,container,pod,job,namespace) (increase(haproxy_server_http_responses_total{code!~"2xx|1xx|4xx|3xx",exported_namespace=~"openshift-.*"}[5m]) | |
> 0)) / sum (max without(service,endpoint,container,pod,job,namespace) (increase(haproxy_server_http_responses_total{exported_namespace=~"openshift-.*"}[5m]))) | |
or absent(__does_not_exist__)*0 | |
record: cluster:usage:openshift:ingress_request_error:fraction5m | |
- expr: sum (max without(service,endpoint,container,pod,job,namespace) (irate(haproxy_server_http_responses_total{exported_namespace=~"openshift-.*"}[5m]))) | |
or absent(__does_not_exist__)*0 | |
record: cluster:usage:openshift:ingress_request_total:irate5m | |
- name: openshift-build.rules | |
rules: | |
- expr: sum(openshift_build_total{job="kubernetes-apiservers",phase="Error"})/(sum(openshift_build_total{job="kubernetes-apiservers",phase=~"Failed|Complete|Error"})) | |
record: build_error_rate | |
- name: openshift-monitoring.rules | |
rules: | |
- expr: sum by (job,namespace) (prometheus_tsdb_head_series{namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}) | |
record: openshift:prometheus_tsdb_head_series:sum | |
- expr: sum by(job,namespace) (rate(prometheus_tsdb_head_samples_appended_total{namespace=~"openshift-monitoring|openshift-user-workload-monitoring"}[2m])) | |
record: openshift:prometheus_tsdb_head_samples_appended_total:sum | |
- expr: sum by (namespace) (container_memory_working_set_bytes{namespace=~"openshift-monitoring|openshift-user-workload-monitoring", | |
container=""}) | |
record: monitoring:container_memory_working_set_bytes:sum | |
- expr: sum by(exported_service) (rate(haproxy_server_http_responses_total{exported_namespace="openshift-monitoring", | |
exported_service=~"alertmanager-main|grafana|prometheus-k8s"}[5m])) | |
record: monitoring:haproxy_server_http_responses_total:sum | |
- name: openshift-sre.rules | |
rules: | |
- expr: sum(rate(apiserver_request_total{job="apiserver"}[10m])) BY (code) | |
record: code:apiserver_request_total:rate:sum | |
- expr: sum(rate(apiserver_request_total{job="apiserver",resource=~"image.*",verb!="WATCH"}[10m])) | |
BY (code) | |
record: code:registry_api_request_count:rate:sum | |
- expr: sum(kube_pod_status_ready{condition="true",namespace="openshift-etcd",pod=~"etcd.*"}) | |
by(condition) | |
record: kube_pod_status_ready:etcd:sum | |
- expr: sum(kube_pod_status_ready{condition="true",namespace="openshift-image-registry",pod=~"image-registry.*"}) | |
by(condition) | |
record: kube_pod_status_ready:image_registry:sum | |
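  # kube-state-metrics: alerts when list or watch operations performed by kube-state-metrics
  # fail at an elevated rate.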
- name: kube-state-metrics | |
rules: | |
    - alert: KubeStateMetricsListErrors
      annotations:
        message: kube-state-metrics is experiencing errors at an elevated rate in
          list operations. This is likely preventing it from exposing metrics about
          Kubernetes objects correctly or at all.
      expr: |
        (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
          /
        sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
        > 0.01
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsWatchErrors
      annotations:
        message: kube-state-metrics is experiencing errors at an elevated rate in
          watch operations. This is likely preventing it from exposing metrics about
          Kubernetes objects correctly or at all.
      expr: |
        (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
          /
        sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
        > 0.01
      for: 15m
      labels:
        severity: critical
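  # node-exporter: node-level alerts covering filesystem space and inodes, network errors,
  # conntrack usage, clock synchronisation, and RAID health.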
- name: node-exporter | |
rules: | |
- alert: NodeFilesystemSpaceFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available space left and is filling | |
up. | |
summary: Filesystem is predicted to run out of space within the next 24 hours. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40 | |
and | |
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemSpaceFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available space left and is filling | |
up fast. | |
summary: Filesystem is predicted to run out of space within the next 4 hours. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15 | |
and | |
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemAlmostOutOfSpace | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available space left. | |
summary: Filesystem has less than 5% space left. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemAlmostOutOfSpace | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available space left. | |
summary: Filesystem has less than 3% space left. | |
expr: | | |
( | |
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemFilesFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available inodes left and is filling | |
up. | |
summary: Filesystem is predicted to run out of inodes within the next 24 hours. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40 | |
and | |
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemFilesFillingUp | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available inodes left and is filling | |
up fast. | |
summary: Filesystem is predicted to run out of inodes within the next 4 hours. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20 | |
and | |
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available inodes left. | |
summary: Filesystem has less than 5% inodes left. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeFilesystemAlmostOutOfFiles | |
annotations: | |
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} | |
has only {{ printf "%.2f" $value }}% available inodes left. | |
summary: Filesystem has less than 3% inodes left. | |
expr: | | |
( | |
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3 | |
and | |
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0 | |
) | |
for: 1h | |
labels: | |
severity: critical | |
- alert: NodeNetworkReceiveErrs | |
annotations: | |
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered | |
{{ printf "%.0f" $value }} receive errors in the last two minutes.' | |
summary: Network interface is reporting many receive errors. | |
expr: | | |
rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: NodeNetworkTransmitErrs | |
annotations: | |
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered | |
{{ printf "%.0f" $value }} transmit errors in the last two minutes.' | |
summary: Network interface is reporting many transmit errors. | |
expr: | | |
rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 | |
for: 1h | |
labels: | |
severity: warning | |
    - alert: NodeHighNumberConntrackEntriesUsed
      annotations:
        description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
        summary: Number of conntrack entries is getting close to the limit.
      expr: |
        (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
      labels:
        severity: warning
- alert: NodeTextFileCollectorScrapeError | |
annotations: | |
description: Node Exporter text file collector failed to scrape. | |
summary: Node Exporter text file collector failed to scrape. | |
expr: | | |
node_textfile_scrape_error{job="node-exporter"} == 1 | |
labels: | |
severity: warning | |
- alert: NodeClockSkewDetected | |
annotations: | |
message: Clock on {{ $labels.instance }} is out of sync by more than 300s. | |
Ensure NTP is configured correctly on this host. | |
summary: Clock skew detected. | |
expr: | | |
( | |
node_timex_offset_seconds > 0.05 | |
and | |
deriv(node_timex_offset_seconds[5m]) >= 0 | |
) | |
or | |
( | |
node_timex_offset_seconds < -0.05 | |
and | |
deriv(node_timex_offset_seconds[5m]) <= 0 | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- alert: NodeClockNotSynchronising | |
annotations: | |
message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP | |
is configured on this host. | |
summary: Clock not synchronising. | |
expr: | | |
min_over_time(node_timex_sync_status[5m]) == 0 | |
and | |
node_timex_maxerror_seconds >= 16 | |
for: 10m | |
labels: | |
severity: warning | |
    - alert: NodeRAIDDegraded
      annotations:
        description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is
          in a degraded state due to one or more disk failures. The number of spare
          drives is insufficient to fix the issue automatically.
        summary: RAID Array is degraded
      expr: |
        node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0
      for: 15m
      labels:
        severity: critical
    - alert: NodeRAIDDiskFailure
      annotations:
        description: At least one device in the RAID array on {{ $labels.instance }}
          failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap.
        summary: Failed device in RAID array
      expr: |
        node_md_disks{state="fail"} > 0
      labels:
        severity: warning
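  # alertmanager.rules: alerts on Alertmanager configuration reloads, cluster membership,
  # notification failures, and crashlooping instances in openshift-monitoring.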
- name: alertmanager.rules | |
rules: | |
- alert: AlertmanagerFailedReload | |
annotations: | |
description: Configuration has failed to load for {{ $labels.namespace }}/{{ | |
$labels.pod}}. | |
summary: Reloading an Alertmanager configuration has failed. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) == 0 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: AlertmanagerMembersInconsistent | |
annotations: | |
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only | |
found {{ $value }} members of the {{$labels.job}} cluster. | |
summary: A member of an Alertmanager cluster has not found all other cluster | |
members. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) | |
< on (namespace,service) group_left | |
count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="openshift-monitoring"}[5m])) | |
for: 10m | |
labels: | |
severity: critical | |
- alert: AlertmanagerFailedToSendAlerts | |
annotations: | |
description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed | |
to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration | |
}}. | |
summary: An Alertmanager instance failed to send notifications. | |
expr: | | |
( | |
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) | |
/ | |
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) | |
) | |
> 0.01 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: AlertmanagerClusterFailedToSendAlerts | |
annotations: | |
description: The minimum notification failure rate to {{ $labels.integration | |
}} sent from any instance in the {{$labels.job}} cluster is {{ $value | | |
humanizePercentage }}. | |
summary: All Alertmanager instances in a cluster failed to send notifications. | |
expr: | | |
min by (namespace,service) ( | |
rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) | |
/ | |
rate(alertmanager_notifications_total{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) | |
) | |
> 0.01 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: AlertmanagerConfigInconsistent | |
annotations: | |
description: Alertmanager instances within the {{$labels.job}} cluster have | |
different configurations. | |
summary: Alertmanager instances within the same cluster have different configurations. | |
expr: | | |
count by (namespace,service) ( | |
count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="openshift-monitoring"}) | |
) | |
!= 1 | |
for: 20m | |
labels: | |
severity: critical | |
- alert: AlertmanagerClusterDown | |
annotations: | |
description: '{{ $value | humanizePercentage }} of Alertmanager instances | |
within the {{$labels.job}} cluster have been up for less than half of the | |
last 5m.' | |
summary: Half or more of the Alertmanager instances within the same cluster | |
are down. | |
expr: | | |
( | |
count by (namespace,service) ( | |
avg_over_time(up{job="alertmanager-main",namespace="openshift-monitoring"}[5m]) < 0.5 | |
) | |
/ | |
count by (namespace,service) ( | |
up{job="alertmanager-main",namespace="openshift-monitoring"} | |
) | |
) | |
>= 0.5 | |
for: 5m | |
labels: | |
severity: critical | |
- alert: AlertmanagerClusterCrashlooping | |
annotations: | |
description: '{{ $value | humanizePercentage }} of Alertmanager instances | |
within the {{$labels.job}} cluster have restarted at least 5 times in the | |
last 10m.' | |
summary: Half or more of the Alertmanager instances within the same cluster | |
are crashlooping. | |
expr: | | |
( | |
count by (namespace,service) ( | |
changes(process_start_time_seconds{job="alertmanager-main",namespace="openshift-monitoring"}[10m]) > 4 | |
) | |
/ | |
count by (namespace,service) ( | |
up{job="alertmanager-main",namespace="openshift-monitoring"} | |
) | |
) | |
>= 0.5 | |
for: 5m | |
labels: | |
severity: critical | |
- name: prometheus-operator | |
rules: | |
- alert: PrometheusOperatorListErrors | |
annotations: | |
description: Errors while performing List operations in controller {{$labels.controller}} | |
in {{$labels.namespace}} namespace. | |
summary: Errors while performing list operations in controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="prometheus-operator",namespace="openshift-monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_list_operations_total{job="prometheus-operator",namespace="openshift-monitoring"}[10m]))) > 0.4 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorWatchErrors | |
annotations: | |
description: Errors while performing watch operations in controller {{$labels.controller}} | |
in {{$labels.namespace}} namespace. | |
summary: Errors while performing watch operations in controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="prometheus-operator",namespace="openshift-monitoring"}[10m])) / sum by (controller,namespace) (rate(prometheus_operator_watch_operations_total{job="prometheus-operator",namespace="openshift-monitoring"}[10m]))) > 0.4 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorSyncFailed | |
annotations: | |
description: Controller {{ $labels.controller }} in {{ $labels.namespace }} | |
namespace fails to reconcile {{ $value }} objects. | |
summary: Last controller reconciliation failed | |
expr: | | |
min_over_time(prometheus_operator_syncs{status="failed",job="prometheus-operator",namespace="openshift-monitoring"}[5m]) > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorReconcileErrors | |
annotations: | |
description: '{{ $value | humanizePercentage }} of reconciling operations | |
failed for {{ $labels.controller }} controller in {{ $labels.namespace }} | |
namespace.' | |
summary: Errors while reconciling controller. | |
expr: | | |
(sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="openshift-monitoring"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator",namespace="openshift-monitoring"}[5m]))) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorNodeLookupErrors | |
annotations: | |
description: Errors while reconciling Prometheus in {{ $labels.namespace }} | |
namespace. | |
summary: Errors while reconciling Prometheus. | |
expr: | | |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="openshift-monitoring"}[5m]) > 0.1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorNotReady | |
annotations: | |
description: Prometheus operator in {{ $labels.namespace }} namespace isn't | |
ready to reconcile {{ $labels.controller }} resources. | |
summary: Prometheus operator not ready | |
expr: | | |
min by(namespace, controller) (max_over_time(prometheus_operator_ready{job="prometheus-operator",namespace="openshift-monitoring"}[5m]) == 0) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: PrometheusOperatorRejectedResources | |
annotations: | |
description: Prometheus operator in {{ $labels.namespace }} namespace rejected | |
{{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource | |
}} resources. | |
summary: Resources rejected by Prometheus operator | |
expr: | | |
min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator",namespace="openshift-monitoring"}[5m]) > 0 | |
for: 5m | |
labels: | |
severity: warning | |
- name: kubernetes-apps | |
rules: | |
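# Note: every rule in this group reads from kube-state-metrics and is scoped to platform namespaces via namespace=~"(openshift-.*|kube-.*|default|logging)". | |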
- alert: KubePodCrashLooping | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container | |
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. | |
summary: Pod is crash looping. | |
expr: | | |
rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[5m]) * 60 * 5 > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubePodNotReady | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready | |
state for longer than 15 minutes. | |
summary: Pod has been in a non-ready state for more than 15 minutes. | |
expr: | | |
sum by (namespace, pod) ( | |
max by(namespace, pod) ( | |
kube_pod_status_phase{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", phase=~"Pending|Unknown"} | |
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( | |
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) | |
) | |
) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeDeploymentGenerationMismatch | |
annotations: | |
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment | |
}} does not match; this indicates that the Deployment has failed but has | |
not been rolled back. | |
summary: Deployment generation mismatch due to possible roll-back | |
expr: | | |
kube_deployment_status_observed_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_deployment_metadata_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeStatefulSetReplicasMismatch | |
annotations: | |
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} | |
has not matched the expected number of replicas for longer than 15 minutes. | |
summary: StatefulSet has not matched the expected number of replicas. | |
expr: | | |
( | |
kube_statefulset_status_replicas_ready{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) and ( | |
changes(kube_statefulset_status_replicas_updated{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[5m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeStatefulSetGenerationMismatch | |
annotations: | |
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset | |
}} does not match; this indicates that the StatefulSet has failed but has | |
not been rolled back. | |
summary: StatefulSet generation mismatch due to possible roll-back | |
expr: | | |
kube_statefulset_status_observed_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_statefulset_metadata_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeStatefulSetUpdateNotRolledOut | |
annotations: | |
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} | |
update has not been rolled out. | |
summary: StatefulSet update has not been rolled out. | |
expr: | | |
( | |
max without (revision) ( | |
kube_statefulset_status_current_revision{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
unless | |
kube_statefulset_status_update_revision{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) | |
* | |
( | |
kube_statefulset_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_statefulset_status_replicas_updated{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) | |
) and ( | |
changes(kube_statefulset_status_replicas_updated{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[5m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeDaemonSetRolloutStuck | |
annotations: | |
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has | |
not finished or progressed for at least 15 minutes. | |
summary: DaemonSet rollout is stuck. | |
expr: | | |
( | |
( | |
kube_daemonset_status_current_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) or ( | |
kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
0 | |
) or ( | |
kube_daemonset_updated_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) or ( | |
kube_daemonset_status_number_available{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
) | |
) and ( | |
changes(kube_daemonset_updated_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[5m]) | |
== | |
0 | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeContainerWaiting | |
annotations: | |
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} | |
has been in a waiting state for longer than 1 hour. | |
summary: Pod container waiting longer than 1 hour | |
expr: | | |
sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubeDaemonSetNotScheduled | |
annotations: | |
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are not scheduled.' | |
summary: DaemonSet pods are not scheduled. | |
expr: | | |
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
- | |
kube_daemonset_status_current_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: KubeDaemonSetMisScheduled | |
annotations: | |
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset | |
}} are running where they are not supposed to run.' | |
summary: DaemonSet pods are misscheduled. | |
expr: | | |
kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeJobCompletion | |
annotations: | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking | |
more than 12 hours to complete. | |
summary: Job did not complete in time | |
expr: | | |
kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0 | |
for: 12h | |
labels: | |
severity: warning | |
- alert: KubeJobFailed | |
annotations: | |
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to | |
complete. Removing the failed job after investigation should clear this alert. | |
summary: Job failed to complete. | |
expr: | | |
kube_job_failed{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeHpaReplicasMismatch | |
annotations: | |
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched | |
the desired number of replicas for longer than 15 minutes. | |
summary: HPA has not matched the desired number of replicas. | |
expr: | | |
(kube_hpa_status_desired_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
!= | |
kube_hpa_status_current_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) | |
and | |
(kube_hpa_status_current_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
> | |
kube_hpa_spec_min_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) | |
and | |
(kube_hpa_status_current_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
< | |
kube_hpa_spec_max_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}) | |
and | |
changes(kube_hpa_status_current_replicas[15m]) == 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeHpaMaxedOut | |
annotations: | |
description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running | |
at max replicas for longer than 15 minutes. | |
summary: HPA is running at max replicas | |
expr: | | |
kube_hpa_status_current_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
== | |
kube_hpa_spec_max_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} | |
for: 15m | |
labels: | |
severity: warning | |
- name: kubernetes-resources | |
rules: | |
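# The overcommit rules below compare summed pod resource requests against total allocatable capacity; the (count(...)-1)/count(...) threshold means they fire once the requests could no longer be satisfied with one node removed. | |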
- alert: KubeCPUOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Pods and | |
cannot tolerate node failure. | |
summary: Cluster has overcommitted CPU resource requests. | |
expr: | | |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) | |
/ | |
sum(kube_node_status_allocatable_cpu_cores) | |
> | |
(count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeMemoryOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Pods and | |
cannot tolerate node failure. | |
summary: Cluster has overcommitted memory resource requests. | |
expr: | | |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) | |
/ | |
sum(kube_node_status_allocatable_memory_bytes) | |
> | |
(count(kube_node_status_allocatable_memory_bytes)-1) | |
/ | |
count(kube_node_status_allocatable_memory_bytes) | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeCPUQuotaOvercommit | |
annotations: | |
description: Cluster has overcommitted CPU resource requests for Namespaces. | |
summary: Cluster has overcommitted CPU resource requests. | |
expr: | | |
sum(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard", resource="cpu"}) | |
/ | |
sum(kube_node_status_allocatable_cpu_cores) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeMemoryQuotaOvercommit | |
annotations: | |
description: Cluster has overcommitted memory resource requests for Namespaces. | |
summary: Cluster has overcommitted memory resource requests. | |
expr: | | |
sum(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard", resource="memory"}) | |
/ | |
sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) | |
> 1.5 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeQuotaAlmostFull | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is almost full. | |
expr: | | |
kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard"} > 0) | |
> 0.9 < 1 | |
for: 15m | |
labels: | |
severity: info | |
- alert: KubeQuotaFullyUsed | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota is fully used. | |
expr: | | |
kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard"} > 0) | |
== 1 | |
for: 15m | |
labels: | |
severity: info | |
- alert: KubeQuotaExceeded | |
annotations: | |
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage | |
}} of its {{ $labels.resource }} quota. | |
summary: Namespace quota has exceeded the limits. | |
expr: | | |
kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="used"} | |
/ ignoring(instance, job, type) | |
(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard"} > 0) | |
> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- name: kubernetes-storage | |
rules: | |
- alert: KubePersistentVolumeFillingUp | |
annotations: | |
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim | |
}} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage | |
}} free. | |
summary: PersistentVolume is filling up. | |
expr: | | |
kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet", metrics_path="/metrics"} | |
/ | |
kubelet_volume_stats_capacity_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet", metrics_path="/metrics"} | |
< 0.03 | |
for: 1m | |
labels: | |
severity: critical | |
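# The next rule is the predictive variant: it fires when less than 15% of the volume is free and predict_linear over the last 6h projects the available bytes to reach zero within four days (4 * 24 * 3600 seconds). | |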
- alert: KubePersistentVolumeFillingUp | |
annotations: | |
description: Based on recent sampling, the PersistentVolume claimed by {{ | |
$labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is | |
expected to fill up within four days. Currently {{ $value | humanizePercentage | |
}} is available. | |
summary: PersistentVolume is filling up. | |
expr: | | |
( | |
kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet", metrics_path="/metrics"} | |
/ | |
kubelet_volume_stats_capacity_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet", metrics_path="/metrics"} | |
) < 0.15 | |
and | |
predict_linear(kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: KubePersistentVolumeErrors | |
annotations: | |
description: The persistent volume {{ $labels.persistentvolume }} has status | |
{{ $labels.phase }}. | |
summary: PersistentVolume is having issues with provisioning. | |
expr: | | |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0 | |
for: 5m | |
labels: | |
severity: critical | |
- name: kubernetes-system | |
rules: | |
- alert: KubeClientErrors | |
annotations: | |
description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance | |
}}' is experiencing {{ $value | humanizePercentage }} errors. | |
summary: Kubernetes API server client is experiencing errors. | |
expr: | | |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) | |
/ | |
sum(rate(rest_client_requests_total[5m])) by (instance, job)) | |
> 0.01 | |
for: 15m | |
labels: | |
severity: warning | |
- name: kube-apiserver-slos | |
rules: | |
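# Multi-window, multi-burn-rate alerts on the API server availability SLO: 0.01000 is the error budget of a 99.0% target, and each rule fires only when | |
# both its long and short windows exceed the given burn-rate factor (14.40, 6.00, 3.00 or 1.00), broadly following the SRE-workbook pattern used by kubernetes-mixin. | |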
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) | |
and | |
sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) | |
for: 2m | |
labels: | |
long: 1h | |
severity: critical | |
short: 5m | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) | |
for: 15m | |
labels: | |
long: 6h | |
severity: critical | |
short: 30m | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) | |
for: 1h | |
labels: | |
long: 1d | |
severity: warning | |
short: 2h | |
- alert: KubeAPIErrorBudgetBurn | |
annotations: | |
description: The API server is burning too much error budget. | |
summary: The API server is burning too much error budget. | |
expr: | | |
sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) | |
and | |
sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) | |
for: 3h | |
labels: | |
long: 3d | |
severity: warning | |
short: 6h | |
- name: kubernetes-system-apiserver | |
rules: | |
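# The KubeClientCertificateExpiration rules below use histogram_quantile(0.01, ...), so they fire when roughly the soonest-expiring 1% of observed client certificates are within 5400s (1.5h) or 3600s (1h) of expiry. | |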
- alert: KubeClientCertificateExpiration | |
annotations: | |
description: A client certificate used to authenticate to the apiserver is | |
expiring in less than 1.5 hours. | |
summary: Client certificate is about to expire. | |
expr: | | |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 5400 | |
labels: | |
severity: warning | |
- alert: KubeClientCertificateExpiration | |
annotations: | |
description: A client certificate used to authenticate to the apiserver is | |
expiring in less than 1.0 hours. | |
summary: Client certificate is about to expire. | |
expr: | | |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 3600 | |
labels: | |
severity: critical | |
- alert: AggregatedAPIErrors | |
annotations: | |
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} | |
has reported errors. The number of errors has increased for it in the past | |
five minutes. High values indicate that the availability of the service | |
changes too often. | |
summary: An aggregated API has reported errors. | |
expr: | | |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 | |
labels: | |
severity: warning | |
- alert: AggregatedAPIDown | |
annotations: | |
description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} | |
has been only {{ $value | humanize }}% available over the last 10m. | |
summary: An aggregated API is down. | |
expr: | | |
(1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeAPIDown | |
annotations: | |
description: KubeAPI has disappeared from Prometheus target discovery. | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="apiserver"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- name: kubernetes-system-kubelet | |
rules: | |
- alert: KubeNodeNotReady | |
annotations: | |
description: '{{ $labels.node }} has been unready for more than 15 minutes.' | |
summary: Node is not ready. | |
expr: | | |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeNodeUnreachable | |
annotations: | |
description: '{{ $labels.node }} is unreachable and some workloads may be | |
rescheduled.' | |
summary: Node is unreachable. | |
expr: | | |
(kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 | |
for: 15m | |
labels: | |
severity: warning | |
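# KubeletTooManyPods (below) joins running pods to their node via kube_pod_info, divides the per-node pod count by kube_node_status_capacity_pods, and fires above 95% of capacity. | |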
- alert: KubeletTooManyPods | |
annotations: | |
description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage | |
}} of its Pod capacity. | |
summary: Kubelet is running at capacity. | |
expr: | | |
count by(node) ( | |
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) | |
) | |
/ | |
max by(node) ( | |
kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 | |
) > 0.95 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeNodeReadinessFlapping | |
annotations: | |
description: The readiness status of node {{ $labels.node }} has changed {{ | |
$value }} times in the last 15 minutes. | |
summary: Node readiness status is flapping. | |
expr: | | |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeletPlegDurationHigh | |
annotations: | |
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile | |
duration of {{ $value }} seconds on node {{ $labels.node }}. | |
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. | |
expr: | | |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: KubeletPodStartUpLatencyHigh | |
annotations: | |
description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds | |
on node {{ $labels.node }}. | |
summary: Kubelet Pod startup latency is too high. | |
expr: | | |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeletClientCertificateRenewalErrors | |
annotations: | |
description: Kubelet on node {{ $labels.node }} has failed to renew its client | |
certificate ({{ $value | humanize }} errors in the last 5 minutes). | |
summary: Kubelet has failed to renew its client certificate. | |
expr: | | |
increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeletServerCertificateRenewalErrors | |
annotations: | |
description: Kubelet on node {{ $labels.node }} has failed to renew its server | |
certificate ({{ $value | humanize }} errors in the last 5 minutes). | |
summary: Kubelet has failed to renew its server certificate. | |
expr: | | |
increase(kubelet_server_expiration_renew_errors[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: KubeletDown | |
annotations: | |
description: Kubelet has disappeared from Prometheus target discovery. | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="kubelet", metrics_path="/metrics"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- name: kubernetes-system-scheduler | |
rules: | |
- alert: KubeSchedulerDown | |
annotations: | |
description: KubeScheduler has disappeared from Prometheus target discovery. | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="scheduler"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- name: kubernetes-system-controller-manager | |
rules: | |
- alert: KubeControllerManagerDown | |
annotations: | |
description: KubeControllerManager has disappeared from Prometheus target | |
discovery. | |
summary: Target disappeared from Prometheus target discovery. | |
expr: | | |
absent(up{job="kube-controller-manager"} == 1) | |
for: 15m | |
labels: | |
severity: critical | |
- name: prometheus | |
rules: | |
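# These rules cover both the platform and the user-workload instances via job=~"prometheus-k8s|prometheus-user-workload". | |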
- alert: PrometheusBadConfig | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to | |
reload its configuration. | |
summary: Failed Prometheus configuration reload. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(prometheus_config_last_reload_successful{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) == 0 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: PrometheusNotificationQueueRunningFull | |
annotations: | |
description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} | |
is running full. | |
summary: Prometheus alert notification queue predicted to run full in less | |
than 30m. | |
expr: | | |
# Without min_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
predict_linear(prometheus_notifications_queue_length{job=~"prometheus-k8s|prometheus-user-workload"}[5m], 60 * 30) | |
> | |
min_over_time(prometheus_notifications_queue_capacity{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlertsToSomeAlertmanagers | |
annotations: | |
description: '{{ printf "%.1f" $value }}% errors while sending alerts from | |
Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' | |
summary: Prometheus has encountered more than 1% errors sending alerts to | |
a specific Alertmanager. | |
expr: | | |
( | |
rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
/ | |
rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
* 100 | |
> 1 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusNotConnectedToAlertmanagers | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected | |
to any Alertmanagers. | |
summary: Prometheus is not connected to any Alertmanagers. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
max_over_time(prometheus_notifications_alertmanagers_discovered{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) < 1 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBReloadsFailing | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected | |
{{$value | humanize}} reload failures over the last 3h. | |
summary: Prometheus has issues reloading blocks from disk. | |
expr: | | |
increase(prometheus_tsdb_reloads_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 | |
for: 4h | |
labels: | |
severity: warning | |
- alert: PrometheusTSDBCompactionsFailing | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected | |
{{$value | humanize}} compaction failures over the last 3h. | |
summary: Prometheus has issues compacting blocks. | |
expr: | | |
increase(prometheus_tsdb_compactions_failed_total{job=~"prometheus-k8s|prometheus-user-workload"}[3h]) > 0 | |
for: 4h | |
labels: | |
severity: warning | |
- alert: PrometheusNotIngestingSamples | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting | |
samples. | |
summary: Prometheus is not ingesting samples. | |
expr: | | |
( | |
rate(prometheus_tsdb_head_samples_appended_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) <= 0 | |
and | |
( | |
sum without(scrape_job) (prometheus_target_metadata_cache_entries{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 | |
or | |
sum without(rule_group) (prometheus_rule_group_rules{job=~"prometheus-k8s|prometheus-user-workload"}) > 0 | |
) | |
) | |
for: 10m | |
labels: | |
severity: warning | |
- alert: PrometheusDuplicateTimestamps | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping | |
{{ printf "%.4g" $value }} samples/s with different values but duplicated | |
timestamp. | |
summary: Prometheus is dropping samples with duplicate timestamps. | |
expr: | | |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: PrometheusOutOfOrderTimestamps | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping | |
{{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. | |
summary: Prometheus drops samples with out-of-order timestamps. | |
expr: | | |
rate(prometheus_target_scrapes_sample_out_of_order_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 | |
for: 1h | |
labels: | |
severity: warning | |
- alert: PrometheusRemoteStorageFailures | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send | |
{{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ | |
$labels.url }} | |
summary: Prometheus fails to send samples to remote storage. | |
expr: | | |
( | |
rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
/ | |
( | |
rate(prometheus_remote_storage_failed_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
+ | |
rate(prometheus_remote_storage_succeeded_samples_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
) | |
* 100 | |
> 1 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusRemoteWriteBehind | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write | |
is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url | |
}}. | |
summary: Prometheus remote write is behind. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
- ignoring(remote_name, url) group_right | |
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
> 120 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusRemoteWriteDesiredShards | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write | |
desired shards calculation wants to run {{ $value }} shards for queue {{ | |
$labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ | |
printf `prometheus_remote_storage_shards_max{instance="%s",job=~"prometheus-k8s|prometheus-user-workload"}` | |
$labels.instance | query | first | value }}. | |
summary: Prometheus remote write desired shards calculation wants to run more | |
than configured max shards. | |
expr: | | |
# Without max_over_time, failed scrapes could create false negatives, see | |
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. | |
( | |
max_over_time(prometheus_remote_storage_shards_desired{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
> | |
max_over_time(prometheus_remote_storage_shards_max{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusRuleFailures | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to | |
evaluate {{ printf "%.0f" $value }} rules in the last 5m. | |
summary: Prometheus is failing rule evaluations. | |
expr: | | |
increase(prometheus_rule_evaluation_failures_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: critical | |
- alert: PrometheusMissingRuleEvaluations | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ | |
printf "%.0f" $value }} rule group evaluations in the last 5m. | |
summary: Prometheus is missing rule evaluations due to slow rule group evaluation. | |
expr: | | |
increase(prometheus_rule_group_iterations_missed_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusTargetLimitHit | |
annotations: | |
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped | |
{{ printf "%.0f" $value }} targets because the number of targets exceeded | |
the configured target_limit. | |
summary: Prometheus has dropped targets because some scrape configs have exceeded | |
the targets limit. | |
expr: | | |
increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) > 0 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: PrometheusErrorSendingAlertsToAnyAlertmanager | |
annotations: | |
description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts | |
from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' | |
summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. | |
expr: | | |
min without (alertmanager) ( | |
rate(prometheus_notifications_errors_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
/ | |
rate(prometheus_notifications_sent_total{job=~"prometheus-k8s|prometheus-user-workload"}[5m]) | |
) | |
* 100 | |
> 3 | |
for: 15m | |
labels: | |
severity: critical | |
- name: general.rules | |
rules: | |
- alert: TargetDown | |
annotations: | |
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service | |
}} targets in {{ $labels.namespace }} namespace are down.' | |
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, | |
namespace, service)) > 10 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: Watchdog | |
annotations: | |
message: | | |
This is an alert meant to ensure that the entire alerting pipeline is functional. | |
This alert is always firing; therefore, it should always be firing in Alertmanager | |
and always fire against a receiver. There are integrations with various notification | |
mechanisms that send a notification when this alert is not firing. For example the | |
"DeadMansSnitch" integration in PagerDuty. | |
expr: vector(1) | |
labels: | |
severity: none | |
- name: node-network | |
rules: | |
- alert: NodeNetworkInterfaceFlapping | |
annotations: | |
message: Network interface "{{ $labels.device }}" is changing its up status | |
often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. | |
expr: | | |
changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 | |
for: 2m | |
labels: | |
severity: warning | |
- name: etcd | |
rules: | |
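# etcdMembersDown (below) counts members whose scrape target is down (up == bool 0) or whose peers report a sustained send-failure rate above 0.01, which is treated as that member being unreachable. | |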
- alert: etcdMembersDown | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value | |
}}).' | |
summary: etcd cluster members are down. | |
expr: | | |
max without (endpoint) ( | |
sum without (instance) (up{job=~".*etcd.*"} == bool 0) | |
or | |
count without (To) ( | |
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 | |
) | |
) | |
> 0 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: etcdNoLeader | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance | |
}} has no leader.' | |
summary: etcd cluster has no leader. | |
expr: | | |
etcd_server_has_leader{job=~".*etcd.*"} == 0 | |
for: 1m | |
labels: | |
severity: critical | |
- alert: etcdHighNumberOfLeaderChanges | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes | |
within the last 15 minutes. Frequent elections may be a sign of insufficient | |
resources, high network latency, or disruptions by other components and | |
should be investigated.' | |
summary: etcd cluster has high number of leader changes. | |
expr: | | |
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 | |
for: 5m | |
labels: | |
severity: warning | |
- alert: etcdGRPCRequestsSlow | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method | |
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.' | |
summary: etcd gRPC requests are slow. | |
expr: | | |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) without(grpc_type)) | |
> 0.15 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: etcdMemberCommunicationSlow | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": member communication with | |
{{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance | |
}}.' | |
summary: etcd cluster member communication is slow. | |
expr: | | |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) | |
> 0.15 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: etcdHighNumberOfFailedProposals | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures | |
within the last 30 minutes on etcd instance {{ $labels.instance }}.' | |
summary: etcd cluster has high number of proposal failures. | |
expr: | | |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 | |
for: 15m | |
labels: | |
severity: warning | |
- alert: etcdHighFsyncDurations | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations | |
are {{ $value }}s on etcd instance {{ $labels.instance }}.' | |
summary: etcd cluster 99th percentile fsync durations are too high. | |
expr: | | |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) | |
> 0.5 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: etcdHighFsyncDurations | |
annotations: | |
message: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations | |
are {{ $value }}s on etcd instance {{ $labels.instance }}.' | |
expr: | | |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) | |
> 1 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: etcdHighCommitDurations | |
annotations: | |
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations | |
are {{ $value }}s on etcd instance {{ $labels.instance }}.' | |
summary: etcd cluster 99th percentile commit durations are too high. | |
expr: | | |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) | |
> 0.25 | |
for: 10m | |
labels: | |
severity: warning | |
- alert: etcdBackendQuotaLowSpace | |
annotations: | |
message: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined | |
quota on etcd instance {{ $labels.instance }}; please defrag or increase | |
the quota, as writes to etcd will be disabled when it is full.' | |
expr: | | |
(etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100 > 95 | |
for: 10m | |
labels: | |
severity: critical | |
- alert: etcdExcessiveDatabaseGrowth | |
annotations: | |
message: 'etcd cluster "{{ $labels.job }}": Observed surge in etcd writes | |
leading to a 50% increase in database size over the past four hours on etcd | |
instance {{ $labels.instance }}; please investigate, as it might be disruptive.' | |
expr: | | |
increase(((etcd_mvcc_db_total_size_in_bytes/etcd_server_quota_backend_bytes)*100)[240m:1m]) > 50 | |
for: 10m | |
labels: | |
severity: warning | |
- name: thanos-sidecar.rules | |
rules: | |
- alert: ThanosSidecarPrometheusDown | |
annotations: | |
description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} cannot connect | |
to Prometheus. | |
summary: Thanos Sidecar cannot connect to Prometheus | |
expr: | | |
sum by (job, pod) (thanos_sidecar_prometheus_up{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"} == 0) | |
for: 5m | |
labels: | |
severity: critical | |
- alert: ThanosSidecarUnhealthy | |
annotations: | |
description: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for | |
{{ $value }} seconds. | |
summary: Thanos Sidecar is unhealthy. | |
expr: | | |
time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"prometheus-(k8s|user-workload)-thanos-sidecar"}) by (job, pod) >= 600 | |
labels: | |
severity: critical |