apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: prometheus-k8s-rules
namespace: openshift-monitoring
spec:
groups:
- name: k8s.rules
rules:
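# The recording rules below follow the Prometheus naming convention
# level:metric:operations, e.g. namespace:container_cpu_usage_seconds_total:sum_rate
# is the per-namespace sum of the 5m CPU usage rate. Dashboards and alerts can
# then query the pre-aggregated series directly, for example:
#   topk(5, namespace:container_cpu_usage_seconds_total:sum_rate)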
- expr: |
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, pod_name, container_name) (
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
)
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
record: namespace:container_memory_usage_bytes:sum
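# The rules below join cAdvisor series (which carry a pod_name label) with
# kube-state-metrics series (which carry a pod label): label_replace copies the
# "pod" label of kube_pod_labels into "pod_name" so the
# "* on (...) group_left(label_name)" vector match can line up both label sets
# and attach label_name to the aggregated usage.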
- expr: |
sum by (namespace, label_name) (
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name)
* on (namespace, pod_name) group_left(label_name)
label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:container_cpu_usage_seconds_total:sum_rate
- expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace)
* on (namespace, pod_name) group_left(label_name)
label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
* on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
)
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
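# The scheduler and apiserver groups below derive latency quantiles with
# histogram_quantile() over per-le bucket rates; the trailing "/ 1e+06"
# converts the microsecond-based *_latency_microseconds histograms to seconds.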
- name: kube-scheduler.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-controllers"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
- name: kube-apiserver.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.99"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.9"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
labels:
quantile: "0.5"
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
- name: node.rules
rules:
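# node-exporter series only carry the namespace/pod labels of the exporter pod,
# so node_namespace_pod:kube_pod_info: (recorded below) serves as a join table
# throughout this group: "* on (namespace, pod) group_left(node)" maps each
# exporter pod to the node it runs on, yielding per-node utilisation and
# saturation aggregates.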
- expr: sum(min(kube_pod_info) by (node))
record: ':kube_pod_info_node_count:'
- expr: |
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
record: 'node_namespace_pod:kube_pod_info:'
- expr: |
count by (node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
))
record: node:node_num_cpu:sum
- expr: |
1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
record: :node_cpu_utilisation:avg1m
- expr: |
1 - avg by (node) (
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:)
record: node:node_cpu_utilisation:avg1m
- expr: |
sum(node_load1{job="node-exporter"})
/
sum(node:node_num_cpu:sum)
record: ':node_cpu_saturation_load1:'
- expr: |
sum by (node) (
node_load1{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
node:node_num_cpu:sum
record: 'node:node_cpu_saturation_load1:'
- expr: |
1 -
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
/
sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: ':node_memory_utilisation:'
- expr: |
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
record: :node_memory_MemFreeCachedBuffers_bytes:sum
- expr: |
sum(node_memory_MemTotal_bytes{job="node-exporter"})
record: :node_memory_MemTotal_bytes:sum
- expr: |
sum by (node) (
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_available:sum
- expr: |
sum by (node) (
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_bytes_total:sum
- expr: |
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
/
scalar(sum(node:node_memory_bytes_total:sum))
record: node:node_memory_utilisation:ratio
- expr: |
1e3 * sum(
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
)
record: :node_memory_swap_io_bytes:sum_rate
- expr: |
1 -
sum by (node) (
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
/
sum by (node) (
node_memory_MemTotal_bytes{job="node-exporter"}
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: 'node:node_memory_utilisation:'
- expr: |
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
record: 'node:node_memory_utilisation_2:'
- expr: |
1e3 * sum by (node) (
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_memory_swap_io_bytes:sum_rate
- expr: |
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
record: :node_disk_utilisation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_utilisation:avg_irate
- expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
record: :node_disk_saturation:avg_irate
- expr: |
avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_disk_saturation:avg_irate
- expr: |
max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_usage:'
- expr: |
max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
record: 'node:node_filesystem_avail:'
- expr: |
sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_utilisation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_utilisation:sum_irate
- expr: |
sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) +
sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
record: :node_net_saturation:sum_irate
- expr: |
sum by (node) (
(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) +
irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m]))
* on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info:
)
record: node:node_net_saturation:sum_irate
- name: kube-prometheus-node-recording.rules
rules:
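# Note: this group queries the pre-v0.16.0 node-exporter metric names (node_cpu,
# node_filesystem_size, node_network_receive_bytes), while node.rules above uses
# the renamed node_cpu_seconds_total/node_filesystem_size_bytes forms; which set
# resolves depends on the node-exporter version actually deployed.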
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
record: instance:node_cpu:rate:sum
- expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
record: instance:node_filesystem_usage:sum
- expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
record: instance:node_network_receive_bytes:rate:sum
- expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
record: instance:node_network_transmit_bytes:rate:sum
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
/ ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
record: instance:node_cpu:ratio
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
record: cluster:node_cpu:sum_rate5m
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
record: cluster:node_cpu:ratio
- name: kubernetes.rules
rules:
- expr: sum(container_memory_usage_bytes{container_name!="POD",container_name!="",pod_name!=""})
BY (pod_name, namespace)
record: pod_name:container_memory_usage_bytes:sum
- expr: sum(container_spec_cpu_shares{container_name!="POD",container_name!="",pod_name!=""})
BY (pod_name, namespace)
record: pod_name:container_spec_cpu_shares:sum
- expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",container_name!="",pod_name!=""}[5m]))
BY (pod_name, namespace)
record: pod_name:container_cpu_usage:sum
- expr: sum(container_fs_usage_bytes{container_name!="POD",container_name!="",pod_name!=""})
BY (pod_name, namespace)
record: pod_name:container_fs_usage_bytes:sum
- expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
record: namespace:container_memory_usage_bytes:sum
- expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
record: namespace:container_spec_cpu_shares:sum
- expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",container_name!=""}[5m]))
BY (namespace)
record: namespace:container_cpu_usage:sum
- expr: sum(container_memory_usage_bytes{container_name!="POD",container_name!="",pod_name!=""})
BY (cluster) / sum(machine_memory_bytes) BY (cluster)
record: cluster:memory_usage:ratio
- expr: sum(container_spec_cpu_shares{container_name!="POD",container_name!="",pod_name!=""})
/ 1000 / sum(machine_cpu_cores)
record: cluster:container_spec_cpu_shares:ratio
- expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",container_name!="",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
record: cluster:container_cpu_usage:ratio
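# Error-ratio alerting pattern: the rate of failed reconciles divided by the
# rate of attempts over the same 15m window, scaled to a percentage and
# compared against a threshold.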
- alert: ClusterMonitoringOperatorErrors
annotations:
message: Cluster Monitoring Operator is experiencing {{ printf "%0.0f" $value
}}% errors.
expr: sum(rate(cluster_monitoring_operator_reconcile_errors_total[15m])) * 100
/ sum(rate(cluster_monitoring_operator_reconcile_attempts_total[15m])) > 10
for: 15m
labels:
severity: critical
- name: openshift-build.rules
rules:
- expr: sum(openshift_build_total{job="kubernetes-apiservers",phase="Error"})/(sum(openshift_build_total{job="kubernetes-apiservers",phase=~"Failed|Complete|Error"}))
record: build_error_rate
- name: kubernetes-absent
rules:
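# absent(up{job="..."} == 1) returns an empty result while at least one target
# of the job is up, and a synthetic 1-valued series when none are, so each
# alert below fires when a whole job disappears from target discovery.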
- alert: AlertmanagerDown
annotations:
message: Alertmanager has disappeared from Prometheus target discovery.
expr: |
absent(up{job="alertmanager-main"} == 1)
for: 15m
labels:
severity: critical
- alert: ClusterMonitoringOperatorDown
annotations:
message: ClusterMonitoringOperator has disappeared from Prometheus target
discovery.
expr: |
absent(up{job="cluster-monitoring-operator"} == 1)
for: 15m
labels:
severity: critical
- alert: CoreDNSDown
annotations:
message: CoreDNS has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-dns"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeAPIDown
annotations:
message: KubeAPI has disappeared from Prometheus target discovery.
expr: |
absent(up{job="apiserver"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeControllerManagerDown
annotations:
message: KubeControllerManager has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-controllers"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeSchedulerDown
annotations:
message: KubeScheduler has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-controllers"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsDown
annotations:
message: KubeStateMetrics has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kube-state-metrics"} == 1)
for: 15m
labels:
severity: critical
- alert: KubeletDown
annotations:
message: Kubelet has disappeared from Prometheus target discovery.
expr: |
absent(up{job="kubelet"} == 1)
for: 15m
labels:
severity: critical
- alert: NodeExporterDown
annotations:
message: NodeExporter has disappeared from Prometheus target discovery.
expr: |
absent(up{job="node-exporter"} == 1)
for: 15m
labels:
severity: critical
- alert: PrometheusDown
annotations:
message: Prometheus has disappeared from Prometheus target discovery.
expr: |
absent(up{job="prometheus-k8s"} == 1)
for: 15m
labels:
severity: critical
- alert: PrometheusOperatorDown
annotations:
message: PrometheusOperator has disappeared from Prometheus target discovery.
expr: |
absent(up{job="prometheus-operator"} == 1)
for: 15m
labels:
severity: critical
- alert: TelemeterClientDown
annotations:
message: TelemeterClient has disappeared from Prometheus target discovery.
expr: |
absent(up{job="telemeter-client"} == 1)
for: 15m
labels:
severity: critical
- name: kubernetes-apps
rules:
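# kube_pod_container_status_restarts_total is a counter, so
# rate(...[15m]) * 60 * 5 rescales the per-second restart rate into
# "restarts per 5 minutes" for the alert message below.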
- alert: KubePodCrashLooping
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
expr: |
rate(kube_pod_container_status_restarts_total{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}[15m]) * 60 * 5 > 0
for: 1h
labels:
severity: critical
- alert: KubePodNotReady
annotations:
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than an hour.
expr: |
sum by (namespace, pod) (kube_pod_status_phase{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0
for: 1h
labels:
severity: critical
- alert: KubeDeploymentGenerationMismatch
annotations:
message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match; this indicates that the Deployment has failed but has
not been rolled back.
expr: |
kube_deployment_status_observed_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
!=
kube_deployment_metadata_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeDeploymentReplicasMismatch
annotations:
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
matched the expected number of replicas for longer than an hour.
expr: |
kube_deployment_spec_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
!=
kube_deployment_status_replicas_available{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
for: 1h
labels:
severity: critical
- alert: KubeStatefulSetReplicasMismatch
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
expr: |
kube_statefulset_status_replicas_ready{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
!=
kube_statefulset_status_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetGenerationMismatch
annotations:
message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match; this indicates that the StatefulSet has failed but has
not been rolled back.
expr: |
kube_statefulset_status_observed_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
!=
kube_statefulset_metadata_generation{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
for: 15m
labels:
severity: critical
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
expr: |
max without (revision) (
kube_statefulset_status_current_revision{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
)
*
(
kube_statefulset_replicas{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
!=
kube_statefulset_status_replicas_updated{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
)
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetRolloutStuck
annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
}}/{{ $labels.daemonset }} are scheduled and ready.
expr: |
kube_daemonset_status_number_ready{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
/
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} * 100 < 100
for: 15m
labels:
severity: critical
- alert: KubeDaemonSetNotScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
expr: |
kube_daemonset_status_desired_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"}
-
kube_daemonset_status_current_number_scheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
expr: |
kube_daemonset_status_number_misscheduled{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0
for: 10m
labels:
severity: warning
- alert: KubeCronJobRunning
annotations:
message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more
than 1h to complete.
expr: |
time() - kube_cronjob_next_schedule_time{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 3600
for: 1h
labels:
severity: warning
- alert: KubeJobCompletion
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than one hour to complete.
expr: |
kube_job_spec_completions{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} - kube_job_status_succeeded{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- alert: KubeJobFailed
annotations:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
expr: |
kube_job_status_failed{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0
for: 1h
labels:
severity: warning
- name: kubernetes-resources
rules:
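# Overcommit check: total CPU requests divided by total cores must stay below
# (n-1)/n so the workload still fits if one node is lost. With 4 nodes, for
# example, the alert fires once requests exceed 3/4 of cluster capacity.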
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Pods and cannot
tolerate node failure.
expr: |
sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
/
sum(node:node_num_cpu:sum)
>
(count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Pods and cannot
tolerate node failure.
expr: |
sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
/
sum(node_memory_MemTotal_bytes)
>
(count(node:node_num_cpu:sum)-1)
/
count(node:node_num_cpu:sum)
for: 5m
labels:
severity: warning
- alert: KubeCPUOvercommit
annotations:
message: Cluster has overcommitted CPU resource requests for Namespaces.
expr: |
sum(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard", resource="requests.cpu"})
/
sum(node:node_num_cpu:sum)
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeMemOvercommit
annotations:
message: Cluster has overcommitted memory resource requests for Namespaces.
expr: |
sum(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard", resource="requests.memory"})
/
sum(node_memory_MemTotal_bytes{job="node-exporter"})
> 1.5
for: 5m
labels:
severity: warning
- alert: KubeQuotaExceeded
annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
}}% of its {{ $labels.resource }} quota.
expr: |
100 * kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type)
(kube_resourcequota{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics", type="hard"} > 0)
> 90
for: 15m
labels:
severity: warning
- alert: CPUThrottlingHigh
annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
}} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
}}.'
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m]))
by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m]))
by (container_name, pod_name, namespace)\n > 25 \n"
for: 15m
labels:
severity: warning
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeUsageCritical
annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
}}% free.
expr: |
100 * kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
< 3
for: 1m
labels:
severity: critical
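# predict_linear() below fits a linear regression over 6h of samples and
# extrapolates 4 * 24 * 3600s (four days) ahead; a predicted available-bytes
# value below zero, combined with less than 15% currently free, fires the alert.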
- alert: KubePersistentVolumeFullInFourDays
annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ printf "%0.2f" $value }}% is available.
expr: |
100 * (
kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
/
kubelet_volume_stats_capacity_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}
) < 15
and
predict_linear(kubelet_volume_stats_available_bytes{namespace=~"(openshift-.*|kube-.*|default|logging)",job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m
labels:
severity: critical
- alert: KubePersistentVolumeErrors
annotations:
message: The persistent volume {{ $labels.persistentvolume }} has status {{
$labels.phase }}.
expr: |
kube_persistentvolume_status_phase{phase=~"Failed|Pending",namespace=~"(openshift-.*|kube-.*|default|logging)",job="kube-state-metrics"} > 0
for: 5m
labels:
severity: critical
- name: kubernetes-system
rules:
- alert: KubeNodeNotReady
annotations:
message: '{{ $labels.node }} has been unready for more than an hour.'
expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
- alert: KubeVersionMismatch
annotations:
message: There are {{ $value }} different versions of Kubernetes components
running.
expr: |
count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1
for: 1h
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.
expr: |
(sum(rate(rest_client_requests_total{code!~"2..|404"}[5m])) by (instance, job)
/
sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
expr: |
sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
for: 15m
labels:
severity: warning
- alert: KubeletTooManyPods
annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
to the limit of 110.
expr: |
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
for: 15m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
for: 10m
labels:
severity: warning
- alert: KubeAPILatencyHigh
annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds
for {{ $labels.verb }} {{ $labels.resource }}.
expr: |
cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value }}% of requests.
expr: |
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
for: 10m
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: Kubernetes API certificate is expiring in less than 7 days.
expr: |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
message: Kubernetes API certificate is expiring in less than 24 hours.
expr: |
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels:
severity: critical
- name: alertmanager.rules
rules:
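# count_values("config_hash", ...) yields one series per distinct configuration
# hash; dividing by the operator's expected replica count equals 1 only when
# every Alertmanager instance runs the same configuration.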
- alert: AlertmanagerConfigInconsistent
annotations:
message: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}`
are out of sync.
expr: |
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
for: 5m
labels:
severity: critical
- alert: AlertmanagerFailedReload
annotations:
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
expr: |
alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
for: 10m
labels:
severity: warning
- alert: AlertmanagerMembersInconsistent
annotations:
message: Alertmanager has not found all other members of the cluster.
expr: |
alertmanager_cluster_members{job="alertmanager-main"}
!= on (service) GROUP_LEFT()
count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
for: 5m
labels:
severity: critical
- name: general.rules
rules:
- alert: TargetDown
annotations:
message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
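# Watchdog pattern: vector(1) always fires, so a receiver that stops seeing
# this alert knows the Prometheus-to-Alertmanager pipeline is broken.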
- alert: DeadMansSwitch
annotations:
message: This is a DeadMansSwitch meant to ensure that the entire alerting
pipeline is functional.
expr: vector(1)
labels:
severity: none
- name: kube-prometheus-node-alerting.rules
rules:
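# Both alerts pair a static fullness threshold (> 0.85 used) with a
# predict_linear() extrapolation of available bytes, so they only fire when a
# filesystem is both already full and still trending towards zero.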
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 24 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
for: 30m
labels:
severity: warning
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace
}}/{{ $labels.pod }} will be full within the next 2 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
for: 10m
labels:
severity: critical
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
summary: Reloading Prometheus' configuration failed
expr: |
prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
for: 10m
labels:
severity: warning
- alert: PrometheusNotificationQueueRunningFull
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
summary: Prometheus' alert notification queue is running full
expr: |
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
for: 10m
labels:
severity: warning
- alert: PrometheusErrorSendingAlerts
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.alertmanager}}
summary: Errors while sending alerts from Prometheus
expr: |
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
for: 10m
labels:
severity: critical
- alert: PrometheusNotConnectedToAlertmanagers
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
summary: Prometheus is not connected to any Alertmanagers
expr: |
prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
for: 10m
labels:
severity: warning
- alert: PrometheusTSDBReloadsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk
expr: |
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBCompactionsFailing
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
compaction failures over the last two hours.'
summary: Prometheus has issues compacting sample blocks
expr: |
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
for: 12h
labels:
severity: warning
- alert: PrometheusTSDBWALCorruptions
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted
expr: |
tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
for: 4h
labels:
severity: warning
- alert: PrometheusNotIngestingSamples
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
samples.
summary: Prometheus isn't ingesting samples
expr: |
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
for: 10m
labels:
severity: warning
- alert: PrometheusTargetScrapesDuplicate
annotations:
description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
due to duplicate timestamps but different values'
summary: Prometheus has many samples rejected
expr: |
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
for: 10m
labels:
severity: warning
- name: prometheus-operator
rules:
- alert: PrometheusOperatorReconcileErrors
annotations:
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
}} Namespace.
expr: |
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- alert: PrometheusOperatorNodeLookupErrors
annotations:
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
expr: |
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
for: 10m
labels:
severity: warning
- name: etcd
rules:
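# Quorum arithmetic: an n-member etcd cluster needs more than n/2 healthy
# members. The alert compares live members against (n+1)/2, so with 5 members
# it fires as soon as fewer than 3 report up == 1.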
- alert: EtcdInsufficientMembers
annotations:
message: 'Etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
expr: |
sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: EtcdNoLeader
annotations:
message: 'Etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
expr: |
etcd_server_has_leader{job=~".*etcd.*"} == 0
for: 1m
labels:
severity: critical
- alert: EtcdHighNumberOfLeaderChanges
annotations:
message: 'Etcd cluster "{{ $labels.job }}": instance {{ $labels.instance }}
has seen {{ $value }} leader changes within the last hour.'
expr: |
rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
for: 15m
labels:
severity: warning
- alert: EtcdHighNumberOfFailedGRPCRequests
annotations:
message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 1
for: 10m
labels:
severity: warning
- alert: EtcdHighNumberOfFailedGRPCRequests
annotations:
message: 'Etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
expr: |
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
/
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
> 5
for: 5m
labels:
severity: critical
- alert: EtcdGRPCRequestsSlow
annotations:
message: 'Etcd cluster "{{ $labels.job }}": gRPC requests to {{ $labels.grpc_method
}} are taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
- alert: EtcdMemberCommunicationSlow
annotations:
message: 'Etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To
}} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.15
for: 10m
labels:
severity: warning
- alert: EtcdHighNumberOfFailedProposals
annotations:
message: 'Etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last hour on etcd instance {{ $labels.instance }}.'
expr: |
rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 15m
labels:
severity: warning
- alert: EtcdHighFsyncDurations
annotations:
message: 'Etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.5
for: 10m
labels:
severity: warning
- alert: EtcdHighCommitDurations
annotations:
message: 'Etcd cluster "{{ $labels.job }}": 99th percentile commit durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
> 0.25
for: 10m
labels:
severity: warning
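# instance:fd_utilization records the open/max file-descriptor ratio;
# predict_linear(...[1h], 3600 * 4) > 1 fires when the 1h trend projects the
# ratio to exceed 100% within four hours (the second rule: within one hour,
# from a 10m trend).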
- expr: process_open_fds / process_max_fds
record: instance:fd_utilization
- alert: FdExhaustionClose
annotations:
message: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust its
file descriptors within the next 4 hours'
expr: |
predict_linear(instance:fd_utilization{job=~".*etcd.*"}[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
- alert: FdExhaustionClose
annotations:
message: '{{ $labels.job }} instance {{ $labels.instance }} will exhaust
its file descriptors within the next hour'
expr: |
predict_linear(instance:fd_utilization{job=~".*etcd.*"}[10m], 3600) > 1
for: 10m
labels:
severity: critical