# 0000_90_machine-config-operator_01_prometheus-rules.yaml
---
# PrometheusRule carrying the alerting rules for the machine-config-daemon
# (MCD). Each rule group below holds exactly one alert.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: machine-config-daemon
  namespace: openshift-machine-config-operator
  labels:
    k8s-app: machine-config-daemon
  annotations:
    # Annotation values must be strings, so the booleans stay quoted.
    include.release.openshift.io/ibm-cloud-managed: "true"
    include.release.openshift.io/self-managed-high-availability: "true"
    include.release.openshift.io/single-node-developer: "true"
spec:
  groups:
    # Critical: raised whenever mcd_reboot_err is above zero. There is no
    # `for:` clause, so no hold period applies before the alert fires.
    - name: mcd-reboot-error
      rules:
        - alert: MCDRebootError
          expr: |
            mcd_reboot_err > 0
          labels:
            severity: critical
          annotations:
            message: "Reboot failed on {{ $labels.node }} , update may be blocked"

    # Warning: raised whenever mcd_drain_err is above zero. The message
    # points at the daemon pod logs for further details.
    - name: mcd-drain-error
      rules:
        - alert: MCDDrainError
          expr: |
            mcd_drain_err > 0
          labels:
            severity: warning
          annotations:
            message: "Drain failed on {{ $labels.node }} , updates may be blocked. For more details: oc logs -f -n openshift-machine-config-operator machine-config-daemon-<hash> -c machine-config-daemon"

    # Warning: raised whenever mcd_pivot_err is above zero.
    - name: mcd-pivot-error
      rules:
        - alert: MCDPivotError
          expr: |
            mcd_pivot_err > 0
          labels:
            severity: warning
          annotations:
            message: "Error detected in pivot logs on {{ $labels.node }} "

    # Warning: raised once mcd_kubelet_state exceeds 2.
    # NOTE(review): the metric presumably counts consecutive kubelet health
    # check failures -- confirm against the daemon's metric definition.
    - name: mcd-kubelet-health-state-error
      rules:
        - alert: KubeletHealthState
          expr: |
            mcd_kubelet_state > 2
          labels:
            severity: warning
          annotations:
            message: "Kubelet health failure threshold reached"

    # Warning: per node, the RSS of the /system.slice cgroup is compared with
    # 95% of the memory reservation, where the reservation is computed as
    # capacity minus allocatable memory. The condition has to hold for 15
    # minutes before the alert fires.
    - name: system-memory-exceeds-reservation
      rules:
        - alert: SystemMemoryExceedsReservation
          expr: |
            sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95)
          for: 15m
          labels:
            severity: warning
          annotations:
            message: "System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the node. The default reservation is expected to be sufficient for most configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state)."