From fa7b5bee5369da2eaa94f80f6d34b6fe4899a920 Mon Sep 17 00:00:00 2001 From: Harshal Patil Date: Thu, 2 Sep 2021 15:14:51 +0530 Subject: [PATCH] bump SystemMemoryExceedsReservation alert threshold to 95% In order to reduce the excessive firing of this alert, we are making it fire when system memory usage exceeds 95% of the reserved memory instead of the current 90%. By making this change, we hope to keep this alert relevant for users while reducing the noise. Signed-off-by: Harshal Patil --- .../0000_90_machine-config-operator_01_prometheus-rules.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml index 5f258b7d6c..470a9c1396 100644 --- a/install/0000_90_machine-config-operator_01_prometheus-rules.yaml +++ b/install/0000_90_machine-config-operator_01_prometheus-rules.yaml @@ -47,9 +47,9 @@ spec: rules: - alert: SystemMemoryExceedsReservation expr: | - sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.9) + sum by (node) (container_memory_rss{id="/system.slice"}) > ((sum by (node) (kube_node_status_capacity{resource="memory"} - kube_node_status_allocatable{resource="memory"})) * 0.95) for: 15m labels: severity: warning annotations: - message: "System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 90% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the node. 
The default reservation is expected to be sufficient for most configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state)." + message: "System memory usage of {{ $value | humanize }} on {{ $labels.node }} exceeds 95% of the reservation. Reserved memory ensures system processes can function even when the node is fully allocated and protects against workload out of memory events impacting the proper functioning of the node. The default reservation is expected to be sufficient for most configurations and should be increased (https://docs.openshift.com/container-platform/latest/nodes/nodes/nodes-nodes-managing.html) when running nodes with high numbers of pods (either due to rate of change or at steady state)."