From adb7bf730de687ee36e83cf3fd1fb54e9183744d Mon Sep 17 00:00:00 2001
From: Jacob Tanenbaum
Date: Tue, 24 Sep 2019 10:51:21 -0400
Subject: [PATCH] Correct the way nodes are computed for alert ClusterIPTablesStale

Change kube_pod_info_node_count to
sum(kube_pod_info{namespace="openshift-sdn", pod=~"ovs.*"}); this more
accurately computes the alert by returning the number of nodes that have an
ovs pod running.

Also change from using time() to timestamp().
---
 bindata/network/openshift-sdn/alert-rules.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/bindata/network/openshift-sdn/alert-rules.yaml b/bindata/network/openshift-sdn/alert-rules.yaml
index 4a12ba6672..415e9f3382 100644
--- a/bindata/network/openshift-sdn/alert-rules.yaml
+++ b/bindata/network/openshift-sdn/alert-rules.yaml
@@ -67,7 +67,10 @@ spec:
       annotations:
         message: The average time between iptables resyncs is too high. NOTE - There is some scrape delay and other offsets, 90s isn't exact but it is still too high.
       expr: |
-        time() - (sum(kubeproxy_sync_proxy_rules_last_timestamp_seconds) / :kube_pod_info_node_count:) > 90
+        quantile(0.95,
+          timestamp(kubeproxy_sync_proxy_rules_last_timestamp_seconds)
+          - on(pod) kubeproxy_sync_proxy_rules_last_timestamp_seconds
+          * on(pod) group_right kube_pod_info{namespace="openshift-sdn", pod=~"sdn-[^-]*"}) > 90
       for: 20m
       labels:
         severity: warning
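
Note (annotation, not part of the patch): a rough reading of the new expression,
assuming kubeproxy_sync_proxy_rules_last_timestamp_seconds and kube_pod_info
both carry a matching "pod" label. The inner expression computes, for each
openshift-sdn pod, the seconds elapsed between the scrape and that pod's last
iptables resync; quantile(0.95, ...) then takes the 95th percentile of that age
across pods, and the alert fires when it exceeds 90 seconds. The per-pod age can
be inspected on its own in the Prometheus console with a query along these
lines (the alert expression from the hunk above, minus the quantile):

    # seconds since the last iptables resync, per openshift-sdn pod (sketch)
    timestamp(kubeproxy_sync_proxy_rules_last_timestamp_seconds)
      - on(pod) kubeproxy_sync_proxy_rules_last_timestamp_seconds
      * on(pod) group_right kube_pod_info{namespace="openshift-sdn", pod=~"sdn-[^-]*"}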