openshift · openshift-merge-robot · Feb 25, 2021 · Feb 18, 2021
diff --git a/assets/prometheus-k8s/rules.yaml b/assets/prometheus-k8s/rules.yaml
@@ -961,6 +961,24 @@ spec:
         offset 25s) or (absent(cluster:usage:workload:capacity_physical_cpu_core_seconds
         offset 25s)*0))
       record: cluster:usage:workload:capacity_physical_cpu_core_seconds
+  - name: openshift-etcd.rules
+    rules:
+    - alert: etcdInsufficientMembers
+      annotations:
+        message: etcd is reporting fewer instances are available than are needed ({{
+          $value }}). When etcd does not have a majority of instances available the
+          Kubernetes and OpenShift APIs will reject read and write requests and operations
+          that preserve the health of workloads cannot be performed. This can occur
+          when multiple control plane nodes are powered off or are unable to connect
+          to each other via the network. Check that all control plane nodes are powered
+          on and that network connections between each machine are functional.
+        summary: etcd is reporting that a majority of instances are unavailable.
+      expr: sum((up{job="etcd"} == bool 1) * (etcd_server_has_leader{job="etcd"} ==
+        bool 1)) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod)
+        + 1) / 2)
+      for: 3m
+      labels:
+        severity: critical
   - name: openshift-ingress.rules
     rules:
     - expr: sum by (code) (rate(haproxy_server_http_responses_total[5m]) > 0)
@@ -2335,16 +2353,6 @@ spec:
       for: 10m
       labels:
         severity: critical
-    - alert: etcdInsufficientMembers
-      annotations:
-        description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
-          }}).'
-        summary: etcd cluster has insufficient number of members.
-      expr: |
-        sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
-      for: 3m
-      labels:
-        severity: critical
     - alert: etcdNoLeader
       annotations:
         description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance

diff --git a/jsonnet/main.jsonnet b/jsonnet/main.jsonnet
@@ -37,7 +37,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') +
                  std.map(
                    function(ruleGroup)
                      if ruleGroup.name == 'etcd' then
-                       ruleGroup { rules: std.filter(function(rule) !('alert' in rule && rule.alert == 'etcdHighNumberOfFailedGRPCRequests'), ruleGroup.rules) }
+                       ruleGroup { rules: std.filter(function(rule) !('alert' in rule && (rule.alert == 'etcdHighNumberOfFailedGRPCRequests' || rule.alert == 'etcdInsufficientMembers')), ruleGroup.rules) }
                      else if ruleGroup.name == 'kubernetes-system' then
                        ruleGroup { rules: std.filter(function(rule) !('alert' in rule && rule.alert == 'KubeVersionMismatch'), ruleGroup.rules) }
                      // Removing CPUThrottlingHigh alert as per https://bugzilla.redhat.com/show_bug.cgi?id=1843346

diff --git a/jsonnet/rules.jsonnet b/jsonnet/rules.jsonnet
@@ -371,6 +371,27 @@ local droppedKsmLabels = 'endpoint, instance, job, pod, service';
           },
         ],
       },
+      {
+        name: 'openshift-etcd.rules',
+        rules: [
+          {
+            // A member counts as available only when its scrape succeeds (up == 1) AND it
+            // reports a leader. The two 0/1 vectors are multiplied instead of being joined
+            // with `and`: `and` only checks that a matching series exists, so with
+            // `== bool` on the right-hand side a leaderless member would still be counted.
+            expr: 'sum((up{job="etcd"} == bool 1) * (etcd_server_has_leader{job="etcd"} == bool 1)) without (instance,pod) < ((count(up{job="etcd"}) without (instance,pod) + 1) / 2)',
+            alert: 'etcdInsufficientMembers',
+            'for': '3m',
+            annotations: {
+              message: 'etcd is reporting fewer instances are available than are needed ({{ $value }}). When etcd does not have a majority of instances available the Kubernetes and OpenShift APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional.',
+              summary: 'etcd is reporting that a majority of instances are unavailable.',
+            },
+            labels: {
+              severity: 'critical',
+            },
+          },
+        ],
+      },
       {
         name: 'openshift-ingress.rules',
         rules: [