openshift · openshift-merge-robot · Oct 18, 2022 · Oct 18, 2022 · gnufied · Oct 18, 2022
diff --git a/manifests/12_prometheusrules.yaml b/manifests/12_prometheusrules.yaml
@@ -25,3 +25,22 @@ spec:
             Cluster storage operator monitors all storage classes configured in the cluster
             and checks there is not more than one default StorageClass configured.
           message: "StorageClass count check is failing (there should not be more than one default StorageClass)"
+
+    - name: storage-operations.rules
+      rules:
+      - alert: PodStartupStorageOperationsFailing
+        # A failing series fired in the past 5 minutes *and* the success series with the same labels (all but status) did not increase.
+        # Focus on attach and mount operations - they have the same diagnostic steps and are the most common.
+        expr: |
+          increase(storage_operation_duration_seconds_count{status != "success", operation_name =~"volume_attach|volume_mount"}[5m]) > 0
+          and ignoring(status) increase(storage_operation_duration_seconds_count{status = "success", operation_name =~"volume_attach|volume_mount"}[5m]) == 0
+        for: 5m
+        labels:
+          severity: info
+        annotations:
+          summary: "Pods can't start because {{ $labels.operation_name }} of volume plugin {{ $labels.volume_plugin }} is permanently failing on node {{ $labels.node }}."
+          description: |
+            Failing storage operation "{{ $labels.operation_name }}" of volume plugin {{ $labels.volume_plugin }} was preventing Pods on node {{ $labels.node }}
+            from starting for past 5 minutes.
+            Please investigate Pods that are "ContainerCreating" on the node: "oc get pod --field-selector=spec.nodeName={{ $labels.node }} --all-namespaces | grep ContainerCreating".
+            Events of the Pods should contain exact error message: "oc describe pod -n <pod namespace> <pod name>".