From ba83e2b26b5e2bbe1afa311788e111773cd23a4d Mon Sep 17 00:00:00 2001
From: Jan Safranek
Date: Tue, 18 Oct 2022 13:00:21 +0200
Subject: [PATCH] Add alert about attach / mount failing

Show an alert when all attach or mount operations for a volume plugin
(or CSI driver) are failing on a node for 5 minutes. A single success
will make the alert go away.
---
Review note: the hunk below was edited by hand, so the post-image blob id
on the "index" line no longer matches the resulting file content.

 manifests/12_prometheusrules.yaml | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/manifests/12_prometheusrules.yaml b/manifests/12_prometheusrules.yaml
index edc51a2cd..6fd8234ee 100644
--- a/manifests/12_prometheusrules.yaml
+++ b/manifests/12_prometheusrules.yaml
@@ -25,3 +25,24 @@ spec:
             Cluster storage operator monitors all storage classes configured in the cluster
             and checks there is not more than one default StorageClass configured.
           message: "StorageClass count check is failing (there should not be more than one default StorageClass)"
+
+    - name: storage-operations.rules
+      rules:
+      - alert: PodStartupStorageOperationsFailing
+        # There was at least one failing operation in past 5 minutes *and* there was no successful one.
+        # "unless ignoring(status)" compares failures only with successes of the series that has
+        # the same remaining labels, and also fires when that series has no successes recorded at all.
+        # Focus on attach and mount operations - they have the same diagnostic steps and are the most common.
+        expr: |
+          increase(storage_operation_duration_seconds_count{status != "success", operation_name =~"volume_attach|volume_mount"}[5m]) > 0
+            unless ignoring(status) increase(storage_operation_duration_seconds_count{status = "success", operation_name =~"volume_attach|volume_mount"}[5m]) > 0
+        for: 5m
+        labels:
+          severity: info
+        annotations:
+          summary: "Pods can't start because {{ $labels.operation_name }} of volume plugin {{ $labels.volume_plugin }} is permanently failing on node {{ $labels.node }}."
+          description: |
+            Failing storage operation "{{ $labels.operation_name }}" of volume plugin {{ $labels.volume_plugin }} was preventing Pods on node {{ $labels.node }}
+            from starting for past 5 minutes.
+            Please investigate Pods that are "ContainerCreating" on the node: "oc get pod --field-selector=spec.nodeName={{ $labels.node }} --all-namespaces | grep ContainerCreating".
+            Events of the Pods should contain exact error message: "oc describe pod -n <pod namespace> <pod name>".