From 9b666d6f20040bacbb56e10a4d2d67e34f7fd9be Mon Sep 17 00:00:00 2001
From: Vikas Choudhary
Date: Fri, 23 Aug 2019 12:11:16 +0530
Subject: [PATCH] Add a new alert rule for reporting the MAO failure

---
Reviewer notes (text in this section is not part of the commit message):

* The manifest previously held two PrometheusRule documents, both named
  machineapioperator-rules in namespace openshift-machine-api. This patch
  folds them into one object, machine-api-operator-prometheus-rules, with
  one rule group per alert instead of two groups both called general.rules.
* MachineMAOMetricsDown is renamed to
  MachineAPIOperatorMetricsCollectionFailing. The expression
  (mapi_mao_collector_up == 0 for 3m) is unchanged; only the message
  differs.
* New alert MachineAPIOperatorDown fires when no series
  up{job="machine-api-operator"} == 1 has existed for 3m.
* The "oc logs" hint in the collection-failing message needs a pod name to
  be runnable, so it carries a <machine-api-operator-pod-name> placeholder.
* NOTE(review): the From: header has no e-mail address; "git am" may
  refuse the mail until it is restored ("git apply" is unaffected).
* NOTE(review): leading whitespace inside the hunks was reconstructed
  assuming 2-space indentation with indented sequences; confirm the
  context lines against the target file before applying.

 ...90_machine-api-operator_04_alertrules.yaml | 32 +++++++++----------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/install/0000_90_machine-api-operator_04_alertrules.yaml b/install/0000_90_machine-api-operator_04_alertrules.yaml
index 3acd12374b..6639790d40 100644
--- a/install/0000_90_machine-api-operator_04_alertrules.yaml
+++ b/install/0000_90_machine-api-operator_04_alertrules.yaml
@@ -4,11 +4,11 @@ metadata:
   labels:
     prometheus: k8s
     role: alert-rules
-  name: machineapioperator-rules
+  name: machine-api-operator-prometheus-rules
   namespace: openshift-machine-api
 spec:
   groups:
-    - name: general.rules
+    - name: machine-without-valid-node-ref
       rules:
         - alert: MachineWithoutValidNode
           expr: |
@@ -18,25 +18,23 @@ spec:
             severity: critical
           annotations:
             message: "machine {{ $labels.name }} does not have valid node reference"
-
----
-apiVersion: monitoring.coreos.com/v1
-kind: PrometheusRule
-metadata:
-  labels:
-    prometheus: k8s
-    role: alert-rules
-  name: machineapioperator-rules
-  namespace: openshift-machine-api
-spec:
-  groups:
-    - name: general.rules
+    - name: machine-api-operator-metrics-collector-up
       rules:
-        - alert: MachineMAOMetricsDown
+        - alert: MachineAPIOperatorMetricsCollectionFailing
           expr: |
             mapi_mao_collector_up == 0
           for: 3m
           labels:
             severity: critical
           annotations:
-            message: "machine api MAO metric {{ $labels.kind }} is not being reported"
+            message: "machine api operator metrics collection is failing. For more details: oc logs <machine-api-operator-pod-name> -n openshift-machine-api"
+    - name: machine-api-operator-down
+      rules:
+        - alert: MachineAPIOperatorDown
+          expr: |
+            absent(up{job="machine-api-operator"} == 1)
+          for: 3m
+          labels:
+            severity: critical
+          annotations:
+            message: "machine api operator is down"