# 0000_90_cluster-version-operator_02_servicemonitor.yaml
# ServiceMonitor for the cluster-version operator. It selects objects labeled
# k8s-app=cluster-version-operator in the openshift-cluster-version namespace
# and scrapes the port named "metrics" on them.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    k8s-app: cluster-version-operator
  name: cluster-version-operator
  namespace: openshift-cluster-version
  annotations:
    # Quoted so the value stays a string instead of becoming a YAML boolean.
    # NOTE(review): presumably tells the release tooling to skip this manifest
    # for an internally hosted OpenShift profile - confirm with the consumer.
    exclude.release.openshift.io/internal-openshift-hosted: "true"
spec:
  endpoints:
    # One scrape endpoint: every 30 seconds, over plain HTTP.
    - interval: 30s
      port: metrics
      scheme: http
  # Restrict target discovery to the operator's own namespace.
  namespaceSelector:
    matchNames:
      - openshift-cluster-version
  selector:
    matchLabels:
      k8s-app: cluster-version-operator
---
# Alerting rules built on the metrics scraped under the
# "cluster-version-operator" job. Two groups are defined: one for the
# cluster-version operator itself and one for the cluster operators it reports.
#
# The message annotations use {{ "{{ ... }}" }}: the outer template action
# prints the inner {{ ... }} text literally.
# NOTE(review): presumably this file passes through a Go-template render before
# Prometheus sees it, so the escaping defers label expansion to alert time -
# confirm. Keep each such action on a single physical line.
#
# Messages use folded scalars (>-), which parse to one line with no trailing
# newline. Expressions use literal scalars (|), which keep a trailing newline.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    k8s-app: cluster-version-operator
  name: cluster-version-operator
  namespace: openshift-cluster-version
  annotations:
    # Quoted so the value stays a string instead of becoming a YAML boolean.
    # NOTE(review): presumably tells the release tooling to skip this manifest
    # for an internally hosted OpenShift profile - confirm with the consumer.
    exclude.release.openshift.io/internal-openshift-hosted: "true"
spec:
  groups:
    - name: cluster-version
      rules:
        # Fires when no series for the job has up == 1 for 10 minutes.
        # absent() also covers the case where the target has vanished.
        - alert: ClusterVersionOperatorDown
          annotations:
            message: >-
              Cluster version operator has disappeared from Prometheus
              target discovery. Operator may be down or disabled, cluster
              will not be kept up to date and upgrades will not be possible.
          expr: |
            absent(up{job="cluster-version-operator"} == 1)
          for: 10m
          labels:
            severity: critical
    - name: cluster-operators
      rules:
        # Fires per series whose cluster_operator_up value stays at 0 for
        # 10 minutes.
        - alert: ClusterOperatorDown
          annotations:
            message: >-
              Cluster operator {{ "{{ $labels.name }}" }} has not been
              available for 10 mins. Operator may be down or disabled,
              cluster will not be kept up to date and upgrades will not be
              possible.
          expr: |
            cluster_operator_up{job="cluster-version-operator"} == 0
          for: 10m
          labels:
            severity: critical
        # Fires per series whose condition="Degraded" value stays at 1 for
        # 10 minutes. The message reads the series' "reason" label.
        - alert: ClusterOperatorDegraded
          annotations:
            message: >-
              Cluster operator {{ "{{ $labels.name }}" }} has been degraded
              for 10 mins. Operator is degraded because
              {{ "{{ $labels.reason }}" }} and cluster upgrades will be
              unstable.
          expr: |
            cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"} == 1
          for: 10m
          labels:
            severity: critical
        # Fires when cluster_operator_up changed value more than twice within
        # a 2-minute window, and that stays true for 10 minutes. Only a
        # warning, unlike the critical alerts above.
        - alert: ClusterOperatorFlapping
          annotations:
            message: >-
              Cluster operator {{ "{{ $labels.name }}" }} up status is
              changing often. This might cause upgrades to be unstable.
          expr: |
            changes(cluster_operator_up{job="cluster-version-operator"}[2m]) > 2
          for: 10m
          labels:
            severity: warning