Skip to content

Commit

Permalink
WIP: Extend CVO alerts to cover update retrieval
Browse files Browse the repository at this point in the history
  • Loading branch information
jottofar committed Apr 23, 2020
1 parent 2c4931d commit 1487e3b
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@ spec:
for: 10m
labels:
severity: critical
- alert: ClusterVersionOperatorNotRetrieving
annotations:
message: Cluster version operator has not retrieved updates for 10 mins. Failure reason {{ "{{ $labels.reason }}" }}. {{ "{{ $labels.message }}" }}.
expr: |
cluster_version_operator_retrieving{job="cluster-version-operator"} == 0
for: 10m
labels:
severity: critical
- name: cluster-operators
rules:
- alert: ClusterOperatorDown
Expand Down
22 changes: 22 additions & 0 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ import (
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/client-go/tools/cache"
"k8s.io/klog"

configv1 "github.com/openshift/api/config/v1"
"github.com/openshift/cluster-version-operator/lib/resourcemerge"
"github.com/openshift/cluster-version-operator/pkg/internal"
)

func (optr *Operator) registerMetrics(coInformer cache.SharedInformer) error {
klog.Infof("registerMetrics")
m := newOperatorMetrics(optr)
coInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
UpdateFunc: m.clusterOperatorChanged,
Expand All @@ -34,6 +36,7 @@ type operatorMetrics struct {
clusterOperatorConditions *prometheus.GaugeVec
clusterOperatorConditionTransitions *prometheus.GaugeVec
clusterInstaller *prometheus.GaugeVec
clusterVersionOperatorRetrieving *prometheus.GaugeVec
}

func newOperatorMetrics(optr *Operator) *operatorMetrics {
Expand Down Expand Up @@ -83,6 +86,10 @@ version for 'cluster', or empty for 'initial'.
Name: "cluster_installer",
Help: "Reports info about the installation process and, if applicable, the install tool.",
}, []string{"type", "version", "invoker"}),
clusterVersionOperatorRetrieving: prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "cluster_version_operator_retrieving",
Help: "Reports when updates are not being successfully retrieved and reports the reason.",
}, []string{"reason", "message"}),
}
}

Expand All @@ -93,6 +100,7 @@ type conditionKey struct {

// clusterOperatorChanged detects condition transitions and records them
func (m *operatorMetrics) clusterOperatorChanged(oldObj, obj interface{}) {
klog.Infof("clusterOperatorChanged")
oldCO, ok := oldObj.(*configv1.ClusterOperator)
if !ok {
return
Expand Down Expand Up @@ -133,9 +141,11 @@ func (m *operatorMetrics) Describe(ch chan<- *prometheus.Desc) {
ch <- m.clusterOperatorConditions.WithLabelValues("", "", "").Desc()
ch <- m.clusterOperatorConditionTransitions.WithLabelValues("", "").Desc()
ch <- m.clusterInstaller.WithLabelValues("", "", "").Desc()
ch <- m.clusterVersionOperatorRetrieving.WithLabelValues("", "").Desc()
}

func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
klog.Infof("Collect")
current := m.optr.currentVersion()
var completed configv1.UpdateHistory

Expand Down Expand Up @@ -297,6 +307,18 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
g.Set(1.0)
ch <- g
}

if availableUpdates := m.optr.getAvailableUpdates(); availableUpdates != nil {
klog.Infof("Status: %s", availableUpdates.Condition.Status)
if availableUpdates.Condition.Status == configv1.ConditionFalse {
klog.Infof("Reason: %s", availableUpdates.Condition.Reason)
klog.Infof("Message: %s", availableUpdates.Condition.Message)
g := m.clusterVersionOperatorRetrieving.WithLabelValues(availableUpdates.Condition.Reason, availableUpdates.Condition.Message)
//g := m.clusterVersionOperatorRetrieving.WithLabelValues("FooBar")
g.Set(1.0)
ch <- g
}
}
}

func gaugeFromInstallConfigMap(cm *corev1.ConfigMap, gauge *prometheus.GaugeVec, installType string) prometheus.Gauge {
Expand Down

0 comments on commit 1487e3b

Please sign in to comment.