Skip to content

Commit

Permalink
pkg/cvo/metrics: Connect ClusterVersion to ClusterOperatorDown and Cl…
Browse files Browse the repository at this point in the history
…usterOperatorDegraded

By adding cluster_operator_up handling for ClusterVersion, with
'version' as the component name, the same way we handle
cluster_operator_conditions.  This plugs us into ClusterOperatorDown
(based on cluster_operator_up) and ClusterOperatorDegraded (based on
both cluster_operator_conditions and cluster_operator_up).

I've adjusted the ClusterOperatorDegraded rule so that it fires on
ClusterVersion Failing=True and does not fire on Failing=False.
Thinking through an update from before:

1. Outgoing CVO does not serve cluster_operator_up{name="version"}.
2. User requests an update to a release with this change.
3. New CVO comes in, starts serving
   cluster_operator_up{name="version"}.
4. Old ClusterOperatorDegraded no matching
   cluster_operator_conditions{name="version",condition="Degraded"},
   falls through to cluster_operator_up{name="version"}, and starts
   cooking the 'for: 30m'.
5. If we go more than 30m before updating the ClusterOperatorDegraded
   rule to understand Failing, ClusterOperatorDegraded would fire.

We'll need to backport the ClusterOperatorDegraded expr change to one
4.y release before the CVO-metrics change lands to get:

1. Outgoing CVO does not serve cluster_operator_up{name="version"}.
2. User requests an update to a release with the expr change.
3. Incoming ClusterOperatorDegraded sees no
   cluster_operator_conditions{name="version",condition="Degraded"},
   cluster_operator_conditions{name="version",condition="Failing"} (we
   hope), or cluster_operator_up{name="version"}, so it doesn't fire.
   Unless we are Failing=True, in which case, hooray, we'll start
   alerting about it.
4. User requests an update to a release with the CVO-metrics change.
5. New CVO starts serving cluster_operator_up, just like the
   fresh-modern-install situation, and everything is great.

The missing-ClusterVersion metrics don't matter all that much today,
because the CVO has been creating replacement ClusterVersion since at
least 90e9881 (cvo: Change the core CVO loops to report status to
ClusterVersion, 2018-11-02, #45).  But it will become more important
with [1], which is planning on removing that default creation.  When
there is no ClusterVersion, we expect ClusterOperatorDown to fire.

[1]: #741
  • Loading branch information
wking committed Mar 19, 2024
1 parent 8f4a120 commit 1c99f1d
Show file tree
Hide file tree
Showing 3 changed files with 99 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ spec:
max by (namespace, name, reason)
(
(
cluster_operator_conditions{job="cluster-version-operator", condition="Degraded"}
cluster_operator_conditions{job="cluster-version-operator", name!="version", condition="Degraded"}
or on (namespace, name)
cluster_operator_conditions{job="cluster-version-operator", name="version", condition="Failing"}
or on (namespace, name)
group by (namespace, name) (cluster_operator_up{job="cluster-version-operator"})
) == 1
Expand Down
24 changes: 22 additions & 2 deletions pkg/cvo/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,7 +374,16 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
current := m.optr.currentVersion()
var completed configv1.UpdateHistory

if cv, err := m.optr.cvLister.Get(m.optr.name); err == nil {
if cv, err := m.optr.cvLister.Get(m.optr.name); apierrors.IsNotFound(err) {
g := m.clusterOperatorUp.WithLabelValues("version", "", "ClusterVersionNotFound")
g.Set(0)
ch <- g

g = m.clusterOperatorConditions.WithLabelValues("version", string(configv1.OperatorAvailable), "ClusterVersionNotFound")
g.Set(0)
ch <- g
} else if err == nil {

// output cluster version

var initial configv1.UpdateHistory
Expand Down Expand Up @@ -482,7 +491,18 @@ func (m *operatorMetrics) Collect(ch chan<- prometheus.Metric) {
klog.V(2).Infof("skipping metrics for ClusterVersion condition %s=%s (neither True nor False)", condition.Type, condition.Status)
continue
}
g := m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))

if condition.Type == configv1.OperatorAvailable {
g = m.clusterOperatorUp.WithLabelValues("version", completed.Version, string(condition.Reason))
if condition.Status == configv1.ConditionTrue {
g.Set(1)
} else {
g.Set(0)
}
ch <- g
}

g = m.clusterOperatorConditions.WithLabelValues("version", string(condition.Type), string(condition.Reason))
if condition.Status == configv1.ConditionTrue {
g.Set(1)
} else {
Expand Down
123 changes: 74 additions & 49 deletions pkg/cvo/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
releaseCreated: time.Unix(3, 0),
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "", "version": "", "invoker": ""})
},
},
{
Expand All @@ -85,11 +87,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "", "version": "", "invoker": ""})
},
},
{
Expand Down Expand Up @@ -225,12 +229,14 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 3 {
if len(metrics) != 5 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1", "reason": "NoAvailableCondition"})
expectMetric(t, metrics[2], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[3], 0, map[string]string{"name": "test", "version": "10.1.5-1", "reason": "NoAvailableCondition"})
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
},
},
{
Expand All @@ -256,13 +262,15 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 4 {
if len(metrics) != 6 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[1], 0, map[string]string{"name": "test", "version": "10.1.5-1"})
expectMetric(t, metrics[2], 0, map[string]string{"name": "test", "condition": "Available"})
expectMetric(t, metrics[3], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[3], 0, map[string]string{"name": "test", "version": "10.1.5-1"})
expectMetric(t, metrics[4], 0, map[string]string{"name": "test", "condition": "Available"})
expectMetric(t, metrics[5], 1, map[string]string{"type": ""})
},
},
{
Expand All @@ -289,14 +297,16 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 5 {
if len(metrics) != 7 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[1], 1, map[string]string{"name": "test", "version": "10.1.5-1"})
expectMetric(t, metrics[2], 1, map[string]string{"name": "test", "condition": "Available"})
expectMetric(t, metrics[3], 1, map[string]string{"name": "test", "condition": "Degraded"})
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[3], 1, map[string]string{"name": "test", "version": "10.1.5-1"})
expectMetric(t, metrics[4], 1, map[string]string{"name": "test", "condition": "Available"})
expectMetric(t, metrics[5], 1, map[string]string{"name": "test", "condition": "Degraded"})
expectMetric(t, metrics[6], 1, map[string]string{"type": ""})
},
},
{
Expand All @@ -321,14 +331,16 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 5 {
if len(metrics) != 7 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[1], 1, map[string]string{"name": "test", "version": ""})
expectMetric(t, metrics[2], 1, map[string]string{"name": "test", "condition": "Available", "reason": ""})
expectMetric(t, metrics[3], 0, map[string]string{"name": "test", "condition": "Custom", "reason": "CustomReason"})
expectMetric(t, metrics[4], 1, map[string]string{"type": ""})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 0, map[string]string{"type": "current", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[3], 1, map[string]string{"name": "test", "version": ""})
expectMetric(t, metrics[4], 1, map[string]string{"name": "test", "condition": "Available", "reason": ""})
expectMetric(t, metrics[5], 0, map[string]string{"name": "test", "condition": "Custom", "reason": "CustomReason"})
expectMetric(t, metrics[6], 1, map[string]string{"type": ""})
},
},
{
Expand Down Expand Up @@ -424,15 +436,16 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 6 {
if len(metrics) != 7 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 2, map[string]string{"type": "initial", "version": "", "image": "", "from_version": ""})
expectMetric(t, metrics[1], 2, map[string]string{"type": "cluster", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
expectMetric(t, metrics[2], 5, map[string]string{"type": "desired", "version": "1.0.0", "image": "test/image:2", "from_version": ""})
expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "condition": "Available", "reason": "Because stuff"})
expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
expectMetric(t, metrics[5], 1, map[string]string{"type": ""})
expectMetric(t, metrics[3], 1, map[string]string{"name": "version", "version": "", "reason": "Because stuff"})
expectMetric(t, metrics[4], 1, map[string]string{"name": "version", "condition": "Available", "reason": "Because stuff"})
expectMetric(t, metrics[5], 0, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1", "from_version": ""})
expectMetric(t, metrics[6], 1, map[string]string{"type": ""})
},
},
{
Expand Down Expand Up @@ -532,11 +545,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": "openshift-install", "version": "v0.0.2", "invoker": "jane"})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "openshift-install", "version": "v0.0.2", "invoker": "jane"})
},
},
{
Expand All @@ -561,11 +576,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": "openshift-install", "version": "v0.0.2", "invoker": "jane"})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "openshift-install", "version": "v0.0.2", "invoker": "jane"})
},
},
{
Expand All @@ -584,11 +601,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": "other", "version": "v0.0.1", "invoker": "bill"})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "other", "version": "v0.0.1", "invoker": "bill"})
},
},
{
Expand All @@ -606,11 +625,13 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 2 {
if len(metrics) != 4 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[1], 1, map[string]string{"type": "openshift-install", "version": "<missing>", "invoker": "<missing>"})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[3], 1, map[string]string{"type": "openshift-install", "version": "<missing>", "invoker": "<missing>"})
},
},
{
Expand All @@ -626,10 +647,12 @@ func Test_operatorMetrics_Collect(t *testing.T) {
},
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 1 {
if len(metrics) != 3 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
expectMetric(t, metrics[0], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[1], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[2], 3, map[string]string{"type": "current", "version": "0.0.2", "image": "test/image:1"})
},
},
}
Expand Down Expand Up @@ -716,18 +739,20 @@ func Test_operatorMetrics_CollectTransitions(t *testing.T) {
eventRecorder: record.NewFakeRecorder(100),
},
wants: func(t *testing.T, metrics []prometheus.Metric) {
if len(metrics) != 5 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
sort.Slice(metrics, func(i, j int) bool {
a, b := metricParts(t, metrics[i], "name", "condition"), metricParts(t, metrics[j], "name", "condition")
return a < b
})
if len(metrics) != 7 {
t.Fatalf("Unexpected metrics %s", spew.Sdump(metrics))
}
expectMetric(t, metrics[0], 1, map[string]string{"type": ""})
expectMetric(t, metrics[1], 3, map[string]string{"name": "test", "condition": "Available"})
expectMetric(t, metrics[2], 2, map[string]string{"name": "test", "condition": "Custom"})
expectMetric(t, metrics[3], 2, map[string]string{"name": "test", "condition": "Unknown"})
expectMetric(t, metrics[4], 0, map[string]string{"type": "current", "version": "", "image": ""})
expectMetric(t, metrics[4], 0, map[string]string{"name": "version", "condition": "Available", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[5], 0, map[string]string{"name": "version", "version": "", "reason": "ClusterVersionNotFound"})
expectMetric(t, metrics[6], 0, map[string]string{"type": "current", "version": "", "image": ""})
},
},
}
Expand Down

0 comments on commit 1c99f1d

Please sign in to comment.