Merge pull request #1736 from petr-muller/ota-1174-details-health

OTA-1174: `upgrade status`: Expand health insights with `--detailed=health`
openshift · Apr 19, 2024 · 17c015a
2 parents b3dfe36 + 31b1c91
commit 17c015a
Show file tree

Hide file tree

Showing 15 changed files with 493 additions and 144 deletions.
diff --git a/pkg/cli/admin/upgrade/status/controlplane.go b/pkg/cli/admin/upgrade/status/controlplane.go
@@ -48,10 +48,16 @@ func coInsights(name string, available v1.ClusterOperatorStatusCondition, degrad
 			startedAt: available.LastTransitionTime.Time,
 			scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: name}}},
 			impact: updateInsightImpact{
-				level:      warningImpactLevel,
-				impactType: apiAvailabilityImpactType,
-				summary:    fmt.Sprintf("Cluster Operator %s is unavailable | %s: %s", name, available.Reason, strings.ReplaceAll(available.Message, "\n", ` // `)),
+				level:       warningImpactLevel,
+				impactType:  apiAvailabilityImpactType,
+				summary:     fmt.Sprintf("Cluster Operator %s is unavailable (%s)", name, available.Reason),
+				description: available.Message,
 			},
+			remediation: updateInsightRemediation{reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md"},
+		}
+		if available.Message == "" {
+			// Backfill the description if CO doesn't provide one
+			insight.impact.description = "<no message>"
 		}
 		if evaluated.After(available.LastTransitionTime.Time.Add(unavailableErrorThreshold)) {
 			insight.impact.level = errorImpactLevel
@@ -63,10 +69,16 @@ func coInsights(name string, available v1.ClusterOperatorStatusCondition, degrad
 			startedAt: degraded.LastTransitionTime.Time,
 			scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: name}}},
 			impact: updateInsightImpact{
-				level:      warningImpactLevel,
-				impactType: apiAvailabilityImpactType,
-				summary:    fmt.Sprintf("Cluster Operator %s is degraded | %s: %s", name, degraded.Reason, strings.ReplaceAll(degraded.Message, "\n", ` // `)),
+				level:       warningImpactLevel,
+				impactType:  apiAvailabilityImpactType,
+				summary:     fmt.Sprintf("Cluster Operator %s is degraded (%s)", name, degraded.Reason),
+				description: degraded.Message,
 			},
+			remediation: updateInsightRemediation{reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md"},
+		}
+		if degraded.Message == "" {
+			// Backfill the description if CO doesn't provide one
+			insight.impact.description = "<no message>"
 		}
 		if evaluated.After(degraded.LastTransitionTime.Time.Add(degradedErrorThreshold)) {
 			insight.impact.level = errorImpactLevel

diff --git a/pkg/cli/admin/upgrade/status/controlplane_test.go b/pkg/cli/admin/upgrade/status/controlplane_test.go
@@ -129,7 +129,14 @@ var cvFixture = configv1.ClusterVersion{
 	},
 }
 
-var allowUnexportedInsightStructs = cmp.AllowUnexported(updateInsight{}, updateInsightScope{}, scopeResource{}, updateInsightImpact{})
+var allowUnexportedInsightStructs = cmp.AllowUnexported(
+	updateInsight{},
+	updateInsightScope{},
+	scopeResource{},
+	updateInsightImpact{},
+	updateInsightRemediation{},
+	updateHealthData{},
+)
 
 func TestAssessControlPlaneStatus_Operators(t *testing.T) {
 	testCases := []struct {
@@ -404,18 +411,26 @@ func TestCoInsights(t *testing.T) {
 					startedAt: anchorTime.Add(-unavailableWarningThreshold).Add(-time.Second),
 					scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
 					impact: updateInsightImpact{
-						level:      warningImpactLevel,
-						impactType: apiAvailabilityImpactType,
-						summary:    "Cluster Operator testOperator is unavailable | Broken: Operator is broken",
+						level:       warningImpactLevel,
+						impactType:  apiAvailabilityImpactType,
+						summary:     "Cluster Operator testOperator is unavailable (Broken)",
+						description: "Operator is broken",
+					},
+					remediation: updateInsightRemediation{
+						reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
 					},
 				},
 				{
 					startedAt: anchorTime.Add(-degradedWarningThreshold).Add(-time.Second),
 					scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
 					impact: updateInsightImpact{
-						level:      warningImpactLevel,
-						impactType: apiAvailabilityImpactType,
-						summary:    "Cluster Operator testOperator is degraded | Slow: Networking is hard",
+						level:       warningImpactLevel,
+						impactType:  apiAvailabilityImpactType,
+						summary:     "Cluster Operator testOperator is degraded (Slow)",
+						description: "Networking is hard",
+					},
+					remediation: updateInsightRemediation{
+						reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md",
 					},
 				},
 			},
@@ -441,24 +456,32 @@ func TestCoInsights(t *testing.T) {
 					startedAt: anchorTime.Add(-unavailableErrorThreshold).Add(-time.Second),
 					scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
 					impact: updateInsightImpact{
-						level:      errorImpactLevel,
-						impactType: apiAvailabilityImpactType,
-						summary:    "Cluster Operator testOperator is unavailable | Broken: Operator is broken",
+						level:       errorImpactLevel,
+						impactType:  apiAvailabilityImpactType,
+						summary:     "Cluster Operator testOperator is unavailable (Broken)",
+						description: "Operator is broken",
+					},
+					remediation: updateInsightRemediation{
+						reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
 					},
 				},
 				{
 					startedAt: anchorTime.Add(-degradedErrorThreshold).Add(-time.Second),
 					scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
 					impact: updateInsightImpact{
-						level:      errorImpactLevel,
-						impactType: apiAvailabilityImpactType,
-						summary:    "Cluster Operator testOperator is degraded | Slow: Networking is hard",
+						level:       errorImpactLevel,
+						impactType:  apiAvailabilityImpactType,
+						summary:     "Cluster Operator testOperator is degraded (Slow)",
+						description: "Networking is hard",
+					},
+					remediation: updateInsightRemediation{
+						reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md",
 					},
 				},
 			},
 		},
 		{
-			name: "insights flatten linebreaks in messages",
+			name: "insights do not flatten linebreaks in messages",
 			available: configv1.ClusterOperatorStatusCondition{
 				Type:               configv1.OperatorAvailable,
 				Status:             configv1.ConditionFalse,
@@ -475,9 +498,13 @@ func TestCoInsights(t *testing.T) {
 					startedAt: anchorTime.Add(-unavailableErrorThreshold).Add(-time.Second),
 					scope:     updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
 					impact: updateInsightImpact{
-						level:      errorImpactLevel,
-						impactType: apiAvailabilityImpactType,
-						summary:    `Cluster Operator testOperator is unavailable | Broken: Operator is broken // and message has linebreaks`,
+						level:       errorImpactLevel,
+						impactType:  apiAvailabilityImpactType,
+						summary:     `Cluster Operator testOperator is unavailable (Broken)`,
+						description: "Operator is broken\nand message has linebreaks",
+					},
+					remediation: updateInsightRemediation{
+						reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
 					},
 				},
 			},

diff --git a/pkg/cli/admin/upgrade/status/examples/4.14.1-degraded.detailed-output b/pkg/cli/admin/upgrade/status/examples/4.14.1-degraded.detailed-output
@@ -33,9 +33,39 @@ ip-10-0-4-159.us-east-2.compute.internal    Outdated     Pending   4.14.0-rc.3
 ip-10-0-99-40.us-east-2.compute.internal    Outdated     Pending   4.14.0-rc.3   ?     
 
 = Update Health =
-SINCE     LEVEL   IMPACT             MESSAGE
-58m18s    Error   API Availability   Cluster Operator kube-apiserver is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m18s    Error   API Availability   Cluster Operator kube-controller-manager is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m18s    Error   API Availability   Cluster Operator kube-scheduler is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m38s    Error   API Availability   Cluster Operator etcd is degraded | EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379"  Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379"  Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379"  Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}] // EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy // NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-1h0m17s   Error   API Availability   Cluster Operator control-plane-machine-set is unavailable | UnavailableReplicas: Missing 1 available replica(s)
+Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady)
+  Since:       58m18s
+  Level:       Error
+  Impact:      API Availability
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
+  Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
+
+Message: Cluster Operator kube-controller-manager is degraded (NodeController_MasterNodesReady)
+  Since:       58m18s
+  Level:       Error
+  Impact:      API Availability
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
+  Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
+
+Message: Cluster Operator kube-scheduler is degraded (NodeController_MasterNodesReady)
+  Since:       58m18s
+  Level:       Error
+  Impact:      API Availability
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
+  Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
+
+Message: Cluster Operator etcd is degraded (EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady)
+  Since:       58m38s
+  Level:       Error
+  Impact:      API Availability
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
+  Description: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379"  Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379"  Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379"  Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}]
+               , EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy
+               , NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
+
+Message: Cluster Operator control-plane-machine-set is unavailable (UnavailableReplicas)
+  Since:       1h0m17s
+  Level:       Error
+  Impact:      API Availability
+  Reference:   https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md
+  Description: Missing 1 available replica(s)
diff --git a/pkg/cli/admin/upgrade/status/examples/4.14.1-degraded.output b/pkg/cli/admin/upgrade/status/examples/4.14.1-degraded.output
@@ -34,8 +34,8 @@ ip-10-0-99-40.us-east-2.compute.internal    Outdated     Pending   4.14.0-rc.3
 
 = Update Health =
 SINCE     LEVEL   IMPACT             MESSAGE
-58m18s    Error   API Availability   Cluster Operator kube-apiserver is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m18s    Error   API Availability   Cluster Operator kube-controller-manager is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m18s    Error   API Availability   Cluster Operator kube-scheduler is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-58m38s    Error   API Availability   Cluster Operator etcd is degraded | EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379"  Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379"  Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379"  Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}] // EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy // NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
-1h0m17s   Error   API Availability   Cluster Operator control-plane-machine-set is unavailable | UnavailableReplicas: Missing 1 available replica(s)
+58m18s    Error   API Availability   Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady)
+58m18s    Error   API Availability   Cluster Operator kube-controller-manager is degraded (NodeController_MasterNodesReady)
+58m18s    Error   API Availability   Cluster Operator kube-scheduler is degraded (NodeController_MasterNodesReady)
+58m38s    Error   API Availability   Cluster Operator etcd is degraded (EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady)
+1h0m17s   Error   API Availability   Cluster Operator control-plane-machine-set is unavailable (UnavailableReplicas)
diff --git a/pkg/cli/admin/upgrade/status/examples/4.14.1-paused-worker-pool.detailed-output b/pkg/cli/admin/upgrade/status/examples/4.14.1-paused-worker-pool.detailed-output
@@ -27,5 +27,9 @@ ip-10-0-4-159.us-east-2.compute.internal    Excluded     Paused   4.14.0    -
 ip-10-0-99-40.us-east-2.compute.internal    Excluded     Paused   4.14.0    -     
 
 = Update Health =
-SINCE   LEVEL     IMPACT           MESSAGE
--       Warning   Update Stalled   Worker pool worker is paused | Outdated nodes in a paused pool will not be updated.
+Message: Outdated nodes in a paused pool 'worker' will not be updated
+  Since:       -
+  Level:       Warning
+  Impact:      Update Stalled
+  Reference:   https://docs.openshift.com/container-platform/latest/support/troubleshooting/troubleshooting-operator-issues.html#troubleshooting-disabling-autoreboot-mco_troubleshooting-operator-issues
+  Description: Pool is paused, which stops all changes to the nodes in the pool, including updates. The nodes will not be updated until the pool is unpaused by the administrator.
diff --git a/pkg/cli/admin/upgrade/status/examples/4.14.1-paused-worker-pool.output b/pkg/cli/admin/upgrade/status/examples/4.14.1-paused-worker-pool.output
@@ -28,4 +28,4 @@ ip-10-0-99-40.us-east-2.compute.internal    Excluded     Paused   4.14.0    -
 
 = Update Health =
 SINCE   LEVEL     IMPACT           MESSAGE
--       Warning   Update Stalled   Worker pool worker is paused | Outdated nodes in a paused pool will not be updated.
+-       Warning   Update Stalled   Outdated nodes in a paused pool 'worker' will not be updated