Skip to content

Commit

Permalink
Merge pull request #1736 from petr-muller/ota-1174-details-health
Browse files: browse the repository at this point in the history
OTA-1174: `upgrade status`: Expand health insights with `--detailed=health`
  • Loading branch information
openshift-merge-bot[bot] committed Apr 19, 2024
2 parents b3dfe36 + 31b1c91 commit 17c015a
Show file tree
Hide file tree
Showing 15 changed files with 493 additions and 144 deletions.
24 changes: 18 additions & 6 deletions pkg/cli/admin/upgrade/status/controlplane.go
Expand Up @@ -48,10 +48,16 @@ func coInsights(name string, available v1.ClusterOperatorStatusCondition, degrad
startedAt: available.LastTransitionTime.Time,
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: name}}},
impact: updateInsightImpact{
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: fmt.Sprintf("Cluster Operator %s is unavailable | %s: %s", name, available.Reason, strings.ReplaceAll(available.Message, "\n", ` // `)),
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: fmt.Sprintf("Cluster Operator %s is unavailable (%s)", name, available.Reason),
description: available.Message,
},
remediation: updateInsightRemediation{reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md"},
}
if available.Message == "" {
// Backfill the description if CO doesn't provide one
insight.impact.description = "<no message>"
}
if evaluated.After(available.LastTransitionTime.Time.Add(unavailableErrorThreshold)) {
insight.impact.level = errorImpactLevel
Expand All @@ -63,10 +69,16 @@ func coInsights(name string, available v1.ClusterOperatorStatusCondition, degrad
startedAt: degraded.LastTransitionTime.Time,
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: name}}},
impact: updateInsightImpact{
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: fmt.Sprintf("Cluster Operator %s is degraded | %s: %s", name, degraded.Reason, strings.ReplaceAll(degraded.Message, "\n", ` // `)),
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: fmt.Sprintf("Cluster Operator %s is degraded (%s)", name, degraded.Reason),
description: degraded.Message,
},
remediation: updateInsightRemediation{reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md"},
}
if degraded.Message == "" {
// Backfill the description if CO doesn't provide one
insight.impact.description = "<no message>"
}
if evaluated.After(degraded.LastTransitionTime.Time.Add(degradedErrorThreshold)) {
insight.impact.level = errorImpactLevel
Expand Down
61 changes: 44 additions & 17 deletions pkg/cli/admin/upgrade/status/controlplane_test.go
Expand Up @@ -129,7 +129,14 @@ var cvFixture = configv1.ClusterVersion{
},
}

var allowUnexportedInsightStructs = cmp.AllowUnexported(updateInsight{}, updateInsightScope{}, scopeResource{}, updateInsightImpact{})
var allowUnexportedInsightStructs = cmp.AllowUnexported(
updateInsight{},
updateInsightScope{},
scopeResource{},
updateInsightImpact{},
updateInsightRemediation{},
updateHealthData{},
)

func TestAssessControlPlaneStatus_Operators(t *testing.T) {
testCases := []struct {
Expand Down Expand Up @@ -404,18 +411,26 @@ func TestCoInsights(t *testing.T) {
startedAt: anchorTime.Add(-unavailableWarningThreshold).Add(-time.Second),
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
impact: updateInsightImpact{
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is unavailable | Broken: Operator is broken",
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is unavailable (Broken)",
description: "Operator is broken",
},
remediation: updateInsightRemediation{
reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
},
},
{
startedAt: anchorTime.Add(-degradedWarningThreshold).Add(-time.Second),
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
impact: updateInsightImpact{
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is degraded | Slow: Networking is hard",
level: warningImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is degraded (Slow)",
description: "Networking is hard",
},
remediation: updateInsightRemediation{
reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md",
},
},
},
Expand All @@ -441,24 +456,32 @@ func TestCoInsights(t *testing.T) {
startedAt: anchorTime.Add(-unavailableErrorThreshold).Add(-time.Second),
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
impact: updateInsightImpact{
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is unavailable | Broken: Operator is broken",
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is unavailable (Broken)",
description: "Operator is broken",
},
remediation: updateInsightRemediation{
reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
},
},
{
startedAt: anchorTime.Add(-degradedErrorThreshold).Add(-time.Second),
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
impact: updateInsightImpact{
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is degraded | Slow: Networking is hard",
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: "Cluster Operator testOperator is degraded (Slow)",
description: "Networking is hard",
},
remediation: updateInsightRemediation{
reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md",
},
},
},
},
{
name: "insights flatten linebreaks in messages",
name: "insights do not flatten linebreaks in messages",
available: configv1.ClusterOperatorStatusCondition{
Type: configv1.OperatorAvailable,
Status: configv1.ConditionFalse,
Expand All @@ -475,9 +498,13 @@ func TestCoInsights(t *testing.T) {
startedAt: anchorTime.Add(-unavailableErrorThreshold).Add(-time.Second),
scope: updateInsightScope{scopeType: scopeTypeControlPlane, resources: []scopeResource{{kind: scopeKindClusterOperator, name: "testOperator"}}},
impact: updateInsightImpact{
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: `Cluster Operator testOperator is unavailable | Broken: Operator is broken // and message has linebreaks`,
level: errorImpactLevel,
impactType: apiAvailabilityImpactType,
summary: `Cluster Operator testOperator is unavailable (Broken)`,
description: "Operator is broken\nand message has linebreaks",
},
remediation: updateInsightRemediation{
reference: "https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md",
},
},
},
Expand Down
Expand Up @@ -33,9 +33,39 @@ ip-10-0-4-159.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3
ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3 ?

= Update Health =
SINCE LEVEL IMPACT MESSAGE
58m18s Error API Availability Cluster Operator kube-apiserver is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m18s Error API Availability Cluster Operator kube-controller-manager is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m18s Error API Availability Cluster Operator kube-scheduler is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m38s Error API Availability Cluster Operator etcd is degraded | EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379" Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379" Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379" Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}] // EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy // NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
1h0m17s Error API Availability Cluster Operator control-plane-machine-set is unavailable | UnavailableReplicas: Missing 1 available replica(s)
Message: Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady)
Since: 58m18s
Level: Error
Impact: API Availability
Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)

Message: Cluster Operator kube-controller-manager is degraded (NodeController_MasterNodesReady)
Since: 58m18s
Level: Error
Impact: API Availability
Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)

Message: Cluster Operator kube-scheduler is degraded (NodeController_MasterNodesReady)
Since: 58m18s
Level: Error
Impact: API Availability
Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
Description: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)

Message: Cluster Operator etcd is degraded (EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady)
Since: 58m38s
Level: Error
Impact: API Availability
Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDegraded.md
Description: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379" Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379" Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379" Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}]
, EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy
, NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)

Message: Cluster Operator control-plane-machine-set is unavailable (UnavailableReplicas)
Since: 1h0m17s
Level: Error
Impact: API Availability
Reference: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/ClusterOperatorDown.md
Description: Missing 1 available replica(s)
10 changes: 5 additions & 5 deletions pkg/cli/admin/upgrade/status/examples/4.14.1-degraded.output
Expand Up @@ -34,8 +34,8 @@ ip-10-0-99-40.us-east-2.compute.internal Outdated Pending 4.14.0-rc.3

= Update Health =
SINCE LEVEL IMPACT MESSAGE
58m18s Error API Availability Cluster Operator kube-apiserver is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m18s Error API Availability Cluster Operator kube-controller-manager is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m18s Error API Availability Cluster Operator kube-scheduler is degraded | NodeController_MasterNodesReady: NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
58m38s Error API Availability Cluster Operator etcd is degraded | EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady: EtcdEndpointsDegraded: EtcdEndpointsController can't evaluate whether quorum is safe: etcd cluster has quorum of 2 and 2 healthy members which is not fault tolerant: [{Member:ID:12895393557789359222 name:"ip-10-0-73-118.ec2.internal" peerURLs:"https://10.0.73.118:2380" clientURLs:"https://10.0.73.118:2379" Healthy:true Took:1.725492ms Error:<nil>} {Member:ID:13608765340770574953 name:"ip-10-0-0-60.ec2.internal" peerURLs:"https://10.0.0.60:2380" clientURLs:"https://10.0.0.60:2379" Healthy:true Took:1.542919ms Error:<nil>} {Member:ID:18044478200504924924 name:"ip-10-0-12-74.ec2.internal" peerURLs:"https://10.0.12.74:2380" clientURLs:"https://10.0.12.74:2379" Healthy:false Took: Error:create client failure: failed to make etcd client for endpoints [https://10.0.12.74:2379]: context deadline exceeded}] // EtcdMembersDegraded: 2 of 3 members are available, ip-10-0-12-74.ec2.internal is unhealthy // NodeControllerDegraded: The master nodes not ready: node "ip-10-0-12-74.ec2.internal" not ready since 2023-11-03 16:28:43 +0000 UTC because KubeletNotReady (container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:Network plugin returns error: No CNI configuration file in /etc/kubernetes/cni/net.d/. Has your network provider started?)
1h0m17s Error API Availability Cluster Operator control-plane-machine-set is unavailable | UnavailableReplicas: Missing 1 available replica(s)
58m18s Error API Availability Cluster Operator kube-apiserver is degraded (NodeController_MasterNodesReady)
58m18s Error API Availability Cluster Operator kube-controller-manager is degraded (NodeController_MasterNodesReady)
58m18s Error API Availability Cluster Operator kube-scheduler is degraded (NodeController_MasterNodesReady)
58m38s Error API Availability Cluster Operator etcd is degraded (EtcdEndpoints_ErrorUpdatingEtcdEndpoints::EtcdMembers_UnhealthyMembers::NodeController_MasterNodesReady)
1h0m17s Error API Availability Cluster Operator control-plane-machine-set is unavailable (UnavailableReplicas)
Expand Up @@ -27,5 +27,9 @@ ip-10-0-4-159.us-east-2.compute.internal Excluded Paused 4.14.0 -
ip-10-0-99-40.us-east-2.compute.internal Excluded Paused 4.14.0 -

= Update Health =
SINCE LEVEL IMPACT MESSAGE
- Warning Update Stalled Worker pool worker is paused | Outdated nodes in a paused pool will not be updated.
Message: Outdated nodes in a paused pool 'worker' will not be updated
Since: -
Level: Warning
Impact: Update Stalled
Reference: https://docs.openshift.com/container-platform/latest/support/troubleshooting/troubleshooting-operator-issues.html#troubleshooting-disabling-autoreboot-mco_troubleshooting-operator-issues
Description: Pool is paused, which stops all changes to the nodes in the pool, including updates. The nodes will not be updated until the pool is unpaused by the administrator.
Expand Up @@ -28,4 +28,4 @@ ip-10-0-99-40.us-east-2.compute.internal Excluded Paused 4.14.0 -

= Update Health =
SINCE LEVEL IMPACT MESSAGE
- Warning Update Stalled Worker pool worker is paused | Outdated nodes in a paused pool will not be updated.
- Warning Update Stalled Outdated nodes in a paused pool 'worker' will not be updated

0 comments on commit 17c015a

Please sign in to comment.