From 18697e82b4f733cd24ca370eb907674b2c79c228 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 11 Mar 2025 20:19:03 +0100 Subject: [PATCH 01/15] Filter out unreachable taints from tolerations --- pkg/cli/admin/mustgather/mustgather.go | 49 +++++++++++++++++++++----- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 6fd182d179..4355478e44 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -940,7 +940,45 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P } cleanedSourceDir := path.Clean(o.SourceDir) - volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) + volumeUsageChecker := fmt.Sprintf( + volumeUsageCheckerScript, + cleanedSourceDir, + cleanedSourceDir, + o.VolumePercentage, + o.VolumePercentage, + executedCommand + ) + + excludedTaints := []corev1.Taint{ + {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, + {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoSchedule}, + } + + var tolerations []corev1.Toleration + if node == "" && hasMaster { + tolerations = append(tolerations, corev1.Toleration{ + Key: "node-role.kubernetes.io/master", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }) + } + + tolerations = append(tolerations, corev1.Toleration{ + Key: "node.kubernetes.io/not-ready", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + }) + + filteredTolerations := make([]corev1.Toleration, 0) +TolerationLoop: + for _, tol := range tolerations { + for _, excluded := range excludedTaints { + if tol.ToleratesTaint(&excluded) { + continue TolerationLoop + } + } + filteredTolerations = append(filteredTolerations, tol) + } ret := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ @@ -1016,14 +1054,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P HostNetwork: o.HostNetwork, NodeSelector: nodeSelector, TerminationGracePeriodSeconds: &zero, - Tolerations: []corev1.Toleration{ - { - // An empty key with operator Exists matches all keys, - // values and effects which means this will tolerate everything. - // As noted in https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ - Operator: "Exists", - }, - }, + Tolerations: filteredTolerations, }, } if o.HostNetwork { From f41482b42f0e0fa518b060093d0f92d34f1f5015 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Wed, 2 Apr 2025 21:32:02 +0200 Subject: [PATCH 02/15] fixing syntax issue on line 949 --- pkg/cli/admin/mustgather/mustgather.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 4355478e44..6c3ea266fc 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -942,12 +942,11 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P cleanedSourceDir := path.Clean(o.SourceDir) volumeUsageChecker := fmt.Sprintf( volumeUsageCheckerScript, - cleanedSourceDir, - cleanedSourceDir, - o.VolumePercentage, - o.VolumePercentage, + cleanedSourceDir, + o.VolumePercentage, executedCommand ) + ) excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, @@ -1115,4 +1114,4 @@ func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) { fmt.Fprintf(o.ErrOut, "error running backup collection: %v\n", err) return } -} +} \ No newline at end of file From cd6a9c3ee01fd8cee7257c4f2ed8ac3636a9e73f Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Wed, 2 Apr 2025 22:25:03 +0200 Subject: [PATCH 03/15] fixing syntax issue on line 949 --- pkg/cli/admin/mustgather/mustgather.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 6c3ea266fc..18ae7e089a 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -946,7 +946,6 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P o.VolumePercentage, executedCommand ) - ) excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, From 96859e515902f4b0d96f64f5492002d43def4bfe Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Thu, 3 Apr 2025 13:05:49 +0200 Subject: [PATCH 04/15] sintax fix --- pkg/cli/admin/mustgather/mustgather.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 18ae7e089a..84b0645d58 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -941,10 +941,10 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P cleanedSourceDir := path.Clean(o.SourceDir) volumeUsageChecker := fmt.Sprintf( - volumeUsageCheckerScript, + volumeUsageCheckerScript, cleanedSourceDir, o.VolumePercentage, - executedCommand + executedCommand, ) excludedTaints := []corev1.Taint{ @@ -1113,4 +1113,4 @@ func (o *MustGatherOptions) BackupGathering(ctx context.Context, errs []error) { fmt.Fprintf(o.ErrOut, "error running backup collection: %v\n", err) return } -} \ No newline at end of file +} From dd2eb43decf8ecf2bf71f8c0e785910734da3bef Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Thu, 3 Apr 2025 15:00:17 +0200 Subject: [PATCH 05/15] defining constat --- pkg/cli/admin/mustgather/mustgather.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 84b0645d58..5ef298e8f1 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -51,6 +51,8 @@ import ( const ( gatherContainerName = "gather" + unreachableTaintKey = "node.kubernetes.io/unreachable" + ) var ( From 2849b326dbdcca202fd969018dd5d33f89390efe Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Thu, 3 Apr 2025 16:58:09 +0200 Subject: [PATCH 06/15] revert initial change --- pkg/cli/admin/mustgather/mustgather.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 5ef298e8f1..a5973d8c6e 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -942,13 +942,9 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P } cleanedSourceDir := path.Clean(o.SourceDir) - volumeUsageChecker := fmt.Sprintf( - volumeUsageCheckerScript, - cleanedSourceDir, - o.VolumePercentage, - executedCommand, - ) - + volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) + //fmt.Sprintf(volumeUsageCheckerScript,cleanedSourceDir,o.VolumePercentage,executedCommand,) + excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoSchedule}, From b0502cae1215d8615f2413c8870f4192f904bcee Mon Sep 17 00:00:00 2001 From: Mihai Idu Date: Tue, 15 Apr 2025 22:24:47 +0200 Subject: [PATCH 07/15] Update mustgather.go remove commented line 946 --- pkg/cli/admin/mustgather/mustgather.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index a5973d8c6e..3a3a779016 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -943,7 +943,6 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P cleanedSourceDir := path.Clean(o.SourceDir) volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) - //fmt.Sprintf(volumeUsageCheckerScript,cleanedSourceDir,o.VolumePercentage,executedCommand,) excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, From 365fb3a6f3ca97cdfa687da01a28121c19b11589 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 27 May 2025 10:38:01 +0200 Subject: [PATCH 08/15] addressing PR comments --- pkg/cli/admin/mustgather/mustgather.go | 29 +++++++++++++++----------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 656daeb578..a77331b938 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -100,6 +100,20 @@ sleep 5 done` ) +var ( + tolerationNotReady = corev1.Toleration{ + Key: "node.kubernetes.io/not-ready", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + } + + tolerationMasterNoSchedule = corev1.Toleration{ + Key: "node-role.kubernetes.io/master", + Operator: corev1.TolerationOpExists, + Effect: corev1.TaintEffectNoSchedule, + } +) + const ( // number of concurrent must-gather Pods to run if --all-images or multiple --image are provided concurrentMG = 4 @@ -949,21 +963,12 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoSchedule}, } - var tolerations []corev1.Toleration + tolerations := []corev1.Toleration{tolerationNotReady} + if node == "" && hasMaster { - tolerations = append(tolerations, corev1.Toleration{ - Key: "node-role.kubernetes.io/master", - Operator: corev1.TolerationOpExists, - Effect: corev1.TaintEffectNoSchedule, - }) + tolerations = append(tolerations, tolerationMasterNoSchedule) } - tolerations = append(tolerations, corev1.Toleration{ - Key: "node.kubernetes.io/not-ready", - Operator: corev1.TolerationOpExists, - Effect: corev1.TaintEffectNoSchedule, - }) - filteredTolerations := make([]corev1.Toleration, 0) TolerationLoop: for _, tol := range tolerations { From 93d98b5e9b2733b137ef371de5fbefd03eb8353e Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 27 May 2025 15:15:02 +0200 Subject: [PATCH 09/15] adding node message scheduling --- pkg/cli/admin/mustgather/mustgather.go | 46 +++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index a77331b938..2c7ecfc798 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -51,8 +51,7 @@ import ( const ( gatherContainerName = "gather" - unreachableTaintKey = "node.kubernetes.io/unreachable" - + unreachableTaintKey = "node.kubernetes.io/unreachable" ) var ( @@ -483,6 +482,42 @@ func (o *MustGatherOptions) Run() error { if err != nil { return err } + if o.NodeName != "" { + node, err := o.Client.CoreV1().Nodes().Get(context.TODO(), o.NodeName, metav1.GetOptions{}) + if err == nil { + for _, taint := range node.Spec.Taints { + if taint.Key == "node.kubernetes.io/unreachable" { + o.log("WARNING: The must-gather pod is scheduled to node '%s', which is tainted with 'node.kubernetes.io/unreachable'. This may cause the pod to get stuck.", o.NodeName) + + // Suggest alternative nodes without this taint + var suggestions []string + for _, n := range nodes.Items { + if n.Name == o.NodeName { + continue + } + unreachable := false + for _, t := range n.Spec.Taints { + if t.Key == "node.kubernetes.io/unreachable" { + unreachable = true + break + } + } + if !unreachable { + suggestions = append(suggestions, n.Name) + } + } + + if len(suggestions) > 0 { + o.log("Consider using one of these nodes instead: %s", strings.Join(suggestions, ", ")) + } else { + o.log("No alternative nodes without the 'node.kubernetes.io/unreachable' taint were found.") + } + break + } + } + } + } + // ... check if we have any master nodes ... var hasMaster bool for _, node := range nodes.Items { if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok { @@ -957,7 +992,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P cleanedSourceDir := path.Clean(o.SourceDir) volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) - + excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoSchedule}, @@ -969,7 +1004,10 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P tolerations = append(tolerations, tolerationMasterNoSchedule) } - filteredTolerations := make([]corev1.Toleration, 0) + // This filters tolerations against excluded taints. + // Currently, no matches are possible with hardcoded values, + // but this logic supports future dynamic exclusions (e.g., user-provided taints). + filteredTolerations := make([]corev1.Toleration, 0, len(tolerations)) TolerationLoop: for _, tol := range tolerations { for _, excluded := range excludedTaints { From d27e8396b1fa55fa152e26bf5aaa8de7c8f8b677 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 27 May 2025 18:19:00 +0200 Subject: [PATCH 10/15] dynamically prefer healthy nodes for must-gather; warn if using tainted nodes - Prioritize nodes without 'unreachable' or 'not-ready' taints and with recent heartbeats - Fallback to tainted nodes if no healthy nodes exist - Print non-blocking warnings when scheduling to problematic nodes - All logic is derived dynamically from cluster state --- pkg/cli/admin/mustgather/mustgather.go | 82 +++++++++++++++++--------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index 2c7ecfc798..e0648d1b9b 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -417,6 +417,41 @@ func (o *MustGatherOptions) Validate() error { return nil } +// prioritizeHealthyNodes returns a preferred node to run the must-gather pod on, and a fallback node if no preferred node is found. +func prioritizeHealthyNodes(nodes *corev1.NodeList) (preferred *corev1.Node, fallback *corev1.Node) { + var fallbackNode *corev1.Node + var latestHeartbeat time.Time + + for _, node := range nodes.Items { + var hasUnhealthyTaint bool + for _, taint := range node.Spec.Taints { + if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" { + hasUnhealthyTaint = true + break + } + } + + // Look at heartbeat time for readiness hint + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + // Check if heartbeat is recent (less than 2m old) + if time.Since(cond.LastHeartbeatTime.Time) < 2*time.Minute { + if !hasUnhealthyTaint { + return &node, nil // return immediately on good candidate + } + } + break + } + } + + if fallbackNode == nil || node.CreationTimestamp.Time.After(latestHeartbeat) { + fallbackNode = &node + latestHeartbeat = node.CreationTimestamp.Time + } + } + + return nil, fallbackNode +} // Run creates and runs a must-gather pod func (o *MustGatherOptions) Run() error { @@ -482,42 +517,33 @@ func (o *MustGatherOptions) Run() error { if err != nil { return err } - if o.NodeName != "" { - node, err := o.Client.CoreV1().Nodes().Get(context.TODO(), o.NodeName, metav1.GetOptions{}) - if err == nil { + + if o.NodeName == "" { + preferred, fallback := prioritizeHealthyNodes(nodes) + if preferred != nil { + o.NodeName = preferred.Name + } else if fallback != nil { + o.NodeName = fallback.Name + o.log("WARNING: No healthy node was available. must-gather will run on tainted node '%s'. This may cause the pod to get stuck.", o.NodeName) + } + } else { + for _, node := range nodes.Items { + if node.Name != o.NodeName { + continue + } for _, taint := range node.Spec.Taints { if taint.Key == "node.kubernetes.io/unreachable" { o.log("WARNING: The must-gather pod is scheduled to node '%s', which is tainted with 'node.kubernetes.io/unreachable'. This may cause the pod to get stuck.", o.NodeName) - - // Suggest alternative nodes without this taint - var suggestions []string - for _, n := range nodes.Items { - if n.Name == o.NodeName { - continue - } - unreachable := false - for _, t := range n.Spec.Taints { - if t.Key == "node.kubernetes.io/unreachable" { - unreachable = true - break - } - } - if !unreachable { - suggestions = append(suggestions, n.Name) - } - } - - if len(suggestions) > 0 { - o.log("Consider using one of these nodes instead: %s", strings.Join(suggestions, ", ")) - } else { - o.log("No alternative nodes without the 'node.kubernetes.io/unreachable' taint were found.") - } break } } + break } } - // ... check if we have any master nodes ... + + if err != nil { + return err + } var hasMaster bool for _, node := range nodes.Items { if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok { From 6c551a33bd4c622a943f41f5452af4a27c0ed186 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 27 May 2025 21:48:10 +0200 Subject: [PATCH 11/15] pass verify --- pkg/cli/admin/mustgather/mustgather.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index e0648d1b9b..4e437fd1f7 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -417,6 +417,7 @@ func (o *MustGatherOptions) Validate() error { return nil } + // prioritizeHealthyNodes returns a preferred node to run the must-gather pod on, and a fallback node if no preferred node is found. func prioritizeHealthyNodes(nodes *corev1.NodeList) (preferred *corev1.Node, fallback *corev1.Node) { var fallbackNode *corev1.Node From d4b12675269f67ccd48d8e7af1be42b1f97bfd99 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Mon, 16 Jun 2025 12:44:43 +0200 Subject: [PATCH 12/15] fixing reference variables --- pkg/cli/admin/mustgather/mustgather.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index c801f46d48..b260a74754 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -52,6 +52,7 @@ import ( const ( gatherContainerName = "gather" unreachableTaintKey = "node.kubernetes.io/unreachable" + notReadyTaintKey = "node.kubernetes.io/not-ready" ) var ( @@ -424,7 +425,7 @@ func prioritizeHealthyNodes(nodes *corev1.NodeList) (preferred *corev1.Node, fal for _, node := range nodes.Items { var hasUnhealthyTaint bool for _, taint := range node.Spec.Taints { - if taint.Key == "node.kubernetes.io/unreachable" || taint.Key == "node.kubernetes.io/not-ready" { + if taint.Key == unreachableTaintKey || taint.Key == notReadyTaintKey { hasUnhealthyTaint = true break } @@ -531,7 +532,7 @@ func (o *MustGatherOptions) Run() error { continue } for _, taint := range node.Spec.Taints { - if taint.Key == "node.kubernetes.io/unreachable" { + if taint.Key == unreachableTaintKey { o.log("WARNING: The must-gather pod is scheduled to node '%s', which is tainted with 'node.kubernetes.io/unreachable'. This may cause the pod to get stuck.", o.NodeName) break } From 8d010710b43676ae76c5dce5ea419f9aae1ee41e Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Mon, 16 Jun 2025 17:01:49 +0200 Subject: [PATCH 13/15] reverted fallback mechanism --- pkg/cli/admin/mustgather/mustgather.go | 48 +++++++------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index b260a74754..fa1aef99aa 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -518,29 +518,6 @@ func (o *MustGatherOptions) Run() error { return err } - if o.NodeName == "" { - preferred, fallback := prioritizeHealthyNodes(nodes) - if preferred != nil { - o.NodeName = preferred.Name - } else if fallback != nil { - o.NodeName = fallback.Name - o.log("WARNING: No healthy node was available. must-gather will run on tainted node '%s'. This may cause the pod to get stuck.", o.NodeName) - } - } else { - for _, node := range nodes.Items { - if node.Name != o.NodeName { - continue - } - for _, taint := range node.Spec.Taints { - if taint.Key == unreachableTaintKey { - o.log("WARNING: The must-gather pod is scheduled to node '%s', which is tainted with 'node.kubernetes.io/unreachable'. This may cause the pod to get stuck.", o.NodeName) - break - } - } - break - } - } - if err != nil { return err } @@ -1019,31 +996,32 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P cleanedSourceDir := path.Clean(o.SourceDir) volumeUsageChecker := fmt.Sprintf(volumeUsageCheckerScript, cleanedSourceDir, o.VolumePercentage, o.VolumePercentage, executedCommand) + // Define taints we do not want to tolerate (can be extended with user input in the future) excludedTaints := []corev1.Taint{ {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoExecute}, - {Key: unreachableTaintKey, Effect: corev1.TaintEffectNoSchedule}, + {Key: notReadyTaintKey, Effect: corev1.TaintEffectNoSchedule}, } - tolerations := []corev1.Toleration{tolerationNotReady} - + // Define candidate tolerations we may want to apply + candidateTolerations := []corev1.Toleration{tolerationNotReady} if node == "" && hasMaster { - tolerations = append(tolerations, tolerationMasterNoSchedule) + candidateTolerations = append(candidateTolerations, tolerationMasterNoSchedule) } - // This filters tolerations against excluded taints. - // Currently, no matches are possible with hardcoded values, - // but this logic supports future dynamic exclusions (e.g., user-provided taints). - filteredTolerations := make([]corev1.Toleration, 0, len(tolerations)) -TolerationLoop: - for _, tol := range tolerations { + // Build the toleration list by only adding tolerations that do NOT tolerate excluded taints + filteredTolerations := make([]corev1.Toleration, 0, len(candidateTolerations)) + TolerationLoop: + for _, tol := range candidateTolerations { for _, excluded := range excludedTaints { if tol.ToleratesTaint(&excluded) { + // Skip this toleration if it tolerates an excluded taint continue TolerationLoop } } filteredTolerations = append(filteredTolerations, tol) } + ret := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "must-gather-", @@ -1118,7 +1096,7 @@ TolerationLoop: HostNetwork: o.HostNetwork, NodeSelector: nodeSelector, TerminationGracePeriodSeconds: &zero, - Tolerations: filteredTolerations, + Tolerations: filteredTolerations, }, } if o.HostNetwork { @@ -1199,4 +1177,4 @@ func runInspect(streams genericiooptions.IOStreams, config *rest.Config, destDir return fmt.Errorf("error running backup collection: %w", err) } return nil -} +} \ No newline at end of file From 516c4a69f1d5ef5bcce85d5c0f06000554d63247 Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Tue, 17 Jun 2025 14:50:53 +0200 Subject: [PATCH 14/15] passing tests and validations --- pkg/cli/admin/mustgather/mustgather.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index fa1aef99aa..4ca683acb4 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -52,7 +52,7 @@ import ( const ( gatherContainerName = "gather" unreachableTaintKey = "node.kubernetes.io/unreachable" - notReadyTaintKey = "node.kubernetes.io/not-ready" + notReadyTaintKey = "node.kubernetes.io/not-ready" ) var ( @@ -1010,7 +1010,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P // Build the toleration list by only adding tolerations that do NOT tolerate excluded taints filteredTolerations := make([]corev1.Toleration, 0, len(candidateTolerations)) - TolerationLoop: +TolerationLoop: for _, tol := range candidateTolerations { for _, excluded := range excludedTaints { if tol.ToleratesTaint(&excluded) { @@ -1021,7 +1021,6 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P filteredTolerations = append(filteredTolerations, tol) } - ret := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ GenerateName: "must-gather-", @@ -1096,7 +1095,7 @@ func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.P HostNetwork: o.HostNetwork, NodeSelector: nodeSelector, TerminationGracePeriodSeconds: &zero, - Tolerations: filteredTolerations, + Tolerations: filteredTolerations, }, } if o.HostNetwork { @@ -1177,4 +1176,4 @@ func runInspect(streams genericiooptions.IOStreams, config *rest.Config, destDir return fmt.Errorf("error running backup collection: %w", err) } return nil -} \ No newline at end of file +} From 5a0f363d0d58f3956365cd29f8f936296320477a Mon Sep 17 00:00:00 2001 From: Mihai IDU Date: Fri, 4 Jul 2025 22:31:39 +0200 Subject: [PATCH 15/15] Add nodeAffinity-based scheduling with fallback --- pkg/cli/admin/mustgather/mustgather.go | 90 ++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 12 deletions(-) diff --git a/pkg/cli/admin/mustgather/mustgather.go b/pkg/cli/admin/mustgather/mustgather.go index dd39414bad..45d784721e 100644 --- a/pkg/cli/admin/mustgather/mustgather.go +++ b/pkg/cli/admin/mustgather/mustgather.go @@ -115,6 +115,30 @@ var ( } ) +// buildNodeSelector builds a node selector from the provided node names. +func buildNodeAffinity(nodeNames []string) *corev1.Affinity { + if len(nodeNames) == 0 { + return nil + } + return &corev1.Affinity{ + NodeAffinity: &corev1.NodeAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{ + { + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "kubernetes.io/hostname", + Operator: corev1.NodeSelectorOpIn, + Values: nodeNames, + }, + }, + }, + }, + }, + }, + } +} + const ( // number of concurrent must-gather Pods to run if --all-images or multiple --image are provided concurrentMG = 4 @@ -512,19 +536,52 @@ func (o *MustGatherOptions) Run() error { defer cleanupNamespace() } - // Prefer to run in master if there's any but don't be explicit otherwise. - // This enables the command to run by default in hypershift where there's no masters. - nodes, err := o.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ - LabelSelector: o.NodeSelector, - }) + nodes, err := o.Client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{}) if err != nil { return err } - if err != nil { - return err + var cpNodes, workerNodes, unschedulableNodes []corev1.Node + for _, node := range nodes.Items { + isReady := false + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + isReady = true + break + } + } + if !isReady { + continue + } + if node.Spec.Unschedulable { + unschedulableNodes = append(unschedulableNodes, node) + continue + } + if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok { + cpNodes = append(cpNodes, node) + } else { + workerNodes = append(workerNodes, node) + } + } + + selectedNodes := cpNodes + if len(selectedNodes) == 0 { + selectedNodes = workerNodes + } + if len(selectedNodes) == 0 { + selectedNodes = unschedulableNodes } - var hasMaster bool + + var nodeNames []string + for _, n := range selectedNodes { + nodeNames = append(nodeNames, n.Name) + } + + affinity := buildNodeAffinity(nodeNames) + // If we have no nodes, we cannot run the must-gather pod. + + // Determine if there is a master node + hasMaster := false for _, node := range nodes.Items { if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok { hasMaster = true @@ -553,7 +610,7 @@ func (o *MustGatherOptions) Run() error { return err } for _, node := range nodes.Items { - pods = append(pods, o.newPod(node.Name, image, hasMaster)) + pods = append(pods, o.newPod(node.Name, image, hasMaster, affinity)) } } else { if o.NodeName != "" { @@ -563,7 +620,7 @@ func (o *MustGatherOptions) Run() error { return err } } - pods = append(pods, o.newPod(o.NodeName, image, hasMaster)) + pods = append(pods, o.newPod(o.NodeName, image, hasMaster, affinity)) } } @@ -981,7 +1038,8 @@ func newClusterRoleBinding(ns *corev1.Namespace) *rbacv1.ClusterRoleBinding { // newPod creates a pod with 2 containers with a shared volume mount: // - gather: init containers that run gather command // - copy: no-op container we can exec into -func (o *MustGatherOptions) newPod(node, image string, hasMaster bool) *corev1.Pod { +func (o *MustGatherOptions) newPod(node, image string, hasMaster bool, affinity *corev1.Affinity) *corev1.Pod { + zero := int64(0) nodeSelector := map[string]string{ @@ -1098,7 +1156,15 @@ TolerationLoop: HostNetwork: o.HostNetwork, NodeSelector: nodeSelector, TerminationGracePeriodSeconds: &zero, - Tolerations: filteredTolerations, + Tolerations: []corev1.Toleration{ + { + // An empty key with operator Exists matches all keys, + // values and effects which means this will tolerate everything. + // As noted in https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/ + Operator: "Exists", + }, + }, + Affinity: affinity, }, } if o.HostNetwork {