Commit 1f9967d

speed up e2e-ocl by deleting build pods
This also modifies the MachineConfigPool status updates to use the constants we've added for that purpose instead of non-authoritative strings.
1 parent: 034b109
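
The status-condition half of the change follows one pattern throughout: wherever a reason string such as "BuildStarted" or "BuildFailed" was hard-coded, the reason is now derived from the corresponding MachineConfigPool state constant. A minimal sketch of that pattern, not taken verbatim from this commit (the import paths, package name, and the mcfgclient parameter are assumptions):

package example

import (
	"context"

	mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/openshift/machine-config-operator/pkg/apihelpers"
	mcfgclientset "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
)

// markBuildStarted clears the ImageBuildDegraded condition using the typed
// pool-state constant as the reason instead of the old literal "BuildStarted".
func markBuildStarted(ctx context.Context, mcfgclient mcfgclientset.Interface, pool *mcfgv1.MachineConfigPool) error {
	cond := apihelpers.NewMachineConfigPoolCondition(
		mcfgv1.MachineConfigPoolImageBuildDegraded,
		corev1.ConditionFalse,
		string(mcfgv1.MachineConfigPoolBuilding), // was: "BuildStarted"
		"Build started for pool "+pool.Name,
	)
	apihelpers.SetMachineConfigPoolCondition(&pool.Status, *cond)

	_, err := mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, pool, metav1.UpdateOptions{})
	return err
}

Deriving the reason from the typed constants keeps the condition reasons in lockstep with the pool states the controllers already match on, which appears to be what the commit message means by "non-authoritative strings".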

File tree: 4 files changed (+108, -37 lines)

pkg/controller/build/reconciler.go
pkg/controller/node/status.go
test/e2e-ocl/helpers_test.go
test/e2e-ocl/onclusterlayering_test.go

pkg/controller/build/reconciler.go

Lines changed: 3 additions & 3 deletions
@@ -1634,7 +1634,7 @@ func (b *buildReconciler) initializeBuildDegradedCondition(ctx context.Context,
 		return err
 	}

-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildStarted", "Build started for pool "+currentPool.Name)
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuilding), "Build started for pool "+currentPool.Name)
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
@@ -1651,7 +1651,7 @@ func (b *buildReconciler) syncBuildSuccessStatus(ctx context.Context, pool *mcfg
 		return err
 	}

-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildSucceeded", "Build succeeded for pool "+currentPool.Name)
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildSuccess), "Build succeeded for pool "+currentPool.Name)
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
@@ -1669,7 +1669,7 @@ func (b *buildReconciler) syncBuildFailureStatus(ctx context.Context, pool *mcfg
 	}

 	// The message content may be truncated https://github.com/kubernetes/apimachinery/blob/f5dd29d6ada12819a4a6ddc97d5bdf812f8a1cad/pkg/apis/meta/v1/types.go#L1619-L1635
-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionTrue, "BuildFailed", fmt.Sprintf("Failed to build OS image for pool %s (MachineOSBuild: %s): %v", currentPool.Name, mosbName, buildErr))
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionTrue, string(mcfgv1.MachineConfigPoolBuildFailed), fmt.Sprintf("Failed to build OS image for pool %s (MachineOSBuild: %s): %v", currentPool.Name, mosbName, buildErr))
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, updateErr := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})

pkg/controller/node/status.go

Lines changed: 3 additions & 3 deletions
@@ -355,21 +355,21 @@ func (ctrl *Controller) calculateStatus(mcs []*mcfgv1.MachineConfigNode, cconfig
 		switch {
 		case mosbState.IsBuilding() || mosbState.IsBuildPrepared():
 			// Active build detected - clear any previous BuildDegraded condition
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildStarted", "New build started")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuilding), "New build started")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			// Update local variable for degraded calculation
 			buildDegraded = false
 		case mosbState.IsBuildSuccess():
 			// Successful build detected - clear any previous BuildDegraded condition
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildSucceeded", "Build completed successfully")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildSuccess), "Build completed successfully")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			buildDegraded = false
 		}
 	case mosc != nil:
 		// MachineOSConfig exists but no MachineOSBuild - this indicates a retry attempt
 		// Clear any previous BuildDegraded condition to allow the retry
 		if apihelpers.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildPending", "MachineOSConfig updated/created, waiting for MachineOSBuild")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildPending), "MachineOSConfig updated/created, waiting for MachineOSBuild")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			buildDegraded = false
 		}

test/e2e-ocl/helpers_test.go

Lines changed: 77 additions & 5 deletions
@@ -28,6 +28,7 @@ import (
 	"github.com/openshift/machine-config-operator/test/helpers"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/sync/errgroup"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -592,7 +593,8 @@ func streamBuildPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.C
 	return streamPodContainerLogsToFile(ctx, t, cs, pod, dirPath)
 }

-func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) {
+// Returns a list of pods that match a given job name.
+func listPodsForJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.PodList, error) {
 	job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{})
 	if err != nil {
 		return nil, fmt.Errorf("could not get job %s: %w", job, err)
@@ -603,6 +605,16 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string)
 		return nil, fmt.Errorf("could not get pods with job label %s: %w", jobName, err)
 	}

+	return podList, nil
+}
+
+// Retrieves the currently running build pod for a given job name.
+func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) {
+	podList, err := listPodsForJob(ctx, cs, jobName)
+	if err != nil {
+		return nil, fmt.Errorf("could not list pods for job %s: %w", jobName, err)
+	}
+
 	if podList != nil {
 		if len(podList.Items) == 1 {
 			return &podList.Items[0], nil
@@ -611,16 +623,33 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string)
 		// this is needed when we test the case for a new pod being created after deleting the existing one
 		// as sometimes it takes time for the old pod to be completely deleted
 		for _, pod := range podList.Items {
-			for _, status := range pod.Status.InitContainerStatuses {
-				if status.State.Running != nil {
-					return &pod, nil
-				}
+			if isBuildPodRunning(&pod) {
+				return &pod, nil
 			}
 		}
 	}
+
 	return nil, fmt.Errorf("no pod found for job %s", jobName)
 }

+// Determines if a build pod is running by first examining the init container
+// statuses and then the main container statuses.
+func isBuildPodRunning(pod *corev1.Pod) bool {
+	for _, status := range pod.Status.InitContainerStatuses {
+		if status.State.Running != nil {
+			return true
+		}
+	}
+
+	for _, status := range pod.Status.ContainerStatuses {
+		if status.State.Running != nil {
+			return true
+		}
+	}
+
+	return false
+}
+
 // getJobForMOSB returns the name of the job that was created for the given MOSB by comparing the job UID
 // to the UID stored in the MOSB annotation
 func getJobForMOSB(ctx context.Context, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) (string, error) {
@@ -1058,3 +1087,46 @@ func scaleDownDeployment(t *testing.T, cs *framework.ClientSet, deployment metav
 		require.NoError(t, setDeploymentReplicas(t, cs, deployment, originalReplicas))
 	})
 }
+
+// forceMachineOSBuildToFail() repeatedly deletes the build pod associated
+// with the given MachineOSBuild so that the job will fail.
+func forceMachineOSBuildToFail(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error {
+	start := time.Now()
+
+	jobName, err := getJobForMOSB(ctx, cs, mosb)
+	if err != nil {
+		return fmt.Errorf("could not identify job for MachineOSBuild %s: %w", mosb.Name, err)
+	}
+
+	t.Logf("Found job %s for MachineOSBuild %s, will delete pods belonging to this job to cause build failure", jobName, mosb.Name)
+
+	return wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) {
+		job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{})
+		if err != nil {
+			return false, fmt.Errorf("could not get job %s for MachineOSBuild %s: %w", jobName, mosb.Name, err)
+		}
+
+		for _, condition := range job.Status.Conditions {
+			if condition.Reason == batchv1.JobReasonBackoffLimitExceeded && condition.Status == corev1.ConditionTrue {
+				t.Logf("Job %s has indicated failure after %s", jobName, time.Since(start))
+				return true, nil
+			}
+		}

+		podList, err := listPodsForJob(ctx, cs, jobName)
+		if err != nil {
+			return false, fmt.Errorf("could not list pods for job %s: %w", jobName, err)
+		}
+
+		for _, pod := range podList.Items {
+			if pod.DeletionTimestamp == nil {
+				t.Logf("Deleting pod %s belonging to job %s", pod.Name, jobName)
+				if err := cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
+					return false, fmt.Errorf("could not delete pod %s: %w", pod.Name, err)
+				}
+			}
+		}
+
+		return false, nil
+	})
+}
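
For orientation, the helper above is driven from the e2e tests with a single call between the "build started" and "build failed" assertions. A short sketch of that call sequence, excerpted from the test changes shown below (the cs client set, mosc, ctx, and layeredMCPName come from the surrounding test scaffolding):

// Wait for the build to start.
firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)

// Repeatedly delete the build pods until the Job reports BackoffLimitExceeded,
// which is much faster than waiting for the build itself to fail.
require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))

// The MachineOSBuild should then be marked as failed.
kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)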

test/e2e-ocl/onclusterlayering_test.go

Lines changed: 25 additions & 26 deletions
@@ -433,12 +433,11 @@ func TestMachineConfigPoolChangeRestartsBuild(t *testing.T) {
 	require.NoError(t, err)
 }

-// This test starts a build with an image that is known to fail because it uses
-// an invalid containerfile. After failure, it edits the MachineOSConfig
-// with the expectation that the failed build and its will be deleted and a new
-// build will start in its place.
+// This test starts a build that it then forces to fail by deleting the build
+// pods until the job itself fails. After failure, it edits the
+// MachineOSConfig with the expectation that the failed build will be deleted
+// and a new build will start in its place.
 func TestGracefulBuildFailureRecovery(t *testing.T) {
-
 	ctx, cancel := context.WithCancel(context.Background())
 	t.Cleanup(cancel)

@@ -451,18 +450,17 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 		},
 	})

-	// Add a bad containerfile so that we can cause a build failure
-	t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name)
-
-	mosc.Spec.Containerfile = getBadContainerFileForFailureTest()
-
 	createMachineOSConfig(t, cs, mosc)

 	// Wait for the build to start.
 	firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)

 	t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name)

+	// Repeatedly delete the build pod until the job fails so that the failure happens quickly.
+	// Otherwise, it takes a very long time for the job to actually fail.
+	require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))
+
 	// Wait for the build to fail.
 	kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
 	kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)
@@ -476,8 +474,6 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 	updated, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{})
 	require.NoError(t, err)

-	t.Logf("Cleared out bad containerfile")
-
 	mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
 	require.NoError(t, err)

@@ -494,7 +490,6 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 	// Ensure that the second build is still running.
 	kubeassert.MachineOSBuildExists(secondMosb)
 	assertBuildObjectsAreCreated(t, kubeassert, secondMosb)
-
 }

 // This test validates that when a running builder is deleted, the
@@ -1343,9 +1338,12 @@ func waitForImageBuildDegradedCondition(ctx context.Context, t *testing.T, cs *f
 	return condition
 }

-// TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess tests that the ImageBuildDegraded condition is set to
-// True when a MachineOSBuild fails, and is set to False when a MachineOSBuild succeeds after a previous failure.
-func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
+// TestImageBuildDegradedOnFailureAndClearedOnBuildStart tests that the
+// ImageBuildDegraded condition is set to True when a MachineOSBuild fails, and
+// is set to False when a MachineOSBuild is started after a previous failure.
+// Previously, this test waited until the build was completed before verifying
+// that the state was no longer degraded.
+func TestImageBuildDegradedOnFailureAndClearedOnBuildStart(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	t.Cleanup(cancel)

@@ -1358,7 +1356,8 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 		},
 	})

-	// First, add a bad containerfile to cause a build failure
+	// First, add a bad containerfile to cause a build failure. However, we will
+	// actually delete the build pod to force the failure to happen faster.
 	t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name)
 	mosc.Spec.Containerfile = getBadContainerFileForFailureTest()

@@ -1368,13 +1367,17 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 	firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)
 	t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name)

+	// Force the build to fail faster by repeatedly deleting the build pods until
+	// the job reflects a failure status.
+	require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))
+
 	kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
 	kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)

 	// Wait for and verify ImageBuildDegraded condition is set to True
 	degradedCondition := waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionTrue)
 	require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should be present")
-	assert.Equal(t, "BuildFailed", degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed")
+	assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildFailed), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed")
 	assert.Contains(t, degradedCondition.Message, fmt.Sprintf("Failed to build OS image for pool %s", layeredMCPName), "ImageBuildDegraded message should contain pool name")
 	assert.Contains(t, degradedCondition.Message, firstMosb.Name, "ImageBuildDegraded message should contain MachineOSBuild name")

@@ -1402,16 +1405,12 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 	// Compute the new MachineOSBuild name
 	moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp)

-	// Wait for the second build to start and complete successfully
-	secondMosb := waitForBuildToStart(t, cs, moscChangeMosb)
-	finishedBuild := waitForBuildToComplete(t, cs, secondMosb)
-
-	t.Logf("Second build completed successfully: %s", finishedBuild.Name)
+	// Wait for the second build to start
+	waitForBuildToStart(t, cs, moscChangeMosb)

-	// Wait for and verify ImageBuildDegraded condition is now False
+	// Wait for and verify ImageBuildDegraded condition is False after the new build starts.
 	degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse)
 	require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present")
-	assert.Equal(t, "BuildSucceeded", degradedCondition.Reason, "ImageBuildDegraded reason should be BuildSucceeded")
-
+	assert.Equal(t, string(mcfgv1.MachineConfigPoolBuilding), degradedCondition.Reason, "ImageBuildDegraded reason should be Building")
 	t.Logf("ImageBuildDegraded condition correctly cleared to False with message: %s", degradedCondition.Message)
 }
