Commit 1f9967d

speed up e2e-ocl by deleting build pods
This also modifies the MachineConfigPool status updates to use the constants we've added for that purpose instead of non-authoritative strings.
1 parent: 034b109
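
The status-condition half of the change follows one pattern throughout: wherever a reason string such as "BuildStarted" or "BuildFailed" was hard-coded, the reason is now derived from the corresponding MachineConfigPool state constant. A minimal sketch of that pattern, not taken verbatim from this commit (the import paths, package name, and the mcfgclient parameter are assumptions):

package example

import (
	"context"

	mcfgv1 "github.com/openshift/api/machineconfiguration/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/openshift/machine-config-operator/pkg/apihelpers"
	mcfgclientset "github.com/openshift/machine-config-operator/pkg/generated/clientset/versioned"
)

// markBuildStarted clears the ImageBuildDegraded condition using the typed
// pool-state constant as the reason instead of the old literal "BuildStarted".
func markBuildStarted(ctx context.Context, mcfgclient mcfgclientset.Interface, pool *mcfgv1.MachineConfigPool) error {
	cond := apihelpers.NewMachineConfigPoolCondition(
		mcfgv1.MachineConfigPoolImageBuildDegraded,
		corev1.ConditionFalse,
		string(mcfgv1.MachineConfigPoolBuilding), // was: "BuildStarted"
		"Build started for pool "+pool.Name,
	)
	apihelpers.SetMachineConfigPoolCondition(&pool.Status, *cond)

	_, err := mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, pool, metav1.UpdateOptions{})
	return err
}

Deriving the reason from the typed constants keeps the condition reasons in lockstep with the pool states the controllers already match on, which appears to be what the commit message means by "non-authoritative strings".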

File tree: 4 files changed (+108, -37 lines)

pkg/controller/build/reconciler.go
pkg/controller/node/status.go
test/e2e-ocl/helpers_test.go
test/e2e-ocl/onclusterlayering_test.go

pkg/controller/build/reconciler.go

Lines changed: 3 additions & 3 deletions
@@ -1634,7 +1634,7 @@ func (b *buildReconciler) initializeBuildDegradedCondition(ctx context.Context,
 		return err
 	}

-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildStarted", "Build started for pool "+currentPool.Name)
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuilding), "Build started for pool "+currentPool.Name)
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
@@ -1651,7 +1651,7 @@ func (b *buildReconciler) syncBuildSuccessStatus(ctx context.Context, pool *mcfg
 		return err
 	}

-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildSucceeded", "Build succeeded for pool "+currentPool.Name)
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildSuccess), "Build succeeded for pool "+currentPool.Name)
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, err = b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})
@@ -1669,7 +1669,7 @@ func (b *buildReconciler) syncBuildFailureStatus(ctx context.Context, pool *mcfg
 	}

 	// The message content may be truncated https://github.com/kubernetes/apimachinery/blob/f5dd29d6ada12819a4a6ddc97d5bdf812f8a1cad/pkg/apis/meta/v1/types.go#L1619-L1635
-	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionTrue, "BuildFailed", fmt.Sprintf("Failed to build OS image for pool %s (MachineOSBuild: %s): %v", currentPool.Name, mosbName, buildErr))
+	buildDegraded := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionTrue, string(mcfgv1.MachineConfigPoolBuildFailed), fmt.Sprintf("Failed to build OS image for pool %s (MachineOSBuild: %s): %v", currentPool.Name, mosbName, buildErr))
 	apihelpers.SetMachineConfigPoolCondition(&currentPool.Status, *buildDegraded)

 	_, updateErr := b.mcfgclient.MachineconfigurationV1().MachineConfigPools().UpdateStatus(ctx, currentPool, metav1.UpdateOptions{})

pkg/controller/node/status.go

Lines changed: 3 additions & 3 deletions
@@ -355,21 +355,21 @@ func (ctrl *Controller) calculateStatus(mcs []*mcfgv1.MachineConfigNode, cconfig
 		switch {
 		case mosbState.IsBuilding() || mosbState.IsBuildPrepared():
 			// Active build detected - clear any previous BuildDegraded condition
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildStarted", "New build started")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuilding), "New build started")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			// Update local variable for degraded calculation
 			buildDegraded = false
 		case mosbState.IsBuildSuccess():
 			// Successful build detected - clear any previous BuildDegraded condition
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildSucceeded", "Build completed successfully")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildSuccess), "Build completed successfully")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			buildDegraded = false
 		}
 	case mosc != nil:
 		// MachineOSConfig exists but no MachineOSBuild - this indicates a retry attempt
 		// Clear any previous BuildDegraded condition to allow the retry
 		if apihelpers.IsMachineConfigPoolConditionTrue(pool.Status.Conditions, mcfgv1.MachineConfigPoolImageBuildDegraded) {
-			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, "BuildPending", "MachineOSConfig updated/created, waiting for MachineOSBuild")
+			buildDegradedClear := apihelpers.NewMachineConfigPoolCondition(mcfgv1.MachineConfigPoolImageBuildDegraded, corev1.ConditionFalse, string(mcfgv1.MachineConfigPoolBuildPending), "MachineOSConfig updated/created, waiting for MachineOSBuild")
 			apihelpers.SetMachineConfigPoolCondition(&status, *buildDegradedClear)
 			buildDegraded = false
 		}

test/e2e-ocl/helpers_test.go

Lines changed: 77 additions & 5 deletions
@@ -28,6 +28,7 @@ import (
 	"github.com/openshift/machine-config-operator/test/helpers"
 	"github.com/stretchr/testify/require"
 	"golang.org/x/sync/errgroup"
+	batchv1 "k8s.io/api/batch/v1"
 	corev1 "k8s.io/api/core/v1"
 	k8serrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -592,7 +593,8 @@ func streamBuildPodLogsToFile(ctx context.Context, t *testing.T, cs *framework.C
 	return streamPodContainerLogsToFile(ctx, t, cs, pod, dirPath)
 }

-func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) {
+// Returns a list of pods that match a given job name.
+func listPodsForJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.PodList, error) {
 	job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{})
 	if err != nil {
 		return nil, fmt.Errorf("could not get job %s: %w", job, err)
@@ -603,6 +605,16 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string)
 		return nil, fmt.Errorf("could not get pods with job label %s: %w", jobName, err)
 	}

+	return podList, nil
+}
+
+// Retrieves the currently running build pod for a given job name.
+func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string) (*corev1.Pod, error) {
+	podList, err := listPodsForJob(ctx, cs, jobName)
+	if err != nil {
+		return nil, fmt.Errorf("could not list pods for job %s: %w", jobName, err)
+	}
+
 	if podList != nil {
 		if len(podList.Items) == 1 {
 			return &podList.Items[0], nil
@@ -611,16 +623,33 @@ func getPodFromJob(ctx context.Context, cs *framework.ClientSet, jobName string)
 		// this is needed when we test the case for a new pod being created after deleting the existing one
 		// as sometimes it takes time for the old pod to be completely deleted
 		for _, pod := range podList.Items {
-			for _, status := range pod.Status.InitContainerStatuses {
-				if status.State.Running != nil {
-					return &pod, nil
-				}
+			if isBuildPodRunning(&pod) {
+				return &pod, nil
 			}
 		}
 	}
+
 	return nil, fmt.Errorf("no pod found for job %s", jobName)
 }

+// Determines if a build pod is running by first examining the init container
+// statuses and then the main container statuses.
+func isBuildPodRunning(pod *corev1.Pod) bool {
+	for _, status := range pod.Status.InitContainerStatuses {
+		if status.State.Running != nil {
+			return true
+		}
+	}
+
+	for _, status := range pod.Status.ContainerStatuses {
+		if status.State.Running != nil {
+			return true
+		}
+	}
+
+	return false
+}
+
 // getJobForMOSB returns the name of the job that was created for the given MOSB by comparing the job UID
 // to the UID stored in the MOSB annotation
 func getJobForMOSB(ctx context.Context, cs *framework.ClientSet, build *mcfgv1.MachineOSBuild) (string, error) {
@@ -1058,3 +1087,46 @@ func scaleDownDeployment(t *testing.T, cs *framework.ClientSet, deployment metav
 		require.NoError(t, setDeploymentReplicas(t, cs, deployment, originalReplicas))
 	})
 }
+
+// forceMachineOSBuildToFail() repeatedly deletes the build pod associated
+// with the given MachineOSBuild so that the job will fail.
+func forceMachineOSBuildToFail(ctx context.Context, t *testing.T, cs *framework.ClientSet, mosb *mcfgv1.MachineOSBuild) error {
+	start := time.Now()
+
+	jobName, err := getJobForMOSB(ctx, cs, mosb)
+	if err != nil {
+		return fmt.Errorf("could not identify job for MachineOSBuild %s: %w", mosb.Name, err)
+	}
+
+	t.Logf("Found job %s for MachineOSBuild %s, will delete pods belonging to this job to cause build failure", jobName, mosb.Name)
+
+	return wait.PollImmediate(1*time.Second, 5*time.Minute, func() (bool, error) {
+		job, err := cs.BatchV1Interface.Jobs(ctrlcommon.MCONamespace).Get(ctx, jobName, metav1.GetOptions{})
+		if err != nil {
+			return false, fmt.Errorf("could not get job %s for MachineOSBuild %s: %w", jobName, mosb.Name, err)
+		}
+
+		for _, condition := range job.Status.Conditions {
+			if condition.Reason == batchv1.JobReasonBackoffLimitExceeded && condition.Status == corev1.ConditionTrue {
+				t.Logf("Job %s has indicated failure after %s", jobName, time.Since(start))
+				return true, nil
+			}
+		}

+		podList, err := listPodsForJob(ctx, cs, jobName)
+		if err != nil {
+			return false, fmt.Errorf("could not list pods for job %s: %w", jobName, err)
+		}
+
+		for _, pod := range podList.Items {
+			if pod.DeletionTimestamp == nil {
+				t.Logf("Deleting pod %s belonging to job %s", pod.Name, jobName)
+				if err := cs.CoreV1Interface.Pods(ctrlcommon.MCONamespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
+					return false, fmt.Errorf("could not delete pod %s: %w", pod.Name, err)
+				}
+			}
+		}
+
+		return false, nil
+	})
+}
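
For orientation, the helper above is driven from the e2e tests with a single call between the "build started" and "build failed" assertions. A short sketch of that call sequence, excerpted from the test changes shown below (the cs client set, mosc, ctx, and layeredMCPName come from the surrounding test scaffolding):

// Wait for the build to start.
firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)

// Repeatedly delete the build pods until the Job reports BackoffLimitExceeded,
// which is much faster than waiting for the build itself to fail.
require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))

// The MachineOSBuild should then be marked as failed.
kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)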

test/e2e-ocl/onclusterlayering_test.go

Lines changed: 25 additions & 26 deletions
@@ -433,12 +433,11 @@ func TestMachineConfigPoolChangeRestartsBuild(t *testing.T) {
 	require.NoError(t, err)
 }

-// This test starts a build with an image that is known to fail because it uses
-// an invalid containerfile. After failure, it edits the MachineOSConfig
-// with the expectation that the failed build and its will be deleted and a new
-// build will start in its place.
+// This test starts a build that it then forces to fail by deleting the build
+// pods until the job itself fails. After failure, it edits the
+// MachineOSConfig with the expectation that the failed build will be deleted
+// and a new build will start in its place.
 func TestGracefulBuildFailureRecovery(t *testing.T) {
-
 	ctx, cancel := context.WithCancel(context.Background())
 	t.Cleanup(cancel)

@@ -451,18 +450,17 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 		},
 	})

-	// Add a bad containerfile so that we can cause a build failure
-	t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name)
-
-	mosc.Spec.Containerfile = getBadContainerFileForFailureTest()
-
 	createMachineOSConfig(t, cs, mosc)

 	// Wait for the build to start.
 	firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)

 	t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name)

+	// Repeatedly delete the build pod until the job fails so that the failure happens quickly.
+	// Otherwise, it takes a very long time for the job to actually fail.
+	require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))
+
 	// Wait for the build to fail.
 	kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
 	kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)
@@ -476,8 +474,6 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 	updated, err := cs.MachineconfigurationV1Interface.MachineOSConfigs().Update(ctx, apiMosc, metav1.UpdateOptions{})
 	require.NoError(t, err)

-	t.Logf("Cleared out bad containerfile")
-
 	mcp, err := cs.MachineconfigurationV1Interface.MachineConfigPools().Get(ctx, layeredMCPName, metav1.GetOptions{})
 	require.NoError(t, err)

@@ -494,7 +490,6 @@ func TestGracefulBuildFailureRecovery(t *testing.T) {
 	// Ensure that the second build is still running.
 	kubeassert.MachineOSBuildExists(secondMosb)
 	assertBuildObjectsAreCreated(t, kubeassert, secondMosb)
-
 }

 // This test validates that when a running builder is deleted, the
@@ -1343,9 +1338,12 @@ func waitForImageBuildDegradedCondition(ctx context.Context, t *testing.T, cs *f
 	return condition
 }

-// TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess tests that the ImageBuildDegraded condition is set to
-// True when a MachineOSBuild fails, and is set to False when a MachineOSBuild succeeds after a previous failure.
-func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
+// TestImageBuildDegradedOnFailureAndClearedOnBuildStart tests that the
+// ImageBuildDegraded condition is set to True when a MachineOSBuild fails, and
+// is set to False when a MachineOSBuild is started after a previous failure.
+// Previously, this test waited until the build was completed before verifying
+// that the state was no longer degraded.
+func TestImageBuildDegradedOnFailureAndClearedOnBuildStart(t *testing.T) {
 	ctx, cancel := context.WithCancel(context.Background())
 	t.Cleanup(cancel)

@@ -1358,7 +1356,8 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 		},
 	})

-	// First, add a bad containerfile to cause a build failure
+	// First, add a bad containerfile to cause a build failure. However, we will
+	// actually delete the build pod to force the failure to happen faster.
 	t.Logf("Adding a bad containerfile for MachineOSConfig %s to cause a build failure", mosc.Name)
 	mosc.Spec.Containerfile = getBadContainerFileForFailureTest()

@@ -1368,13 +1367,17 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 	firstMosb := waitForBuildToStartForPoolAndConfig(t, cs, layeredMCPName, mosc.Name)
 	t.Logf("Waiting for MachineOSBuild %s to fail", firstMosb.Name)

+	// Force the build to fail faster by repeatedly deleting the build pods until
+	// the job reflects a failure status.
+	require.NoError(t, forceMachineOSBuildToFail(ctx, t, cs, firstMosb))
+
 	kubeassert := helpers.AssertClientSet(t, cs).WithContext(ctx)
 	kubeassert.Eventually().MachineOSBuildIsFailure(firstMosb)

 	// Wait for and verify ImageBuildDegraded condition is set to True
 	degradedCondition := waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionTrue)
 	require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should be present")
-	assert.Equal(t, "BuildFailed", degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed")
+	assert.Equal(t, string(mcfgv1.MachineConfigPoolBuildFailed), degradedCondition.Reason, "ImageBuildDegraded reason should be BuildFailed")
 	assert.Contains(t, degradedCondition.Message, fmt.Sprintf("Failed to build OS image for pool %s", layeredMCPName), "ImageBuildDegraded message should contain pool name")
 	assert.Contains(t, degradedCondition.Message, firstMosb.Name, "ImageBuildDegraded message should contain MachineOSBuild name")

@@ -1402,16 +1405,12 @@ func TestImageBuildDegradedOnFailureAndClearedOnBuildSuccess(t *testing.T) {
 	// Compute the new MachineOSBuild name
 	moscChangeMosb := buildrequest.NewMachineOSBuildFromAPIOrDie(ctx, cs.GetKubeclient(), updated, mcp)

-	// Wait for the second build to start and complete successfully
-	secondMosb := waitForBuildToStart(t, cs, moscChangeMosb)
-	finishedBuild := waitForBuildToComplete(t, cs, secondMosb)
-
-	t.Logf("Second build completed successfully: %s", finishedBuild.Name)
+	// Wait for the second build to start
+	waitForBuildToStart(t, cs, moscChangeMosb)

-	// Wait for and verify ImageBuildDegraded condition is now False
+	// Wait for and verify ImageBuildDegraded condition is False after the new build starts.
 	degradedCondition = waitForImageBuildDegradedCondition(ctx, t, cs, layeredMCPName, corev1.ConditionFalse)
 	require.NotNil(t, degradedCondition, "ImageBuildDegraded condition should still be present")
-	assert.Equal(t, "BuildSucceeded", degradedCondition.Reason, "ImageBuildDegraded reason should be BuildSucceeded")
-
+	assert.Equal(t, string(mcfgv1.MachineConfigPoolBuilding), degradedCondition.Reason, "ImageBuildDegraded reason should be Building")
 	t.Logf("ImageBuildDegraded condition correctly cleared to False with message: %s", degradedCondition.Message)
 }
