Skip to content

Commit

Permalink
Include ignored pods when computing backoff delay for Job pod failures
Browse files Browse the repository at this point in the history
  • Loading branch information
mimowo committed Jul 19, 2023
1 parent 90c362b commit 35d0af9
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 10 deletions.
25 changes: 15 additions & 10 deletions pkg/controller/job/job_controller.go
Expand Up @@ -794,7 +794,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
active := int32(len(jobCtx.activePods))
newSucceededPods, newFailedPods := getNewFinishedPods(jobCtx)
jobCtx.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(jobCtx.uncounted.succeeded))
failed := job.Status.Failed + int32(len(newFailedPods)) + int32(len(jobCtx.uncounted.failed))
failed := job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed))
var ready *int32
if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) {
ready = pointer.Int32(countReadyPods(jobCtx.activePods))
Expand Down Expand Up @@ -951,6 +951,19 @@ func (jm *Controller) deleteActivePods(ctx context.Context, job *batch.Job, pods
return successfulDeletes, errorFromChannel(errCh)
}

func nonIgnoredFailedPodsCount(jobCtx *syncJobCtx, failedPods []*v1.Pod) int {
result := len(failedPods)
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil {
for _, p := range failedPods {
_, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p)
if !countFailed {
result--
}
}
}
return result
}

// deleteJobPods deletes the pods, returns the number of successful removals
// and any error.
func (jm *Controller) deleteJobPods(ctx context.Context, job *batch.Job, jobKey string, pods []*v1.Pod) (int32, error) {
Expand Down Expand Up @@ -1406,15 +1419,7 @@ func getNewFinishedPods(jobCtx *syncJobCtx) (succeededPods, failedPods []*v1.Pod
return p.Status.Phase == v1.PodSucceeded
})
failedPods = getValidPodsWithFilter(jobCtx, jobCtx.uncounted.Failed(), func(p *v1.Pod) bool {
if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) && jobCtx.job.Spec.PodFailurePolicy != nil {
if !isPodFailed(p, jobCtx.job) {
return false
}
_, countFailed, _ := matchPodFailurePolicy(jobCtx.job.Spec.PodFailurePolicy, p)
return countFailed
} else {
return isPodFailed(p, jobCtx.job)
}
return isPodFailed(p, jobCtx.job)
})
return succeededPods, failedPods
}
Expand Down
47 changes: 47 additions & 0 deletions pkg/controller/job/job_controller_test.go
Expand Up @@ -3019,6 +3019,53 @@ func TestSyncJobWithJobPodFailurePolicy(t *testing.T) {
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"ignore pod failure based on OnPodConditions, ignored failures delays pod recreation": {
enableJobPodFailurePolicy: true,
job: batch.Job{
TypeMeta: metav1.TypeMeta{Kind: "Job"},
ObjectMeta: validObjectMeta,
Spec: batch.JobSpec{
Selector: validSelector,
Template: validTemplate,
Parallelism: pointer.Int32(1),
Completions: pointer.Int32(1),
BackoffLimit: pointer.Int32(0),
PodFailurePolicy: &batch.PodFailurePolicy{
Rules: []batch.PodFailurePolicyRule{
{
Action: batch.PodFailurePolicyActionIgnore,
OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{
{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
},
},
pods: []v1.Pod{
{
ObjectMeta: metav1.ObjectMeta{
DeletionTimestamp: &now,
},
Status: v1.PodStatus{
Phase: v1.PodFailed,
Conditions: []v1.PodCondition{
{
Type: v1.DisruptionTarget,
Status: v1.ConditionTrue,
},
},
},
},
},
wantConditions: nil,
wantStatusActive: 0,
wantStatusFailed: 0,
wantStatusSucceeded: 0,
},
"fail job based on OnPodConditions": {
enableJobPodFailurePolicy: true,
job: batch.Job{
Expand Down

0 comments on commit 35d0af9

Please sign in to comment.