From ec2e8d261f0b81cd540b89adec02f8a98d7f9d1a Mon Sep 17 00:00:00 2001 From: jorgee Date: Mon, 16 Mar 2026 11:10:50 +0100 Subject: [PATCH] fixes exit code when spot claim and succeeded autoretry (maxSpotAttempts > 0) Signed-off-by: jorgee --- .../cloud/google/batch/GoogleBatchTaskHandler.groovy | 4 ++++ .../google/batch/GoogleBatchTaskHandlerTest.groovy | 12 +++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy index 1fb8d0ec68..c3fd836ce7 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy @@ -688,6 +688,10 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { task.stderr = executor.logging.stderr(uid, taskId) ?: errorFile } else { + // Retried spot instances could keep the 500xx exit code event when the automatic retry succeeds. In this case, we need to read the exit code from .exitcode + // https://github.com/nextflow-io/nextflow/issues/6779 + if( task.exitStatus >= 50000 ) + task.exitStatus = readExitFile() task.stdout = outputFile task.stderr = errorFile } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy index e5748d2c17..d1a8155658 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy @@ -796,7 +796,7 @@ class GoogleBatchTaskHandlerTest extends Specification { def client = Mock(BatchClient){ getTaskInArrayStatus(jobId, taskId) >> { TASK_STATE ? 
makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null } getTaskStatus(jobId, taskId) >> { TASK_STATE ? makeTaskStatus(TASK_STATE, DESC, EXIT_CODE): null } - getJobStatus(jobId ) >> makeJobStatus(JobStatus.State.FAILED,DESC) + getJobStatus(jobId ) >> makeJobStatus(JOB_STATE,DESC) } def logging = Mock(BatchLogging) def executor = Mock(GoogleBatchExecutor){ @@ -808,16 +808,18 @@ class GoogleBatchTaskHandlerTest extends Specification { when: def result = handler.checkIfCompleted() then: - 0 * handler.readExitFile() >> EXIT_STATUS + NUM_READ_EXIT * handler.readExitFile() >> EXIT_STATUS handler.status == TASK_STATUS handler.task.exitStatus == EXIT_STATUS handler.task.error?.message == TASK_ERROR result == RESULT where: - TASK_STATE | DESC | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT | TASK_ERROR - TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.' - TaskStatus.State.FAILED | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.' + TASK_STATE | JOB_STATE | NUM_READ_EXIT | DESC | EXIT_CODE | ARRAY_CHILD | TASK_STATUS | EXIT_STATUS | RESULT | TASK_ERROR + TaskStatus.State.FAILED | JobStatus.State.FAILED | 0 | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.' + TaskStatus.State.FAILED | JobStatus.State.FAILED | 0 | 'Task failed due to Spot VM preemption with exit code 50001.' | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 50001 | true | 'Task failed due to Spot VM preemption with exit code 50001.' 
+ TaskStatus.State.SUCCEEDED | JobStatus.State.SUCCEEDED | 1 | null | 50001 | true | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null + TaskStatus.State.SUCCEEDED | JobStatus.State.SUCCEEDED | 1 | null | 50001 | false | nextflow.processor.TaskStatus.COMPLETED | 0 | true | null } StatusEvent makeStatusEventWithTime(long seconds, Integer exitCode) {