diff --git a/.gitignore b/.gitignore index 7aa66c9746..923ddaa26c 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ plugins-prod sandbox wave-tests x/* +mise.toml diff --git a/docs/google.md b/docs/google.md index d2188a4e8f..6ae339a78d 100644 --- a/docs/google.md +++ b/docs/google.md @@ -174,7 +174,9 @@ See the [Google Batch documentation](https://cloud.google.com/compute/docs/disks :::{versionadded} 23.06.0-edge ::: -The `disk` directive can be used to set the boot disk size or provision a disk for scratch storage. If the disk type is specified with the `type` option, a new disk will be mounted to the task VM at `/tmp` with the requested size and type. Otherwise, it will set the boot disk size, overriding the `google.batch.bootDiskSize` config option. See the [Google Batch documentation](https://cloud.google.com/compute/docs/disks) for more information about the available disk types. +The `disk` directive can be used to set the boot disk size or provision a disk for scratch storage. If the disk type is specified with the `type` option, a new disk will be mounted to the task VM at `/tmp` with the requested size and type. Otherwise, it will set the boot disk size, overriding the `google.batch.bootDiskSize` config option. + +The boot disk type can be specified using the `google.batch.bootDiskType` config option (e.g., `pd-standard`, `pd-balanced`, `pd-ssd`, `hyperdisk-balanced`). See the [Google Batch documentation](https://cloud.google.com/compute/docs/disks) for more information about the available disk types. Examples: @@ -290,5 +292,4 @@ Nextflow will automatically manage the transfer of input and output files betwee - Compute resources in Google Cloud are subject to [resource quotas](https://cloud.google.com/compute/quotas), which may affect your ability to run pipelines at scale. You can request quota increases, and your quotas may automatically increase over time as you use the platform. 
In particular, GPU quotas are initially set to 0, so you must explicitly request a quota increase in order to use GPUs. You can initially request an increase to 1 GPU at a time, and after one billing cycle you may be able to increase it further. -- Currently, it's not possible to specify a disk type different from the default one assigned by the service depending on the chosen instance type. diff --git a/docs/reference/config.md b/docs/reference/config.md index f8e678fa45..a14c8f8bb7 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -915,6 +915,10 @@ The following settings are available: `google.batch.bootDiskSize` : The size of the virtual machine boot disk, e.g `50.GB` (default: none). +`google.batch.bootDiskType` +: The type of the virtual machine boot disk, e.g `pd-balanced`, `pd-ssd`, `pd-standard`, `hyperdisk-balanced` (default: none). +: See [Google documentation](https://cloud.google.com/compute/docs/disks) for details on available disk types. + `google.batch.cpuPlatform` : The [minimum CPU Platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform#specifications), e.g. `'Intel Skylake'` (default: none). 
diff --git a/modules/nextflow/src/main/resources/META-INF/build-info.properties b/modules/nextflow/src/main/resources/META-INF/build-info.properties index 1d53f6522e..d418457325 100644 --- a/modules/nextflow/src/main/resources/META-INF/build-info.properties +++ b/modules/nextflow/src/main/resources/META-INF/build-info.properties @@ -1,4 +1,4 @@ build=0 version=25.10.0 -timestamp=1761145935343 -commitId=2069c9729 +timestamp=1764102875132 +commitId=08907d7e0 diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy index d21554ec70..546f4dd27b 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelector.groovy @@ -83,6 +83,12 @@ class GoogleBatchMachineTypeSelector { */ private static final List ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*'] + /* + * Families that only support Hyperdisk (no standard PD) + * LAST UPDATE 2024-05-22 + */ + private static final List GENERAL_PURPOSE_FAMILIES = ['c4-*', 'c4a-*', 'c4d-*', 'n4-*', 'n4d-*', 'n4a-*'] + @Immutable static class MachineType { String type @@ -256,10 +262,15 @@ class GoogleBatchMachineTypeSelector { // These families have a local SSD already attached and is not configurable. 
if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) || + ((machineType.family == "c4" || machineType.family == "c4a" || machineType.family == "c4d") && machineType.type.endsWith("-lssd")) || machineType.family == "a3" || machineType.type.startsWith("a2-ultragpu-") ) return new MemoryUnit( 0 ) + // These families do not support local SSD + if( machineType.family == "n4" || machineType.family == "n4a" || machineType.family == "n4d" ) + return new MemoryUnit( 0 ) + // For other special families, the user must provide a valid size. If a family does not // support local disks, then Google Batch shall return an appropriate error. return requested @@ -303,4 +314,14 @@ class GoogleBatchMachineTypeSelector { return ACCELERATOR_OPTIMIZED_FAMILIES.any { matchType(it, machineType.type) } } + /** + * Check if the machine type belongs to a family that only supports Hyperdisk. + * + * @param machineType Machine type + * @return Boolean value indicating if the machine type requires Hyperdisk. 
+ */ + boolean isHyperdiskOnly(String machineType) { + return GENERAL_PURPOSE_FAMILIES.any { matchType(it, machineType) } + } + } diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy index f5f4da785d..b811e075ad 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/GoogleBatchTaskHandler.groovy @@ -333,9 +333,6 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { else { final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder() - if( batchConfig.getBootDiskImage() ) - instancePolicy.setBootDisk( AllocationPolicy.Disk.newBuilder().setImage( batchConfig.getBootDiskImage() ) ) - if( fusionEnabled() && !disk ) { disk = new DiskResource(request: '375 GB', type: 'local-ssd') log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk" @@ -355,6 +352,27 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask { ) } + // Configure boot disk + final bootDisk = AllocationPolicy.Disk.newBuilder() + boolean setBoot = false + if( batchConfig.getBootDiskImage() ) { + bootDisk.setImage( batchConfig.getBootDiskImage() ) + setBoot = true + } + + if( batchConfig.bootDiskType ) { + bootDisk.setType( batchConfig.bootDiskType ) + setBoot = true + } + + if( machineType && GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(machineType.type) && !batchConfig.bootDiskType ) { + bootDisk.setType('hyperdisk-balanced') + setBoot = true + } + + if( setBoot ) + instancePolicy.setBootDisk(bootDisk) + if( task.config.getAccelerator() ) { final accelerator = AllocationPolicy.Accelerator.newBuilder() .setCount( task.config.getAccelerator().getRequest() ) diff --git a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy 
b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy index 154fc4d4b3..49dcf263de 100644 --- a/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy +++ b/plugins/nf-google/src/main/nextflow/cloud/google/batch/client/BatchConfig.groovy @@ -67,6 +67,12 @@ class BatchConfig implements ConfigScope { """) final MemoryUnit bootDiskSize + @ConfigOption + @Description(""" + The type of the virtual machine boot disk, e.g `pd-balanced`, `pd-ssd`, `pd-standard`, `hyperdisk-balanced` (default: none). + """) + final String bootDiskType + @ConfigOption @Description(""" The [minimum CPU Platform](https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform#specifications), e.g. `'Intel Skylake'` (default: none). @@ -144,6 +150,7 @@ class BatchConfig implements ConfigScope { autoRetryExitCodes = opts.autoRetryExitCodes as List ?: DEFAULT_RETRY_LIST bootDiskImage = opts.bootDiskImage bootDiskSize = opts.bootDiskSize as MemoryUnit + bootDiskType = opts.bootDiskType cpuPlatform = opts.cpuPlatform gcsfuseOptions = opts.gcsfuseOptions as List ?: DEFAULT_GCSFUSE_OPTS installGpuDrivers = opts.installGpuDrivers as boolean diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy index 35a8780f7d..e3ea28e47d 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchMachineTypeSelectorTest.groovy @@ -112,4 +112,35 @@ class GoogleBatchMachineTypeSelectorTest extends Specification { 'a3-highgpu-1g' | 0 | true 'g2-standard-4' | 0 | true } + + def 'should identify hyperdisk only families'() { + expect: + GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(TYPE) == EXPECTED + + where: + TYPE | EXPECTED + 'c4-standard-4' | true + 'c4a-standard-4'| true + 'c4d-standard-4'| 
true + 'n4-standard-4' | true + 'n4d-standard-4'| true + 'n4a-standard-4'| true + 'n2-standard-4' | false + 'e2-standard-4' | false + } + + def 'should return 0 for local ssd on unsupported families'() { + expect: + final machineType = new MachineType(type: TYPE, family: FAMILY, cpusPerVm: 4) + GoogleBatchMachineTypeSelector.INSTANCE.findValidLocalSSDSize(MemoryUnit.of('375 GB'), machineType) == MemoryUnit.of('0 GB') + + where: + TYPE | FAMILY + 'n4-standard-4' | 'n4' + 'n4a-standard-4' | 'n4a' + 'n4d-standard-4' | 'n4d' + 'c4-standard-4-lssd' | 'c4' + 'c4a-standard-4-lssd'| 'c4a' + 'c4d-standard-4-lssd'| 'c4d' + } } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy index 9525def0bf..af2d593c41 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/GoogleBatchTaskHandlerTest.groovy @@ -774,4 +774,114 @@ class GoogleBatchTaskHandlerTest extends Specification { } + def 'should use hyperdisk-balanced for boot disk on new machine families' () { + given: + def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + def CONTAINER_IMAGE = 'debian:latest' + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) + } + def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:]) + def task = Mock(TaskRun) { + toTaskBean() >> bean + getHashLog() >> 'abcd1234' + getWorkDir() >> WORK_DIR + getContainer() >> CONTAINER_IMAGE + getConfig() >> Mock(TaskConfig) { + getCpus() >> 4 + getResourceLabels() >> [:] + getMachineType() >> MACHINE_TYPE + } + } + def handler = Spy(new GoogleBatchTaskHandler(task, exec)) + def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', []) + + when: + def req = handler.newSubmitRequest(task, launcher) + then: + handler.fusionEnabled() >> false + 
handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, location: "location", priceModel: PriceModel.spot) + + and: + def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy() + instancePolicy.getBootDisk().getType() == 'hyperdisk-balanced' + + where: + MACHINE_TYPE << ['c4-standard-4', 'c4a-standard-4', 'c4d-standard-4', 'n4-standard-4', 'n4d-standard-4', 'n4a-standard-4'] + } + + def 'should set boot disk type from config' () { + given: + def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + def CONTAINER_IMAGE = 'debian:latest' + def BOOT_DISK_TYPE = 'pd-ssd' + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) { + getBootDiskType() >> BOOT_DISK_TYPE + } + } + def bean = new TaskBean(workDir: WORK_DIR, inputFiles: [:]) + def task = Mock(TaskRun) { + toTaskBean() >> bean + getHashLog() >> 'abcd1234' + getWorkDir() >> WORK_DIR + getContainer() >> CONTAINER_IMAGE + getConfig() >> Mock(TaskConfig) { + getCpus() >> 2 + getResourceLabels() >> [:] + } + } + def handler = Spy(new GoogleBatchTaskHandler(task, exec)) + def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', []) + + when: + def req = handler.newSubmitRequest(task, launcher) + then: + handler.fusionEnabled() >> false + handler.findBestMachineType(_, false) >> null + + and: + def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy() + instancePolicy.getBootDisk().getType() == BOOT_DISK_TYPE + } + + def 'should prioritize bootDiskType config over hyperdisk-balanced for new machine families' () { + given: + def WORK_DIR = CloudStorageFileSystem.forBucket('foo').getPath('/scratch') + def CONTAINER_IMAGE = 'debian:latest' + def BOOT_DISK_TYPE = 'pd-standard' + def MACHINE_TYPE = 'c4-standard-4' + def exec = Mock(GoogleBatchExecutor) { + getBatchConfig() >> Mock(BatchConfig) { + getBootDiskType() >> BOOT_DISK_TYPE + } + } + def bean = new TaskBean(workDir: WORK_DIR, 
inputFiles: [:]) + def task = Mock(TaskRun) { + toTaskBean() >> bean + getHashLog() >> 'abcd1234' + getWorkDir() >> WORK_DIR + getContainer() >> CONTAINER_IMAGE + getConfig() >> Mock(TaskConfig) { + getCpus() >> 4 + getResourceLabels() >> [:] + getMachineType() >> MACHINE_TYPE + } + } + def handler = Spy(new GoogleBatchTaskHandler(task, exec)) + def launcher = new GoogleBatchLauncherSpecMock('bash .command.run', []) + + when: + def req = handler.newSubmitRequest(task, launcher) + then: + handler.fusionEnabled() >> false + handler.findBestMachineType(_, false) >> new GoogleBatchMachineTypeSelector.MachineType(type: MACHINE_TYPE, location: "location", priceModel: PriceModel.spot) + + and: + def instancePolicy = req.getAllocationPolicy().getInstances(0).getPolicy() + // bootDiskType from config should override the automatic hyperdisk-balanced + instancePolicy.getBootDisk().getType() == BOOT_DISK_TYPE + } + + } diff --git a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy index 4b24d652ea..d58f475186 100644 --- a/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy +++ b/plugins/nf-google/src/test/nextflow/cloud/google/batch/client/BatchConfigTest.groovy @@ -37,6 +37,7 @@ class BatchConfigTest extends Specification { and: !config.bootDiskImage !config.bootDiskSize + !config.bootDiskType !config.logsPath } @@ -49,6 +50,7 @@ class BatchConfigTest extends Specification { retryPolicy: [maxAttempts: 10], bootDiskImage: 'batch-foo', bootDiskSize: '100GB', + bootDiskType: 'pd-ssd', logsPath: 'gs://my-logs-bucket/logs' ] @@ -63,6 +65,7 @@ class BatchConfigTest extends Specification { and: config.bootDiskImage == 'batch-foo' config.bootDiskSize == MemoryUnit.of('100GB') + config.bootDiskType == 'pd-ssd' and: config.logsPath == 'gs://my-logs-bucket/logs' }