Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,5 @@ plugins-prod
/test-sched
/test-module
/results
/x/*
/x/*
mise.toml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,25 @@ class GoogleBatchMachineTypeSelector {
*/
private static final List<String> ACCELERATOR_OPTIMIZED_FAMILIES = ['a2-*', 'a3-*', 'g2-*']

/*
* Families that only support Hyperdisk disk types (not pd-standard, pd-balanced, pd-ssd).
* These require 'hyperdisk-*' as boot disk type.
* https://docs.cloud.google.com/compute/docs/general-purpose-machines?hl=en#supported_disk_types_for_c4
*/
private static final List<String> HYPERDISK_ONLY_FAMILIES = ['c4-*', 'c4a-*', 'c4d-*', 'n4-*', 'n4a-*', 'n4d-*', 'z3-*']
/*
* Families that only support Persistent Disk (pd-*) disk types
*/
private static final List<String> PD_ONLY_FAMILIES = ['e2-*']
/*
* Families that do not support Local SSD
*/
private static final List<String> NO_LOCAL_SSD_SUPPORT_FAMILIES = ['e2-*', 'h3-*', 'm2-*', 'm4-*', 'n4-*', 't2a-*', 't2d-*', 'x4-*']
/*
* Families that support local SSD with 'lssd' suffix
*/
private static final List<String> PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES = ['c3-*', 'c3a-*', 'c3d-*', 'c4-*', 'c4a-*', 'c4d-*', 'h4d-*', 'z3-*']

@Immutable
static class MachineType {
String type
Expand Down Expand Up @@ -122,11 +141,13 @@ class GoogleBatchMachineTypeSelector {
final matchMachineType = {String type -> !families || families.find { matchType(it, type) }}

// find machines with enough resources and SSD local disk
final validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
def validMachineTypes = getAvailableMachineTypes(region, spot).findAll {
it.cpusPerVm >= cpus &&
it.memPerVm >= memoryGB &&
matchMachineType(it.type)
}.collect()
if (fusionEnabled)
validMachineTypes = validMachineTypes.findAll { hasLocalSsd(it.type)}.collect()

final sortedByCost = validMachineTypes.sort {
(it.cpusPerVm > 2 || it.memPerVm > 2 ? FAMILY_COST_CORRECTION.get(it.family, 1.0) : 1.0) * (spot ? it.spotPrice : it.onDemandPrice)
Expand All @@ -135,7 +156,7 @@ class GoogleBatchMachineTypeSelector {
return sortedByCost.first()
}

protected boolean matchType(String family, String vmType) {
protected static boolean matchType(String family, String vmType) {
if (!family)
return true
if (family.contains('*'))
Expand Down Expand Up @@ -253,17 +274,22 @@ class GoogleBatchMachineTypeSelector {
return findFirstValidSize(requested, [8])
}

// These families have a local SSD already attached and is not configurable.
if( ((machineType.family == "c3" || machineType.family == "c3d") && machineType.type.endsWith("-lssd")) ||
machineType.family == "a3" ||
machineType.type.startsWith("a2-ultragpu-") )
if( notConfigurableLocalSSD(machineType) )
return new MemoryUnit( 0 )

// For other special families, the user must provide a valid size. If a family does not
// support local disks, then Google Batch shall return an appropriate error.
return requested
}

/**
 * Tell whether the machine type ships with local SSD already attached, so its
 * size cannot be chosen by the user.
 *
 * @param machineType Machine type descriptor; its {@code family} and {@code type} are inspected
 * @return {@code true} for '-lssd' types of the c3/c3d/c4/c4a/c4d families, for any
 *         'a3' family type, and for types whose name starts with 'a2-ultragpu-'
 */
protected notConfigurableLocalSSD(MachineType machineType) {
    // families where only the '-lssd' flavoured types carry the bundled local SSD
    final lssdFamilies = ["c3", "c3d", "c4", "c4a", "c4d"]
    if( (machineType.family in lssdFamilies) && machineType.type.endsWith("-lssd") )
        return true

    // the whole a3 family and the a2 ultragpu types are matched regardless of suffix
    return machineType.family == "a3" || machineType.type.startsWith("a2-ultragpu-")
}

/**
* Find first valid disk size given the possible mounted partition
*
Expand All @@ -287,6 +313,53 @@ class GoogleBatchMachineTypeSelector {
return new MemoryUnit( numberOfDisks * 375L * (1<<30) )
}

/**
 * Tell whether a machine type matches one of the patterns listed in
 * {@code HYPERDISK_ONLY_FAMILIES}.
 *
 * @param machineType Machine type name, e.g. 'c4-standard-8'
 * @return {@code true} when at least one family pattern matches, {@code false} otherwise
 */
static boolean isHyperdiskOnly(String machineType) {
    for( String familyPattern : HYPERDISK_ONLY_FAMILIES ) {
        if( matchType(familyPattern, machineType) )
            return true
    }
    return false
}

/**
 * Tell whether a machine type matches one of the patterns listed in
 * {@code PD_ONLY_FAMILIES}, i.e. families restricted to pd-* disk types.
 *
 * @param machineType Machine type name
 * @return {@code true} when at least one family pattern matches, {@code false} otherwise
 */
static boolean isPdOnly(String machineType) {
    final String matchingFamily = PD_ONLY_FAMILIES.find { String familyPattern ->
        matchType(familyPattern, machineType)
    }
    return matchingFamily != null
}

/**
 * Check whether the machine type can have local SSD disks.
 *
 * A type whose name contains 'lssd' is always accepted. Otherwise the type is
 * rejected when it matches a family in {@code PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES}
 * (local SSD only through 'lssd' types) or in {@code NO_LOCAL_SSD_SUPPORT_FAMILIES}.
 *
 * @param machineType Machine type name
 * @return {@code true} when the machine type can have local SSD disks
 */
static boolean hasLocalSsd(String machineType) {
    // the 'lssd' marker wins over any family-based exclusion below
    if( machineType.contains('lssd') )
        return true

    final boolean lssdOnlyFamily = PARTIAL_LOCAL_SSD_SUPPORT_FAMILIES.any { String familyPattern ->
        matchType(familyPattern, machineType)
    }
    // the second list is only consulted when the first one did not match
    return !lssdOnlyFamily && !NO_LOCAL_SSD_SUPPORT_FAMILIES.any { String familyPattern ->
        matchType(familyPattern, machineType)
    }
}
/**
 * Check whether a machine type or family matches one of the patterns listed in
 * {@code NO_LOCAL_SSD_SUPPORT_FAMILIES}.
 *
 * @param machineTypeOrFamily Machine type name or family pattern
 * @return {@code true} when at least one pattern matches, {@code false} otherwise
 */
static boolean unsupportedLocalSSD(String machineTypeOrFamily) {
    for( String familyPattern : NO_LOCAL_SSD_SUPPORT_FAMILIES ) {
        if( matchType(familyPattern, machineTypeOrFamily) )
            return true
    }
    return false
}

/**
* Determine whether GPU drivers should be installed.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -401,11 +401,12 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
else {
final instancePolicy = AllocationPolicy.InstancePolicy.newBuilder()

if( batchConfig.getBootDiskImage() )
instancePolicy.setBootDisk(AllocationPolicy.Disk.newBuilder().setImage(batchConfig.getBootDiskImage()))

if( fusionEnabled() && !disk ) {
disk = new DiskResource(request: '375 GB', type: 'local-ssd')
final reqMachineType = task.config.getMachineType()
disk = new DiskResource(
request: '375 GB',
type: reqMachineType ? chooseFusionDiskType(reqMachineType) : 'local-ssd'
)
log.debug "[GOOGLE BATCH] Process `${task.lazyName()}` - adding local volume as fusion scratch: $disk"
}

Expand All @@ -423,6 +424,20 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
)
}

// Configure boot disk
final bootDisk = AllocationPolicy.Disk.newBuilder()
boolean setBoot = false
if( batchConfig.getBootDiskImage() ) {
bootDisk.setImage(batchConfig.getBootDiskImage())
setBoot = true
}
if( machineType && GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(machineType.type) ) {
bootDisk.setType('hyperdisk-balanced')
setBoot = true
}
if( setBoot )
instancePolicy.setBootDisk(bootDisk)

if( task.config.getAccelerator() ) {
final accelerator = AllocationPolicy.Accelerator.newBuilder()
.setCount(task.config.getAccelerator().getRequest())
Expand Down Expand Up @@ -482,6 +497,22 @@ class GoogleBatchTaskHandler extends TaskHandler implements FusionAwareTask {
return new InstancePolicyResult(instancePolicyOrTemplate.build(), requiresScratchVolume)
}

/**
 * Pick the Fusion scratch disk type for the given machine type or family.
 *
 * The order of preference is 'local-ssd', then 'pd-balanced' for pd-only
 * families, and 'hyperdisk-balanced' for everything else.
 *
 * NOTE(review): the first check only consults the no-local-SSD family list, so a
 * type from a family that offers local SSD solely through '-lssd' types still
 * gets 'local-ssd' - confirm this is intended.
 *
 * @param machineTypeOrFamily Machine type name or family pattern
 * @return Disk type: 'local-ssd', 'pd-balanced' or 'hyperdisk-balanced'
 */
protected String chooseFusionDiskType(String machineTypeOrFamily){
    final boolean noLocalSsd = GoogleBatchMachineTypeSelector.unsupportedLocalSSD(machineTypeOrFamily)
    if( !noLocalSsd )
        return 'local-ssd'

    // local SSD is ruled out: fall back to the persistent or hyperdisk flavour
    return GoogleBatchMachineTypeSelector.isPdOnly(machineTypeOrFamily)
        ? 'pd-balanced'
        : 'hyperdisk-balanced'
}

/**
* Build the allocation policy for the job
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
new MachineType(type: 'm2-type08', family: 'm2', 'spotPrice': 0.036, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n2-type09', family: 'n2', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 10, 'memPerVm': 10),
new MachineType(type: 'c2-type10', family: 'c2', 'spotPrice': 0.045, 'onDemandPrice': 0.45, 'cpusPerVm': 10, 'memPerVm': 10),
new MachineType(type: 'c4-type11', family: 'c4', 'spotPrice': 0.040, 'onDemandPrice': 0.40, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'c4a-type12', family: 'c4a', 'spotPrice': 0.038, 'onDemandPrice': 0.38, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'c4d-type13', family: 'c4d', 'spotPrice': 0.039, 'onDemandPrice': 0.39, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4-type14', family: 'n4', 'spotPrice': 0.035, 'onDemandPrice': 0.35, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4a-type15', family: 'n4a', 'spotPrice': 0.033, 'onDemandPrice': 0.33, 'cpusPerVm': 8, 'memPerVm': 8),
new MachineType(type: 'n4d-type16', family: 'n4d', 'spotPrice': 0.034, 'onDemandPrice': 0.34, 'cpusPerVm': 8, 'memPerVm': 8),
]

def 'should select best machine type'() {
Expand All @@ -57,6 +63,12 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
8 | 8000 | 'reg' | true | false | null | 'm1-type07'
8 | 8000 | 'reg' | false | false | ['m?-*', 'c2-*'] | 'm2-type08'
8 | 8000 | 'reg' | false | false | ['m1-type07', 'm2-type66'] | 'm1-type07'
8 | 8000 | 'reg' | true | false | ['c4-*'] | 'c4-type11'
8 | 8000 | 'reg' | true | false | ['c4a-*'] | 'c4a-type12'
8 | 8000 | 'reg' | true | false | ['c4d-*'] | 'c4d-type13'
8 | 8000 | 'reg' | true | false | ['n4-*'] | 'n4-type14'
8 | 8000 | 'reg' | true | false | ['n4a-*'] | 'n4a-type15'
8 | 8000 | 'reg' | true | false | ['n4d-*'] | 'n4d-type16'


}
Expand Down Expand Up @@ -113,6 +125,27 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
'200 GB' | 'c2-standard-4' | 'c2' | 4 | '375 GB'
'50 GB' | 'c2d-highmem-56' | 'c2d' | 56 | '1500 GB'
'750 GB' | 'm3-megamem-64' | 'm3' | 64 | '1500 GB'
'100 GB' | 'c4-standard-8-lssd' | 'c4' | 8 | '0'
'100 GB' | 'c4a-standard-8-lssd' | 'c4a' | 8 | '0'
'100 GB' | 'c4d-standard-8-lssd' | 'c4d' | 8 | '0'
}

// Data-driven check of isHyperdiskOnly(): each row pairs a machine type name
// with the boolean the selector is expected to return for it.
def 'should know when hyperdisk is required'() {
expect:
GoogleBatchMachineTypeSelector.INSTANCE.isHyperdiskOnly(TYPE) == EXPECTED

where:
TYPE | EXPECTED
// types expected to be reported as Hyperdisk-only
'c4-standard-8' | true
'c4a-standard-8' | true
'c4d-standard-8' | true
'n4-standard-8' | true
'n4a-standard-8' | true
'n4d-standard-8' | true
// types expected NOT to be reported as Hyperdisk-only
'n1-standard-8' | false
'n2-standard-8' | false
'e2-standard-8' | false
'c2-standard-8' | false
}

def 'should know when to install GPU drivers'() {
Expand All @@ -128,4 +161,34 @@ class GoogleBatchMachineTypeSelectorTest extends Specification {
'a3-highgpu-1g' | 0 | true
'g2-standard-4' | 0 | true
}

// Data-driven check of notConfigurableLocalSSD(): each row gives a machine type
// name, its family, and the expected result.
def 'should detect non-configurable local SSD'() {
expect:
// only the type and family fields are populated on the descriptor under test
final machineType = new MachineType(type: TYPE, family: FAMILY)
GoogleBatchMachineTypeSelector.INSTANCE.notConfigurableLocalSSD(machineType) == EXPECTED

where:
TYPE | FAMILY | EXPECTED
// c3/c3d with -lssd suffix → true
'c3-standard-8-lssd' | 'c3' | true
'c3d-standard-8-lssd' | 'c3d' | true
// c4/c4a/c4d with -lssd suffix → true
'c4-standard-8-lssd' | 'c4' | true
'c4a-standard-8-lssd' | 'c4a' | true
'c4d-standard-8-lssd' | 'c4d' | true
// a3 family → always true regardless of type
'a3-highgpu-8g' | 'a3' | true
'a3-megagpu-64g' | 'a3' | true
// a2-ultragpu- prefix → true regardless of family
'a2-ultragpu-1g' | 'a2' | true
'a2-ultragpu-8g' | 'a2' | true
// c3/c4 without -lssd suffix → false
'c3-standard-8' | 'c3' | false
'c4-standard-8' | 'c4' | false
// a2 non-ultragpu → false
'a2-highgpu-1g' | 'a2' | false
// unrelated families → false
'n2-standard-4' | 'n2' | false
'e2-standard-8' | 'e2' | false
}
}
Loading
Loading