Skip to content

Commit

Permalink
Improve gsutil transfer throughput
Browse files Browse the repository at this point in the history
This commit improves the transfer throughput of the gsutil
used by GLS jobs tuning the usage of GSUtil:parallel_thread_count
and GSUtil:sliced_object_download_max_components settings and using
as google/cloud-sdk:slim as base image instead of alpine.

More details here https://groups.google.com/g/nextflow/c/0L7HlD7mmL0
  • Loading branch information
pditommaso committed Jun 12, 2021
1 parent 1c9fc4a commit 2da120b
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 17 deletions.
2 changes: 2 additions & 0 deletions docs/google.rst
Expand Up @@ -134,6 +134,8 @@ google.lifeSciences.serviceAccountEmail Define the Google service account
google.lifeSciences.subnetwork Define the name of the subnetwork to attach the instance to must be specified here, when the specified network is configured for custom subnet creation. The value is prefixed with `regions/subnetworks/` unless it contains a `/`, in which case it is assumed to be a fully specified subnetwork resource URL. Requires version ``21.03.0-edge`` or later.
google.lifeSciences.sshDaemon When ``true`` runs SSH daemon in the VM carrying out the job to which it's possible to connect for debugging purposes (default: ``false``).
google.lifeSciences.sshImage The container image used to run the SSH daemon (default: ``gcr.io/cloud-genomics-pipelines/tools``).
google.lifeSciences.parallelThreadCount Defines the value for the option ``GSUtil:parallel_thread_count`` used by ``gsutil`` for transfer input and output data (default: ``1``).
google.lifeSciences.downloadMaxComponents Defines the value for the option ``GSUtil:sliced_object_download_max_components`` used by ``gsutil`` for transfer input and output data (default: ``8``).
============================================== =================


Expand Down
Expand Up @@ -40,12 +40,15 @@ import nextflow.util.MemoryUnit
@CompileStatic
class GoogleLifeSciencesConfig implements CloudTransferOptions {

public final static String DEFAULT_COPY_IMAGE = 'google/cloud-sdk:alpine'
public final static String DEFAULT_COPY_IMAGE = 'google/cloud-sdk:slim'

public final static String DEFAULT_SSH_IMAGE = 'gcr.io/cloud-genomics-pipelines/tools'

public final static String DEFAULT_ENTRY_POINT = '/bin/bash'

public final static int DEF_PARALLEL_THREAD_COUNT =1
public final static int DEF_DOWNLOAD_MAX_COMPONENTS =8

String project
List<String> zones
List<String> regions
Expand All @@ -64,6 +67,8 @@ class GoogleLifeSciencesConfig implements CloudTransferOptions {
String network
String subnetwork
String serviceAccountEmail
int parallelThreadCount
int downloadMaxComponents

int maxParallelTransfers = MAX_TRANSFER
int maxTransferAttempts = MAX_TRANSFER_ATTEMPTS
Expand Down Expand Up @@ -138,6 +143,9 @@ class GoogleLifeSciencesConfig implements CloudTransferOptions {
def regions = (config.navigate("google.region") as String)?.split(",")?.toList() ?: Collections.<String>emptyList()
def location = config.navigate("google.location") as String ?: fallbackToRegionOrZone(regions,zones)

def parallelThreadCount = config.navigate('google.lifeSciences.parallelThreadCount', DEF_PARALLEL_THREAD_COUNT) as int
def downloadMaxComponents = config.navigate('google.lifeSciences.downloadMaxComponents', DEF_DOWNLOAD_MAX_COMPONENTS) as int

new GoogleLifeSciencesConfig(
project: project,
regions: regions,
Expand All @@ -158,7 +166,9 @@ class GoogleLifeSciencesConfig implements CloudTransferOptions {
delayBetweenAttempts: delayBetweenAttempts,
network: network,
subnetwork: subnetwork,
serviceAccountEmail: serviceAccountEmail
serviceAccountEmail: serviceAccountEmail,
parallelThreadCount: parallelThreadCount,
downloadMaxComponents: downloadMaxComponents
)
}

Expand Down
Expand Up @@ -50,27 +50,31 @@ class GoogleLifeSciencesFileCopyStrategy extends SimpleFileCopyStrategy {
this.task = bean
}

protected makeGsEnv(GoogleLifeSciencesConfig cfg) {
def threadCount = cfg.parallelThreadCount ?: GoogleLifeSciencesConfig.DEF_PARALLEL_THREAD_COUNT
def maxComps = cfg.downloadMaxComponents ?: GoogleLifeSciencesConfig.DEF_DOWNLOAD_MAX_COMPONENTS
"""
# google storage helper
gs_opts=('-q' '-m' '-o' 'GSUtil:parallel_thread_count=$threadCount' '-o' 'GSUtil:sliced_object_download_max_components=$maxComps')
""".stripIndent()
}

String getBeforeStartScript() {
def maxConnect = config.maxParallelTransfers ?: CloudTransferOptions.MAX_TRANSFER
def attempts = config.maxTransferAttempts ?: CloudTransferOptions.MAX_TRANSFER_ATTEMPTS
def delayBetweenAttempts = config.delayBetweenAttempts ?: CloudTransferOptions.DEFAULT_DELAY_BETWEEN_ATTEMPTS

BashFunLib.body(maxConnect, attempts, delayBetweenAttempts) +

BashFunLib.body(maxConnect, attempts, delayBetweenAttempts) + makeGsEnv(config) +
'''
# google storage helper
nxf_gs_download() {
local source=$1
local target=$2
local project=$3
local basedir=$(dirname $2)
local ret
local opts="-m -q"
local opts
local opts=("${gs_opts[@]}")
if [[ $project ]]; then
opts=('-q' '-m' '-u' "$project")
else
opts=('-q' '-m')
opts+=('-u' "$project")
fi
## download assuming it's a file download
Expand All @@ -89,7 +93,7 @@ class GoogleLifeSciencesFileCopyStrategy extends SimpleFileCopyStrategy {
nxf_gs_upload() {
local name=$1
local target=$2
gsutil -m -q cp -R "$name" "$target/$name"
gsutil ${gs_opts[@]} cp -R "$name" "$target/$name"
}
'''.stripIndent()
}
Expand Down
Expand Up @@ -255,4 +255,17 @@ class GoogleLifeSciencesConfigTest extends Specification {

}

def 'should config thread count & max components' () {
when:
def config = GoogleLifeSciencesConfig.fromSession0([google:[project:'foo', region:'x', lifeSciences: [:]]])
then:
config.parallelThreadCount == 1
config.downloadMaxComponents == 8

when:
config = GoogleLifeSciencesConfig.fromSession0([google:[project:'foo', region:'x', lifeSciences: [parallelThreadCount: 10, downloadMaxComponents: 20]]])
then:
config.parallelThreadCount == 10
config.downloadMaxComponents == 20
}
}
Expand Up @@ -61,18 +61,17 @@ nxf_parallel() {
}

# google storage helper
gs_opts=('-q' '-m' '-o' 'GSUtil:parallel_thread_count=1' '-o' 'GSUtil:sliced_object_download_max_components=8')

nxf_gs_download() {
local source=$1
local target=$2
local project=$3
local basedir=$(dirname $2)
local ret
local opts="-m -q"
local opts
local opts=("${gs_opts[@]}")
if [[ $project ]]; then
opts=('-q' '-m' '-u' "$project")
else
opts=('-q' '-m')
opts+=('-u' "$project")
fi

## download assuming it's a file download
Expand All @@ -91,7 +90,7 @@ nxf_gs_download() {
nxf_gs_upload() {
local name=$1
local target=$2
gsutil -m -q cp -R "$name" "$target/$name"
gsutil ${gs_opts[@]} cp -R "$name" "$target/$name"
}

nxf_sleep() {
Expand Down

0 comments on commit 2da120b

Please sign in to comment.