From a233dc0d091b678856b9a1b04cc4e47d46a6eec8 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 29 Jan 2024 17:02:08 +0000 Subject: [PATCH 1/8] Add Aspera CLI download support to pipeline --- CHANGELOG.md | 10 +++ CITATIONS.md | 2 + README.md | 6 +- conf/test_full.config | 4 +- modules/local/aspera_cli/environment.yml | 7 ++ modules/local/aspera_cli/main.nf | 63 ++++++++++++++++ modules/local/aspera_cli/nextflow.config | 17 +++++ modules/local/aspera_cli/tests/main.nf.test | 37 ++++++++++ .../local/aspera_cli/tests/main.nf.test.snap | 71 +++++++++++++++++++ nextflow.config | 1 + nextflow_schema.json | 6 ++ workflows/sra/main.nf | 45 ++++++++---- workflows/sra/nextflow.config | 1 + workflows/sra/tests/main.nf.test | 2 +- .../sra_custom_ena_metadata_fields.nf.test | 1 + .../sra_nf_core_pipeline_atacseq.nf.test | 1 + .../tests/sra_nf_core_pipeline_rnaseq.nf.test | 1 + .../sra_nf_core_pipeline_taxprofiler.nf.test | 1 + .../sra_nf_core_pipeline_viralrecon.nf.test | 1 + 19 files changed, 260 insertions(+), 17 deletions(-) create mode 100644 modules/local/aspera_cli/environment.yml create mode 100644 modules/local/aspera_cli/main.nf create mode 100644 modules/local/aspera_cli/nextflow.config create mode 100644 modules/local/aspera_cli/tests/main.nf.test create mode 100644 modules/local/aspera_cli/tests/main.nf.test.snap diff --git a/CHANGELOG.md b/CHANGELOG.md index d512d75c..c35e92bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,6 +40,16 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements > > **NB:** Dependency has been **removed** if new version information isn't present. +### Parameters + +| Old parameter | New parameter | +| ------------- | ---------------------- | +| | `--force_ftp_download` | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if new parameter information isn't present. + ## [[1.11.0](https://github.com/nf-core/fetchngs/releases/tag/1.11.0)] - 2023-10-18 ### Credits diff --git a/CITATIONS.md b/CITATIONS.md index 482d5600..a982bfd2 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,8 @@ ## Pipeline tools +- [Aspera CLI](https://github.com/IBM/aspera-cli) + - [Python](http://www.python.org) - [Requests](https://docs.python-requests.org/) diff --git a/README.md b/README.md index 56ba2cb0..2442527c 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,10 @@ Via a single file of ids, provided one-per-line (see [example input file](https: 1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) 2. Fetch extensive id metadata via ENA API 3. Download FastQ files: - - If direct download links are available from the ENA API, fetch in parallel via `curl` and perform `md5sum` check - - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ + - If direct download links are available from the ENA API: + - Fetch in parallel via `aspera-cli` and perform `md5sum` check (default) + - Fetch in parallel via `wget` and perform `md5sum` check. Use `--force_ftp_download` to force this behaviour. + - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ. Use `--force_sratools_download` to force this behaviour. 4. Collate id metadata and paths to FastQ files in a single samplesheet ### Synapse ids diff --git a/conf/test_full.config b/conf/test_full.config index 9215d510..84595dfa 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,6 +14,6 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/bb634bcfef520552e8314dfa3f8a764e1d62f7dc/testdata/v1.12.0/sra_ids_test_full.csv' + // File containing SRA ids from nf-core/rnaseq -profile test_full for full-sized test + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/100736c99d87667fb7c247c267bc8acfac647bed/testdata/v1.12.0/sra_ids_rnaseq_test_full.csv' } diff --git a/modules/local/aspera_cli/environment.yml b/modules/local/aspera_cli/environment.yml new file mode 100644 index 00000000..9fbc162f --- /dev/null +++ b/modules/local/aspera_cli/environment.yml @@ -0,0 +1,7 @@ +name: aspera_cli +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::aspera-cli=4.14.0 diff --git a/modules/local/aspera_cli/main.nf b/modules/local/aspera_cli/main.nf new file mode 100644 index 00000000..cf47b188 --- /dev/null +++ b/modules/local/aspera_cli/main.nf @@ -0,0 +1,63 @@ +process ASPERA_CLI { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/aspera-cli:4.14.0--hdfd78af_1' : + 'biocontainers/aspera-cli:4.14.0--hdfd78af_1' }" + + input: + tuple val(meta), val(fastq) + val user + + output: + tuple val(meta), path("*fastq.gz"), emit: fastq + tuple val(meta), path("*md5") , emit: md5 + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + if (meta.single_end) { + """ + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[0]} \\ + ${meta.id}.fastq.gz + + echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 + md5sum -c ${meta.id}.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aspera_cli: \$(ascli --version) + END_VERSIONS + """ + } else { + """ + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[0]} \\ + ${meta.id}_1.fastq.gz + + echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 + md5sum -c ${meta.id}_1.fastq.gz.md5 + + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[1]} \\ + ${meta.id}_2.fastq.gz + + echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 + md5sum -c ${meta.id}_2.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aspera_cli: \$(ascli --version) + END_VERSIONS + """ + } +} \ No newline at end of file diff --git a/modules/local/aspera_cli/nextflow.config b/modules/local/aspera_cli/nextflow.config new file mode 100644 index 00000000..fa2dbd90 --- /dev/null +++ b/modules/local/aspera_cli/nextflow.config @@ -0,0 +1,17 @@ +process { + withName: 'ASPERA_CLI' { + ext.args = '-QT -l 300m -P33001' + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } +} diff --git a/modules/local/aspera_cli/tests/main.nf.test b/modules/local/aspera_cli/tests/main.nf.test new file mode 100644 index 00000000..d62dc94a --- /dev/null +++ b/modules/local/aspera_cli/tests/main.nf.test @@ -0,0 +1,37 @@ +nextflow_process { + + name "Test process: ASPERA_CLI" + script "../main.nf" + process "ASPERA_CLI" + + tag "ASPERA_CLI" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + + process { + """ + input[0] = [ + [ id:'SRX9626017_SRR13191702', single_end:false, md5_1: '89c5be920021a035084d8aeb74f32df7', md5_2: '56271be38a80db78ef3bdfc5d9909b98' ], // meta map + [ + 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_1.fastq.gz', + 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_2.fastq.gz' + ] + ] + input[1] = 'era-fasp' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/aspera_cli/tests/main.nf.test.snap b/modules/local/aspera_cli/tests/main.nf.test.snap new file mode 100644 index 00000000..15497822 --- /dev/null +++ b/modules/local/aspera_cli/tests/main.nf.test.snap @@ -0,0 +1,71 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "1": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "2": [ + "versions.yml:md5,a51a1dfc6308d71058ddc12c46101dd3" + ], + "fastq": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "md5": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "versions": [ + "versions.yml:md5,a51a1dfc6308d71058ddc12c46101dd3" + ] + } + ], + "timestamp": "2024-01-29T13:00:29.847293" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index b3c65b78..3aece1d7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,6 +17,7 @@ params { ena_metadata_fields = null sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description' synapse_config = null + force_ftp_download = false force_sratools_download = false skip_fastq_download = false dbgap_key = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 31d9f17a..bdf37cd7 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -54,6 +54,12 @@ "help_text": "The default is 'auto' which can be used with nf-core/rnaseq v3.10 onwards to auto-detect strandedness during the pipeline execution.", "default": "auto" }, + "force_ftp_download": { + "type": "boolean", + "fa_icon": "fas fa-tools", + "description": "Force download FASTQ files via FTP instead of via the Aspera CLI.", + "help_text": "If the Aspera CLI is not working on your infrastructure use this flag to force the pipeline to download data via FTP." + }, "force_sratools_download": { "type": "boolean", "fa_icon": "fas fa-tools", diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf index 62d8c577..80383085 100644 --- a/workflows/sra/main.nf +++ b/workflows/sra/main.nf @@ -8,6 +8,7 @@ include { MULTIQC_MAPPINGS_CONFIG } from '../../modules/local/multiqc_mappings_c include { SRA_FASTQ_FTP } from '../../modules/local/sra_fastq_ftp' include { SRA_IDS_TO_RUNINFO } from '../../modules/local/sra_ids_to_runinfo' include { SRA_RUNINFO_TO_FTP } from '../../modules/local/sra_runinfo_to_ftp' +include { ASPERA_CLI } from '../../modules/local/aspera_cli' include { SRA_TO_SAMPLESHEET } from '../../modules/local/sra_to_samplesheet' include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline' @@ -54,10 +55,11 @@ workflow SRA { .out .tsv .splitCsv(header:true, sep:'\t') - .map{ meta -> - def meta_clone = meta.clone() - meta_clone.single_end = meta_clone.single_end.toBoolean() - return meta_clone + .map { + meta -> + def meta_clone = meta.clone() + meta_clone.single_end = meta_clone.single_end.toBoolean() + return meta_clone } .unique() .set { ch_sra_metadata } @@ -65,16 +67,36 @@ workflow SRA { if (!params.skip_fastq_download) { ch_sra_metadata - .map { - meta -> - [ meta, [ meta.fastq_1, meta.fastq_2 ] ] - } .branch { - ftp: it[0].fastq_1 && !params.force_sratools_download - sra: !it[0].fastq_1 || params.force_sratools_download + meta -> + def download_method = 'aspera' + if (!meta.fastq_aspera || params.force_ftp_download) { + if (meta.fastq_1) { + download_method = 'ftp' + } + } + if ((!meta.fastq_aspera && !meta.fastq_1) || params.force_sratools_download) { + download_method = 'sratools' + } + + aspera: download_method == 'aspera' + return [ meta, meta.fastq_aspera.tokenize(';').take(2) ] + ftp: download_method == 'ftp' + return [ meta, [ meta.fastq_1, meta.fastq_2 ] ] + sratools: download_method == 'sratools' + return [ meta, meta.run_accession ] } .set { ch_sra_reads } + // + // MODULE: If Aspera link is provided in run information then download FastQ directly via Aspera CLI and validate with md5sums + // + ASPERA_CLI ( + ch_sra_reads.aspera, + 'era-fasp' + ) + ch_versions = ch_versions.mix(ASPERA_CLI.out.versions.first()) + // // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums // @@ -87,7 +109,7 @@ workflow SRA { // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools. // FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS ( - ch_sra_reads.sra.map { meta, reads -> [ meta, meta.run_accession ] }, + ch_sra_reads.sratools, params.dbgap_key ? file(params.dbgap_key, checkIfExists: true) : [] ) ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) @@ -157,7 +179,6 @@ workflow SRA { softwareVersionsToYAML(ch_versions) .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_fetchngs_software_mqc_versions.yml', sort: true, newLine: true) - emit: samplesheet = ch_samplesheet mappings = ch_mappings diff --git a/workflows/sra/nextflow.config b/workflows/sra/nextflow.config index a48bcb73..56d7814c 100644 --- a/workflows/sra/nextflow.config +++ b/workflows/sra/nextflow.config @@ -1,4 +1,5 @@ includeConfig "../../modules/local/multiqc_mappings_config/nextflow.config" +includeConfig "../../modules/local/aspera_cli/nextflow.config" includeConfig "../../modules/local/sra_fastq_ftp/nextflow.config" includeConfig "../../modules/local/sra_ids_to_runinfo/nextflow.config" includeConfig "../../modules/local/sra_runinfo_to_ftp/nextflow.config" diff --git a/workflows/sra/tests/main.nf.test b/workflows/sra/tests/main.nf.test index 8fcd0e47..062d6886 100644 --- a/workflows/sra/tests/main.nf.test +++ b/workflows/sra/tests/main.nf.test @@ -7,12 +7,12 @@ nextflow_workflow { // Dependencies tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" tag "SRA_TO_SAMPLESHEET" - test("Parameters: default") { when { diff --git a/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test b/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test index 0724b4fb..5fd4882b 100644 --- a/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test +++ b/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test @@ -9,6 +9,7 @@ nextflow_workflow { // Modules tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" diff --git a/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test index d792e09f..95069597 100644 --- a/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test +++ b/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test @@ -9,6 +9,7 @@ nextflow_workflow { // Modules tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" diff --git a/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test index 7f2603f6..99c7dbc5 100644 --- a/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test +++ b/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test @@ -9,6 +9,7 @@ nextflow_workflow { // Modules tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" diff --git a/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test index 7d4fcb23..d48a714e 100644 --- a/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test +++ b/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test @@ -9,6 +9,7 @@ nextflow_workflow { // Modules tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" diff --git a/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test index 89f8f370..f5802463 100644 --- a/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test +++ b/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test @@ -9,6 +9,7 @@ nextflow_workflow { // Modules tag "MULTIQC_MAPPINGS_CONFIG" + tag "ASPERA_CLI" tag "SRA_FASTQ_FTP" tag "SRA_IDS_TO_RUNINFO" tag "SRA_RUNINFO_TO_FTP" From f58c5682a6c602befe660d06271811375a806d34 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 29 Jan 2024 17:09:05 +0000 Subject: [PATCH 2/8] Fix ECLint --- CHANGELOG.md | 1 + modules/local/aspera_cli/main.nf | 2 +- modules/local/aspera_cli/tests/main.nf.test | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c35e92bc..fb853f96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ Thank you to everyone else that has contributed by reporting bugs, enhancements - [PR #246](https://github.com/nf-core/fetchngs/pull/246) - Handle dark/light mode for logo in GitHub README properly - [PR #248](https://github.com/nf-core/fetchngs/pull/248) - Update pipeline level test data path to use mirror on s3 - [PR #249](https://github.com/nf-core/fetchngs/pull/249) - Update modules which includes absolute paths for test data, making module level test compatible within the pipeline. +- [PR #259](https://github.com/nf-core/fetchngs/pull/259) - Add Aspera CLI download support to pipeline ([#68](https://github.com/nf-core/fetchngs/issues/68)) ### Software dependencies diff --git a/modules/local/aspera_cli/main.nf b/modules/local/aspera_cli/main.nf index cf47b188..412c618a 100644 --- a/modules/local/aspera_cli/main.nf +++ b/modules/local/aspera_cli/main.nf @@ -41,7 +41,7 @@ process ASPERA_CLI { -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ ${user}@${fastq[0]} \\ ${meta.id}_1.fastq.gz - + echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 md5sum -c ${meta.id}_1.fastq.gz.md5 diff --git a/modules/local/aspera_cli/tests/main.nf.test b/modules/local/aspera_cli/tests/main.nf.test index d62dc94a..3858e2c3 100644 --- a/modules/local/aspera_cli/tests/main.nf.test +++ b/modules/local/aspera_cli/tests/main.nf.test @@ -3,7 +3,7 @@ nextflow_process { name "Test process: ASPERA_CLI" script "../main.nf" process "ASPERA_CLI" - + tag "ASPERA_CLI" test("Should run without failures") { From 5f00ba81150a935a7c5d09ae4763e8e7b23575e2 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 29 Jan 2024 17:30:54 +0000 Subject: [PATCH 3/8] Mix Aspera CLI output channel with FTP and sratools reads --- workflows/sra/main.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf index 80383085..1e6c6e2f 100644 --- a/workflows/sra/main.nf +++ b/workflows/sra/main.nf @@ -115,9 +115,10 @@ workflow SRA { ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first()) // Isolate FASTQ channel which will be added to emit block - SRA_FASTQ_FTP + ASPERA_CLI .out .fastq + .mix(SRA_FASTQ_FTP.out.fastq) .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads) .map { meta, fastq -> From d538a2aaa8a2ec5733124b298d221a864062c718 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Mon, 29 Jan 2024 17:31:57 +0000 Subject: [PATCH 4/8] Fix ECLint --- modules/local/aspera_cli/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/aspera_cli/main.nf b/modules/local/aspera_cli/main.nf index 412c618a..de78a7b2 100644 --- a/modules/local/aspera_cli/main.nf +++ b/modules/local/aspera_cli/main.nf @@ -60,4 +60,4 @@ process ASPERA_CLI { END_VERSIONS """ } -} \ No newline at end of file +} From 888597053131b9f60c0340f6b5dadc3cbc380af7 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 30 Jan 2024 10:09:18 +0000 Subject: [PATCH 5/8] Update usage docs --- docs/usage.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 0cfb95d8..5c244223 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -66,6 +66,10 @@ You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. From v1.9 of this pipeline the default `strandedness` in the output samplesheet will be set to `auto` when using `--nf_core_pipeline rnaseq`. This will only work with v3.10 onwards of nf-core/rnaseq which permits the auto-detection of strandedness during the pipeline execution. You can change this behaviour with the `--nf_core_rnaseq_strandedness` parameter which is set to `auto` by default. +### Bypass Aspera data download + +If the appropriate download links are available, the pipeline uses the Aspera CLI by default to download FastQ files. If you are having issues and prefer to use FTP or sra-tools instead, you can use the [`--force_ftp_download`](https://nf-co.re/fetchngs/parameters#force_ftp_download) and [`--force_sratools_download`](https://nf-co.re/fetchngs/parameters#force_sratools_download) parameters, respectively. + ### Bypass `FTP` data download If FTP connections are blocked on your network use the [`--force_sratools_download`](https://nf-co.re/fetchngs/parameters#force_sratools_download) parameter to force the pipeline to download data using sra-tools instead of the ENA FTP. From 1f42b059a66ea22d163fb602b257125f1f25c464 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 30 Jan 2024 10:15:43 +0000 Subject: [PATCH 6/8] Fix pre-commit lint --- workflows/sra/main.nf | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf index 197cd658..6c657a68 100644 --- a/workflows/sra/main.nf +++ b/workflows/sra/main.nf @@ -55,7 +55,7 @@ workflow SRA { .out .tsv .splitCsv(header:true, sep:'\t') - .map { + .map { meta -> def meta_clone = meta.clone() meta_clone.single_end = meta_clone.single_end.toBoolean() @@ -68,15 +68,15 @@ workflow SRA { ch_sra_metadata .branch { - meta -> + meta -> def download_method = 'aspera' if (!meta.fastq_aspera || params.force_ftp_download) { - if (meta.fastq_1) { + if (meta.fastq_1) { download_method = 'ftp' } } - if ((!meta.fastq_aspera && !meta.fastq_1) || params.force_sratools_download) { - download_method = 'sratools' + if ((!meta.fastq_aspera && !meta.fastq_1) || params.force_sratools_download) { + download_method = 'sratools' } aspera: download_method == 'aspera' From 84c206602a0f99a0120b0f867e4397b2bfb79600 Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 30 Jan 2024 10:54:04 +0000 Subject: [PATCH 7/8] Add comment for contents of meta.fastq_aspera --- workflows/sra/main.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf index 6c657a68..35ae18dc 100644 --- a/workflows/sra/main.nf +++ b/workflows/sra/main.nf @@ -70,6 +70,9 @@ workflow SRA { .branch { meta -> def download_method = 'aspera' + // meta.fastq_aspera is a metadata string with ENA fasp links supported by Aspera + // For single-end: 'fasp.sra.ebi.ac.uk:/vol1/fastq/ERR116/006/ERR1160846/ERR1160846.fastq.gz' + // For paired-end: 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR130/020/SRR13055520/SRR13055520_1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/fastq/SRR130/020/SRR13055520/SRR13055520_2.fastq.gz' if (!meta.fastq_aspera || params.force_ftp_download) { if (meta.fastq_1) { download_method = 'ftp' From e33eef40f27689b7f768d1525122d4a33e3c288d Mon Sep 17 00:00:00 2001 From: Harshil Patel Date: Tue, 30 Jan 2024 11:48:03 +0000 Subject: [PATCH 8/8] Update usage docs for samples with more than 2 FastQ files --- docs/usage.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/docs/usage.md b/docs/usage.md index 5c244223..42d134f3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -66,6 +66,29 @@ You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. From v1.9 of this pipeline the default `strandedness` in the output samplesheet will be set to `auto` when using `--nf_core_pipeline rnaseq`. This will only work with v3.10 onwards of nf-core/rnaseq which permits the auto-detection of strandedness during the pipeline execution. You can change this behaviour with the `--nf_core_rnaseq_strandedness` parameter which is set to `auto` by default. +### Accessions with more than 2 FastQ files + +Using `SRR9320616` as an example, if we run the pipeline with default options to download via Aspera/FTP the ENA API indicates that this sample is associated with a single FastQ file: + +``` +run_accession experiment_accession sample_accession secondary_sample_accession study_accession secondary_study_accession submission_accession run_alias experiment_alias sample_alias study_alias library_layout library_selection library_source library_strategy library_name instrument_model instrument_platform base_count read_count tax_id scientific_name sample_title experiment_title study_title sample_description fastq_md5 fastq_bytes fastq_ftp fastq_galaxy fastq_aspera +SRR9320616 SRX6088086 SAMN12086751 SRS4989433 PRJNA549480 SRP201778 SRA900583 GSM3895942_r1 GSM3895942 GSM3895942 GSE132901 PAIRED cDNA TRANSCRIPTOMIC RNA-Seq Illumina HiSeq 2500 ILLUMINA 11857688850 120996825 10090 Mus musculus Old 3 Kidney Illumina HiSeq 2500 sequencing: GSM3895942: Old 3 Kidney Mus musculus RNA-Seq A murine aging cell atlas reveals cell identity and tissue-specific trajectories of aging Old 3 Kidney 98c939bbae1a1fcf9624905516485b67 7763114613 ftp.sra.ebi.ac.uk/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz fasp.sra.ebi.ac.uk:/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz +``` + +However, this sample actually has 2 additional FastQ files that are flagged as technical and can only be obtained by running sra-tools. This is particularly important for certain preps like 10x and others using UMI barcodes. + +``` +$ fasterq-dump --threads 6 --split-files --include-technical SRR9320616 --outfile SRR9320616.fastq --progress + +SRR9320616_1.fastq +SRR9320616_2.fastq +SRR9320616_3.fastq +``` + +This highlights that there is a discrepancy between the read data hosted on the ENA API and what can actually be fetched from sra-tools, where the latter seems to be the source of truth. If you anticipate that you may have more than 2 FastQ files per sample, it is recommended to use this pipeline with the `--force_sratools_download` parameter. + +See [issue #260](https://github.com/nf-core/fetchngs/issues/260) for more details. + ### Bypass Aspera data download If the appropriate download links are available, the pipeline uses the Aspera CLI by default to download FastQ files. If you are having issues and prefer to use FTP or sra-tools instead, you can use the [`--force_ftp_download`](https://nf-co.re/fetchngs/parameters#force_ftp_download) and [`--force_sratools_download`](https://nf-co.re/fetchngs/parameters#force_sratools_download) parameters, respectively.