diff --git a/.travis.yml b/.travis.yml index 83a7bfdea..4fbfb7db0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,6 +50,8 @@ script: - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_trim --saveReference # Run the basic pipeline with paired end data without adapterRemoval - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --skip_adapterremoval --saveReference + # Run the basic pipeline with output unmapped reads as fastq + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --strip_input_fastq # Run the same pipeline testing optional step: fastp, complexity - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker --pairedEnd --complexity_filter --bwa_index results/reference_genome/bwa_index/bwa_index/ # Test BAM Trimming diff --git a/CHANGELOG.md b/CHANGELOG.md index 4901be67c..7e0895192 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` +* [#189](https://github.com/nf-core/eager/pull/189) - Outputing unmapped reads in a fastq files with the --strip_input_fastq flag * [#186](https://github.com/nf-core/eager/pull/186) - Make FastQC skipping [possible] /(https://github.com/nf-core/eager/issues/182) * Merged in [nf-core/tools](https://github.com/nf-core/tools) release V1.6 template changes diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py new file mode 100755 index 000000000..fd4346ac8 --- /dev/null +++ b/bin/extract_map_reads.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 + +import argparse +import multiprocessing +import pysam +from functools import partial +import gzip +import sys + + +def _get_args(): + '''This function parses and return arguments passed in''' + parser = argparse.ArgumentParser( + prog='extract_mapped_reads', + formatter_class=argparse.RawDescriptionHelpFormatter, + description=f''' +Remove mapped in bam file from fastq files + ''') + parser.add_argument('bam_file', 
help="path to bam file") + parser.add_argument('fwd', help='path to forward fastq file') + parser.add_argument( + '-2', + dest="rev", + default=None, + help="path to forward fastq file") + parser.add_argument( + '-of', + dest="out_fwd", + default=None, + help="path to forward output fastq file") + parser.add_argument( + '-or', + dest="out_rev", + default=None, + help="path to forward output fastq file") + parser.add_argument( + '-m', + dest='mode', + default='strip', + help='Read removal mode: remove reads (strip) or replace sequence by N (replace)' + ) + parser.add_argument( + '-p', + dest='process', + default=4, + help='Number of parallel processes' + ) + + args = parser.parse_args() + + bam = args.bam_file + in_fwd = args.fwd + in_rev = args.rev + out_fwd = args.out_fwd + out_rev = args.out_rev + mode = args.mode + proc = int(args.process) + + return(bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc) + + +def extract_mapped_chr(chr, bam): + """ + Get mapped reads per chromosome + INPUT: + - chr(str): chromosome + - bam(str): bamfile path + OUTPUT: + - res(list): list of mapped reads (str) name per chromosome + """ + res = [] + bamfile = pysam.AlignmentFile(bam, "rb") + reads = bamfile.fetch(chr, multiple_iterators=True) + for read in reads: + res.append(read.query_name) + return(res) + + +def extract_mapped(bam, processes): + """ + Get mapped reads in parallel + INPUT: + - bam(str): bamfile path + OUTPUT: + - result(list) list of mapped reads name (str) + """ + try: + bamfile = pysam.AlignmentFile(bam, "rb") + chrs = bamfile.references + except ValueError as e: + print(e) + extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam) + p = multiprocessing.Pool(processes) + res = p.map(extract_mapped_chr_partial, chrs) + p.close() + p.join() + result = [i for ares in res for i in ares] + return(result) + + +def parse_fq(fq): + """ + Parse a FASTQ file + INPUT: + - fq(str): path to fastq file + OUTPUT: + - fqd(dict): dictionary with read names as keys, seq 
and quality as values + in a list + """ + + def get_fq_reads(allreads): + fqd = {} + myflag = True + for line in allreads: + line = line.decode('utf-8').rstrip() + if myflag == True: + instrument = line.split()[0].split(":")[0] + myflag = False + if line.startswith(instrument): + seqname = line[1:].split()[0] + fqd[seqname] = [] + continue + else: + fqd[seqname].append(line) + return(fqd) + + if fq.endswith('.gz'): + with gzip.open(fq, 'rb') as allreads: + fqd = get_fq_reads(allreads) + else: + with open(fq, 'r') as allreads: + fqd = get_fq_reads(allreads) + + return(fqd) + + +def sort_mapped(fq_dict, mapped_reads): + """ + Sort mapped reads from dictionary of fastq reads + INPUT: + - fq_dict(dict) dictionary with read names as keys, seq and quality as values + in a list + - mapped_reads(list) list of mapped reads + OUTPUT: + - mfqd(dict) dictionary with mapped read names as keys, seq and quality as values + in a list + - fqd(dict) dictionary with unmapped read names as key, unmapped/mapped (u|m), + seq and quality as values in a list + """ + fqd = {} + unmapped = [i for i in list(fq_dict.keys()) if i not in mapped_reads] + mapped = [i for i in list(fq_dict.keys()) if i in mapped_reads] + # print(unmap) + for r in unmapped: + fqd[r] = ['u']+fq_dict[r] + for r in mapped: + fqd[r] = ['m']+fq_dict[r] + + return(fqd) + + +def write_fq(fq_dict, fname, mode): + """ + Write to fastq file + INPUT: + - fq_dict(dict) dictionary with unmapped read names as keys, seq and quality as values + in a list + - fname(string) Path to output fastq file + - mode(string) strip (remove read) or replace (replace read sequence) by Ns + """ + + if fname.endswith('.gz'): + with gzip.open(fname, 'wb') as f: + for k in list(fq_dict.keys()): + if mode == 'strip': + # if unmapped, write all the read lines + if fq_dict[k][0] == 'u': + f.write(f"@{k}\n".encode()) + for i in fq_dict[k][1:]: + f.write(f"{i}\n".encode()) + # if mapped, do not write the read lines + elif fq_dict[k][0] == 'm': + 
continue + + elif mode == 'replace': + # if unmapped, write all the read lines + if fq_dict[k][0] == 'u': + f.write(f"@{k}\n".encode()) + for i in fq_dict[k][1:]: + f.write(f"{i}\n".encode()) + # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[k][0] == 'm': + f.write(f"@{k}\n".encode()) + f.write(f"{'N'*len(fq_dict[k][1])}\n".encode()) + for i in fq_dict[k][2:]: + f.write(f"{i}\n".encode()) + + else: + with open(fname, 'w') as f: + for k in list(fq_dict.keys()): + if mode == 'strip': + if fq_dict[k][0] == 'u': + f.write(f"@{k}\n") + for i in fq_dict[k][1:]: + f.write(f"{i}\n") + elif fq_dict[k][0] == 'm': + continue + elif mode == 'replace': + if fq_dict[k][0] == 'u': + f.write(f"@{k}\n") + for i in fq_dict[k][1:]: + f.write(f"{i}\n") + elif fq_dict[k][0] == 'm': + f.write(f"@{k}\n") + f.write(f"{'N'*len(fq_dict[k][1])}\n") + for i in fq_dict[k][2:]: + f.write(f"{i}\n") + + +if __name__ == "__main__": + BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args() + + if IN_REV and not OUT_REV: + print('You specified an input reverse fastq, but no output reverse fastq') + sys.exit(1) + + if OUT_FWD == None: + out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz" + else: + out_fwd = OUT_FWD + + mapped_reads = extract_mapped(BAM, PROC) + fwd_dict = parse_fq(IN_FWD) + fwd_reads = sort_mapped(fwd_dict, mapped_reads) + write_fq(fwd_reads, out_fwd, MODE) + if IN_REV: + if OUT_REV == None: + out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz" + else: + out_rev = OUT_REV + rev_dict = parse_fq(IN_REV) + rev_reads = sort_mapped(rev_dict, mapped_reads) + write_fq(rev_reads, out_rev, MODE) diff --git a/conf/base.config b/conf/base.config index d72dde862..11b585c68 100644 --- a/conf/base.config +++ b/conf/base.config @@ -64,6 +64,11 @@ process { withName: damageprofiler { errorStrategy = 'ignore' } + + withName: extract_unmapped_reads { + cpus = { check_max(8 * task.attempt, 'cpus') } + memory = { check_max( 8.GB 
* task.attempt, 'memory' ) } + } } params { diff --git a/docs/usage.md b/docs/usage.md index de8212695..f34b9c3ca 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -501,6 +501,18 @@ Default set to `1` and clipps off one base of the left or right side of reads. N By default, nf-core/eager uses hard clipping and sets clipped bases to `N` with quality `!` in the BAM output. Turn this on to use soft-clipping instead, masking reads at the read ends respectively using the CIGAR string. +## Mapped reads Stripping + +These parameters are used for removing mapped reads from orginal fastq files, usually in the context of uploading the original fastq files to a read archive (SRA/ENA) + +### `--strip_input_fastq` + +Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. for public upload of privacy sensitive non-host data) + +### `--strip_mode` + +Read removal mode. Strip mapped reads completely (strip) or just replace mapped reads sequence by N (replace) + ## Library-Type Parameters These parameters are required in some cases, e.g. when performing in-solution SNP capture protocols (390K,1240K, ...) for population genetics for example. Make sure to specify the required parameters in such cases. diff --git a/environment.yml b/environment.yml index 48897437f..ac098ba46 100644 --- a/environment.yml +++ b/environment.yml @@ -28,4 +28,6 @@ dependencies: - bioconda::fastp=0.19.7 - bioconda::bamutil=1.0.14 - bioconda::mtnucratio=0.5 + - pysam=0.15.2 + - python=3.6 #Missing Schmutzi,snpAD diff --git a/main.nf b/main.nf index 967de9448..cec5c963f 100644 --- a/main.nf +++ b/main.nf @@ -106,7 +106,11 @@ def helpMessage() { --bamutils_clip_left / --bamutils_clip_right Specify the number of bases to clip off reads --bamutils_softclip Use softclip instead of hard masking - Other options: + Stripping + --strip_input_fastq Create pre-Adapter Removal FASTQ files without reads that mapped to reference (e.g. 
for public upload of privacy sensitive non-host data) + --strip_mode Read removal mode. Strip mapped reads completely (strip) or just replace mapped reads sequence by N (replace) + + Other options: --outdir The output directory where the results will be saved --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits --plaintext_email Receive plain text emails rather than HTML @@ -216,6 +220,14 @@ params.bamutils_clip_left = 1 params.bamutils_clip_right = 1 params.bamutils_softclip = false +//unmap + +params.strip_input_fastq = false +params.strip_mode = 'strip' + + +ch_multiqc_config = Channel.fromPath(params.multiqc_config) +ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") Channel.fromPath("$baseDir/assets/where_are_my_files.txt") .into{ ch_where_for_bwa_index; ch_where_for_fasta_index; ch_where_for_seqdict} @@ -268,6 +280,21 @@ if (params.skip_collapse && params.singleEnd){ exit 1, "--skip_collapse can only be set for pairedEnd samples!" } +//AWSBatch sanity checking +if(workflow.profile == 'awsbatch'){ + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + if (!workflow.workDir.startsWith('s3') || !params.outdir.startsWith('s3')) exit 1, "Specify S3 URLs for workDir and outdir parameters on AWSBatch!" +} + +//Strip mode sanity checking +if (params.strip_input_fastq){ + if (!(['strip','replace'].contains(params.strip_mode))) { + exit 1, "--strip_mode can only be set to strip or replace" + } +} + + + // Has the run name been specified by the user? // this has the bonus effect of catching both -name and --name custom_runName = params.name @@ -300,14 +327,14 @@ if( params.readPaths ){ .from( params.readPaths ) .map { row -> [ row[0], [ file( row[1][0] ) ] ] } .ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" 
} - .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g } + .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g ; ch_read_unmap} ch_bam_to_fastq_convert = Channel.empty() } else if (!params.bam){ Channel .from( params.readPaths ) .map { row -> [ row[0], [ file( row[1][0] ), file( row[1][1] ) ] ] } .ifEmpty { exit 1, "params.readPaths or params.bams was empty - no input files supplied!" } - .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g } + .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap } ch_bam_to_fastq_convert = Channel.empty() } else { Channel @@ -327,7 +354,7 @@ if( params.readPaths ){ .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs" + "to be enclosed in quotes!\nNB: Path requires at least one * wildcard!\nIf this is single-end data, please specify --singleEnd on the command line." } - .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g } + .into { ch_read_files_clip; ch_read_files_fastqc; ch_read_files_complexity_filter_poly_g; ch_read_unmap } ch_bam_to_fastq_convert = Channel.empty() } else { Channel @@ -358,6 +385,10 @@ if(params.bwa_index) summary['BWA Index'] = params.bwa_index summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' summary['Skip Collapsing'] = params.skip_collapse ? 'Yes' : 'No' summary['Skip Trimming'] = params.skip_trim ? 'Yes' : 'No' +summary['Output stripped fastq'] = params.strip_input_fastq ? 
'Yes' : 'No' +if (params.strip_input_fastq){ + summary['Strip mode'] = params.strip_mode +} summary['Max Memory'] = params.max_memory summary['Max CPUs'] = params.max_cpus summary['Max Time'] = params.max_time @@ -696,7 +727,7 @@ process bwa { output: - file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler + file "*.sorted.bam" into ch_mapped_reads_idxstats,ch_mapped_reads_filter,ch_mapped_reads_preseq, ch_mapped_reads_damageprofiler, ch_bwa_mapped_reads_strip file "*.{bai,csi}" into ch_bam_index_for_damageprofiler @@ -765,7 +796,7 @@ process circularmapper{ file index from ch_circularmapper_indices.first() output: - file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm + file "*.sorted.bam" into ch_mapped_reads_idxstats_cm,ch_mapped_reads_filter_cm,ch_mapped_reads_preseq_cm, ch_mapped_reads_damageprofiler_cm, ch_circular_mapped_reads_strip file "*.{bai,csi}" script: @@ -808,7 +839,7 @@ process bwamem { file index from ch_bwa_index_bwamem.first() output: - file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler + file "*.sorted.bam" into ch_bwamem_mapped_reads_idxstats,ch_bwamem_mapped_reads_filter,ch_bwamem_mapped_reads_preseq, ch_bwamem_mapped_reads_damageprofiler, ch_bwamem_mapped_reads_strip file "*.{bai,csi}" @@ -911,6 +942,38 @@ process samtools_filter { } } +process strip_input_fastq { + tag "${bam.baseName}" + publishDir "${params.outdir}/samtools/stripped_fastq", mode: 'copy' + + when: params.strip_input_fastq + + input: + set val(name), file(fq) from ch_read_unmap + file bam from ch_bwa_mapped_reads_strip.mix(ch_circular_mapped_reads_strip, ch_bwamem_mapped_reads_strip) + + output: + file "*.fq.gz" into unmapped_fq_ch + + + script: + if (params.singleEnd) { + out_fwd = bam.baseName+'.stripped.fq.gz' 
+ """ + samtools index $bam + extract_map_reads.py $bam ${fq[0]} -of $out_fwd -p ${task.cpus} + """ + } else { + out_fwd = bam.baseName+'.stripped.fwd.fq.gz' + out_rev = bam.baseName+'.stripped.rev.fq.gz' + """ + samtools index $bam + extract_map_reads.py $bam ${fq[0]} -2 ${fq[1]} -of $out_fwd -or $out_rev -p ${task.cpus} + """ + } + +} + /* Step 6: DeDup / MarkDuplicates