From b76988e69f1fbbf8334deb94fd98e1f840465f42 Mon Sep 17 00:00:00 2001 From: maxibor Date: Sun, 22 Dec 2019 18:33:33 +0100 Subject: [PATCH 1/8] add dependencies for fastq parsing and parallel python gzip --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index a1eb6c206..be57fbf86 100644 --- a/environment.yml +++ b/environment.yml @@ -35,4 +35,6 @@ dependencies: - bioconda::sexdeterrmine=1.1.1 - bioconda::multivcfanalyzer=0.85.1 - bioconda::hops=0.33 + - conda-forge::biopython=1.75 + - conda-forge::xopen=0.8.4 #Missing Schmutzi,snpAD From 8f84a833ea836a8af8a213ff38f4d1d8abcb4e54 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 6 Jan 2020 11:17:46 +0100 Subject: [PATCH 2/8] refactor extract_map_reads --- bin/extract_map_reads.py | 307 +++++++++++++++++++++++++------------- dockerdir/Dockerfile | 6 + dockerdir/environment.yml | 40 +++++ main.nf | 11 +- 4 files changed, 252 insertions(+), 112 deletions(-) create mode 100644 dockerdir/Dockerfile create mode 100644 dockerdir/environment.yml diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py index 48358fe23..f21d9cfd6 100755 --- a/bin/extract_map_reads.py +++ b/bin/extract_map_reads.py @@ -3,9 +3,11 @@ import argparse import multiprocessing import pysam +from xopen import xopen from functools import partial -import gzip import sys +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from io import StringIO def _get_args(): @@ -57,34 +59,36 @@ def _get_args(): return(bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc) -def extract_mapped_chr(chr, bam): +def extract_mapped_chr(chr): """ Get mapped reads per chromosome - INPUT: - - chr(str): chromosome - - bam(str): bamfile path - OUTPUT: - - res(list): list of mapped reads (str) name per chromosome + Args: + chr(str): chromosome + bam(str): bamfile path + Returns: + res(list): list of mapped reads (str) name per chromosome """ res = [] - bamfile = pysam.AlignmentFile(bam, "rb") - reads = 
bamfile.fetch(chr, multiple_iterators=True) + reads = BAMFILE.fetch(chr, multiple_iterators=True) for read in reads: - res.append(read.query_name) + if read.is_unmapped == False: + if read.query_name.startswith("M_"): + read_name = read.query_name.replace( + "M_", "").split()[0].split("/")[0] + else: + read_name = read.query_name.split()[0].split("/")[0] + res.append(read_name) return(res) -def extract_mapped(bam, processes): - """ - Get mapped reads in parallel - INPUT: - - bam(str): bamfile path - OUTPUT: - - result(list) list of mapped reads name (str) +def extract_mapped(proc): + """Get mapped reads in parallel + Returns: + bamfile(str): path to bam alignment file + result(list): list of mapped reads name (str) """ try: - bamfile = pysam.AlignmentFile(bam, "rb") - chrs = bamfile.references + chrs = BAMFILE.references except ValueError as e: print(e) @@ -93,50 +97,50 @@ def extract_mapped(bam, processes): return([]) # Checking that nb_process is not > nb_chromosomes - elif len(chrs) < processes: - print( - f"""Requested {processes} processe(s), - but can only be parallelized on {len(chrs)} - processes with these data""") - processes = len(chrs) - - extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam) - p = multiprocessing.Pool(processes) - res = p.map(extract_mapped_chr_partial, chrs) - p.close() - p.join() - result = [i for ares in res for i in ares] + proc = min(proc, len(chrs)) + with multiprocessing.Pool(proc) as p: + res = p.map(extract_mapped_chr, chrs) + result = [i for ares in res for i in ares if len(i) > 0] return(result) def parse_fq(fq): - """ - Parse a FASTQ file - INPUT: - - fq(str): path to fastq file - OUTPUT: - - fqd(dict): dictionary with read names as keys, seq and quality as values + """Parse a FASTQ file + Args: + fq(str): path to fastq file + Returns: + fqd(dict): dictionary with read names as keys, seq and quality as values in a list """ - def get_fq_reads(allreads): - fqd = {} - myflag = True - for line in allreads: - line = 
line.decode('utf-8').rstrip() - if myflag == True: - instrument = line.split()[0].split(":")[0] - myflag = False - if line.startswith(instrument): - seqname = line[1:].split()[0] - fqd[seqname] = [] - continue - else: - fqd[seqname].append(line) - return(fqd) + read_dict = {} + for title, seq, qual in FastqGeneralIterator(allreads): + # NEED TO ONLY KEEP THE FIRST PART OF THE FASTQ READ NAME FOR CROSS + # REFERENCING WITH BAM FILE: ONLY THIS INFORMATION IS KEPT WHEN + # COLLAPSING READS WITH ADAPTERREMOVAL + + # Until fastq format 1.8 + # Split after slash + # @HWUSI-EAS100R:6:73:941:1973#0/1 + suf_title = "" + title_space = title.split() + if len(title_space) > 1: + title = title_space[0] + suf_title = f" {title_space[1]}" + + # From fastq format 1.8 + # Split after space + # @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG + title_slash = title.split("/") + if len(title_slash) > 1: + title = title_slash[0] + suf_title = f"/{title_slash[1]}" + + read_dict[title] = [suf_title, seq, "+", qual] + return(read_dict) if fq.endswith('.gz'): - with gzip.open(fq, 'rb') as allreads: + with xopen(fq, 'r') as allreads: fqd = get_fq_reads(allreads) else: with open(fq, 'r') as allreads: @@ -145,83 +149,176 @@ def get_fq_reads(allreads): return(fqd) -def sort_mapped(fq_dict, mapped_reads): - """ - Sort mapped reads from dictionary of fastq reads - INPUT: - - fq_dict(dict) dictionary with read names as keys, seq and quality as values +def get_mapped_reads(fq_dict, mapped_reads): + """Sort mapped reads from dictionary of fastq reads + Args: + fq_dict(dict) dictionary with read names as keys, seq and quality as values in a list - - mapped_reads(list) list of mapped reads - OUTPUT: - - mfqd(dict) dictionary with mapped read names as keys, seq and quality as values - in a list - - fqd(dict) dictionary with unmapped read names as key, unmapped/mapped (u|m), + mapped_reads(list) list of mapped reads + Returns: + fqd(dict) dictionary with read names as key, unmapped/mapped (u|m), 
seq and quality as values in a list """ + + def intersection(list1, list2): + return(set(list1).intersection(list2)) + + def difference(list1, list2): + return(set(list1).difference(list2)) + fqd = {} - unmapped = [i for i in list(fq_dict.keys()) if i not in mapped_reads] - mapped = [i for i in list(fq_dict.keys()) if i in mapped_reads] - # print(unmap) - for r in unmapped: - fqd[r] = ['u']+fq_dict[r] - for r in mapped: - fqd[r] = ['m']+fq_dict[r] + all_reads = list(fq_dict.keys()) + mapped = intersection(all_reads, mapped_reads) + unmapped = difference(all_reads, mapped_reads) + + for rm in mapped: + fqd[rm] = ['m']+fq_dict[rm] + for ru in unmapped: + fqd[ru] = ['u']+fq_dict[ru] return(fqd) -def write_fq(fq_dict, fname, mode): +def prepare_fq(fq_dict_key, fq_dict, write_mode, strip_mode): + """Prepare fastq for writing + + Args: + fq_dict_key(str): dictionary key of fq_dict + fq_dict(dict): fq_dict with unmapped read names as keys, + unmapped/mapped (u|m), seq, and quality as values in a list + write_mode (str): 'rb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns """ - Write to fastq file - INPUT: - - fq_dict(dict) dictionary with unmapped read names as keys, - unmapped/mapped (u|m), seq, and quality as values in a list - - fname(string) Path to output fastq file - - mode(string) strip (remove read) or replace (replace read sequence) by Ns + wstring = "" + if strip_mode == 'strip': + # print("strip") + # if unmapped, write all the read lines + # print(fq_dict_key) + print(fq_dict[fq_dict_key]) + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, do not write the read lines + print(wstring) + else: + return("") + + else: + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" 
+ # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + + if write_mode == 'wb': + return(wstring.encode()) + else: + return(wstring) + + +def prepare_fq_multi(fq_dict, write_mode, strip_mode, proc): + """Multiprocess of prepare fastq for writing + + Args: + fq_dict(dict): fq_dict with unmapped read names as keys, + unmapped/mapped (u|m), seq, and quality as values in a list + write_mode (str): [description] + write_mode (str): 'wb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns + proc(int): number of processes + Returns: + res(list): list of fastq entries + """ + fq_dict_keys = list(fq_dict.keys()) + prep_fq = partial(prepare_fq, fq_dict=fq_dict, + write_mode=write_mode, strip_mode=strip_mode) + with multiprocessing.Pool(proc) as p: + res = p.map(prep_fq, fq_dict_keys) + return(res) + + +def write_fq(write_list, fname, write_mode, proc): + """Write to fastq file + Args: + write_list(list): list of fastq entries + fname(string) Path to output fastq file + write_mode (str): 'rb' or 'r' + proc(int) number of processes """ - with open(fname, 'w') as f: - for k in list(fq_dict.keys()): - if mode == 'strip': - if fq_dict[k][0] == 'u': - f.write(f"@{k}\n") - for i in fq_dict[k][1:]: - f.write(f"{i}\n") - elif fq_dict[k][0] == 'm': + if write_mode == 'wb': + with xopen(fname, mode='wb', threads=proc) as fw: + for entry in write_list: + if len(entry) == 0: + continue + else: + fw.write(entry) + else: + with open(fname, 'w') as fw: + for entry in write_list: + if len(entry) == 0: continue - elif mode == 'replace': - if fq_dict[k][0] == 'u': - f.write(f"@{k}\n") - for i in fq_dict[k][1:]: - f.write(f"{i}\n") - elif fq_dict[k][0] == 'm': - f.write(f"@{k}\n") - f.write(f"{'N'*len(fq_dict[k][1])}\n") 
- for i in fq_dict[k][2:]: - f.write(f"{i}\n") + else: + fw.write(entry) def check_strip_mode(mode): if mode.lower() not in ['replace', 'strip']: print(f"Mode must be {' or '.join(mode)}") + return(mode.lower()) if __name__ == "__main__": BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args() if OUT_FWD == None: - out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq" + out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz" else: out_fwd = OUT_FWD - mapped_reads = extract_mapped(BAM, PROC) - fwd_dict = parse_fq(IN_FWD) - fwd_reads = sort_mapped(fwd_dict, mapped_reads) - write_fq(fwd_reads, out_fwd, MODE) + if out_fwd.endswith(".gz"): + write_mode = "wb" + else: + write_mode = "w" + + strip_mode = check_strip_mode(MODE) + BAMFILE = pysam.AlignmentFile(BAM, 'r') + + # FORWARD OR SE FILE + print(f"- Extracting mapped reads from {BAM}") + mapped_reads = extract_mapped(proc=PROC) + print(f"- Parsing forward fq file {IN_FWD}") + fqd_fwd = parse_fq(IN_FWD) + print("- Cross referencing mapped reads in forward fq") + fq_dict_fwd = get_mapped_reads(fqd_fwd, mapped_reads) + # print(fq_dict_fwd) + print("- Preparing to write forward fq") + fq_write_fwd = prepare_fq_multi( + fq_dict_fwd, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) + print(f"- Writing forward fq to {out_fwd}") + write_fq(write_list=fq_write_fwd, fname=out_fwd, + write_mode=write_mode, proc=PROC) + + # REVERSE FILE if IN_REV: if OUT_REV == None: - out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq" + out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz" else: out_rev = OUT_REV - rev_dict = parse_fq(IN_REV) - rev_reads = sort_mapped(rev_dict, mapped_reads) - write_fq(rev_reads, out_rev, MODE) + print(f"- Parsing reverse fq file {IN_REV}") + fqd_rev = parse_fq(IN_REV) + print("- Cross referencing mapped reads in reverse fq") + fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads) + print("- Preparing to write reverse fq") + fq_write_rev = prepare_fq_multi( + fq_dict_rev, 
write_mode=write_mode, strip_mode=strip_mode, proc=PROC) + print(f"- Writing reverse fq to {out_rev}") + write_fq(write_list=fq_write_rev, fname=out_rev, + write_mode=write_mode, proc=PROC) diff --git a/dockerdir/Dockerfile b/dockerdir/Dockerfile new file mode 100644 index 000000000..7bd109a7d --- /dev/null +++ b/dockerdir/Dockerfile @@ -0,0 +1,6 @@ +FROM nfcore/base:1.7 + +LABEL description="Docker image containing all requirements for nf-core/eager pipeline" +COPY environment.yml / +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/nf-core-eager-2.1.0dev/bin:$PATH diff --git a/dockerdir/environment.yml b/dockerdir/environment.yml new file mode 100644 index 000000000..be57fbf86 --- /dev/null +++ b/dockerdir/environment.yml @@ -0,0 +1,40 @@ +name: nf-core-eager-2.1.0dev +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - conda-forge::openjdk=8.0.144 + - bioconda::fastqc=0.11.8 + - bioconda::adapterremoval=2.3.1 + - bioconda::adapterremovalfixprefix=0.0.5 + - bioconda::bwa=0.7.17 + - bioconda::picard=2.21.4 + - bioconda::samtools=1.9 + - bioconda::dedup=0.12.5 + - bioconda::angsd=0.931 + - bioconda::circularmapper=1.93.4 + - bioconda::gatk4=4.1.4.1 + - bioconda::qualimap=2.2.2d + - bioconda::vcf2genome=0.91 + - bioconda::damageprofiler=0.4.9 + - bioconda::multiqc=1.8 + - bioconda::pmdtools=0.60 + - bioconda::bedtools=2.29.0 + - conda-forge::r-rmarkdown=1.18 + - conda-forge::libiconv=1.15 + - conda-forge::pigz=2.3.4 + - bioconda::sequencetools=1.2.2 + - bioconda::preseq=2.0.3 + - bioconda::fastp=0.20.0 + - bioconda::bamutil=1.0.14 + - bioconda::mtnucratio=0.7 + - pysam=0.15.3 + - python=3.7.1 + - bioconda::freebayes=1.3.1 + - bioconda::sexdeterrmine=1.1.1 + - bioconda::multivcfanalyzer=0.85.1 + - bioconda::hops=0.33 + - conda-forge::biopython=1.75 + - conda-forge::xopen=0.8.4 + #Missing Schmutzi,snpAD diff --git a/main.nf b/main.nf index e9b05187d..1923632c4 100644 --- a/main.nf +++ b/main.nf @@ -1292,20 +1292,17 
@@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq' + out_fwd = bam.baseName+'.stripped.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq' - out_rev = bam.baseName+'.stripped.rev.fq' + out_fwd = bam.baseName+'.stripped.fwd.fq.gz' + out_rev = bam.baseName+'.stripped.rev.fq.gz' """ samtools index $bam - extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd - pigz -p ${task.cpus} $out_rev + extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} """ } From f09ecbf2a6802f874b781e7e2ef61ef457299d55 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 6 Jan 2020 13:09:15 +0100 Subject: [PATCH 3/8] back to pigz --- main.nf | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 1923632c4..e9b05187d 100644 --- a/main.nf +++ b/main.nf @@ -1292,17 +1292,20 @@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq.gz' + out_fwd = bam.baseName+'.stripped.fq' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} + pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq.gz' - out_rev = bam.baseName+'.stripped.rev.fq.gz' + out_fwd = bam.baseName+'.stripped.fwd.fq' + out_rev = bam.baseName+'.stripped.rev.fq' """ samtools index $bam - extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} + extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} + pigz -p ${task.cpus} $out_fwd + pigz -p ${task.cpus} $out_rev """ } From 
aa2a6a644192078affe429a508e3d19a0bbda213 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 10:57:50 +0100 Subject: [PATCH 4/8] prepare fastq at write time --- bin/extract_map_reads.py | 139 ++++++++++++++------------------------- main.nf | 9 +-- 2 files changed, 54 insertions(+), 94 deletions(-) diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py index f21d9cfd6..563e7ed5c 100755 --- a/bin/extract_map_reads.py +++ b/bin/extract_map_reads.py @@ -179,94 +179,63 @@ def difference(list1, list2): return(fqd) -def prepare_fq(fq_dict_key, fq_dict, write_mode, strip_mode): - """Prepare fastq for writing - - Args: - fq_dict_key(str): dictionary key of fq_dict - fq_dict(dict): fq_dict with unmapped read names as keys, - unmapped/mapped (u|m), seq, and quality as values in a list - write_mode (str): 'rb' or 'r' - strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns - """ - wstring = "" - if strip_mode == 'strip': - # print("strip") - # if unmapped, write all the read lines - # print(fq_dict_key) - print(fq_dict[fq_dict_key]) - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, do not write the read lines - print(wstring) - else: - return("") - - else: - # if unmapped, write all the read lines - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, write all the read lines, but replace sequence - # by N*(len(sequence)) - elif fq_dict[fq_dict_key][0] == 'm': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" - for i in fq_dict[fq_dict_key][3:]: - wstring += f"{i}\n" - - if write_mode == 'wb': - return(wstring.encode()) - else: - return(wstring) - - -def prepare_fq_multi(fq_dict, write_mode, strip_mode, proc): - """Multiprocess of prepare fastq for 
writing - +def write_fq(fq_dict, fname, write_mode, strip_mode, proc): + """Write to fastq file Args: fq_dict(dict): fq_dict with unmapped read names as keys, unmapped/mapped (u|m), seq, and quality as values in a list - write_mode (str): [description] - write_mode (str): 'wb' or 'r' - strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns - proc(int): number of processes - Returns: - res(list): list of fastq entries - """ - fq_dict_keys = list(fq_dict.keys()) - prep_fq = partial(prepare_fq, fq_dict=fq_dict, - write_mode=write_mode, strip_mode=strip_mode) - with multiprocessing.Pool(proc) as p: - res = p.map(prep_fq, fq_dict_keys) - return(res) - - -def write_fq(write_list, fname, write_mode, proc): - """Write to fastq file - Args: - write_list(list): list of fastq entries fname(string) Path to output fastq file write_mode (str): 'rb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns proc(int) number of processes """ + fq_dict_keys = list(fq_dict.keys()) if write_mode == 'wb': with xopen(fname, mode='wb', threads=proc) as fw: - for entry in write_list: - if len(entry) == 0: - continue - else: - fw.write(entry) + for fq_dict_key in fq_dict_keys: + wstring = "" + if strip_mode == 'strip': + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + elif strip_mode == 'replace': + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + fw.write(wstring.encode()) else: with open(fname, 'w') 
as fw: - for entry in write_list: - if len(entry) == 0: - continue - else: - fw.write(entry) + for fq_dict_key in fq_dict_keys: + wstring = "" + if strip_mode == 'strip': + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + elif strip_mode == 'replace': + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + fw.write(wstring) def check_strip_mode(mode): @@ -291,7 +260,7 @@ def check_strip_mode(mode): strip_mode = check_strip_mode(MODE) BAMFILE = pysam.AlignmentFile(BAM, 'r') - # FORWARD OR SE FILE + # FORWARD OR SE FILE print(f"- Extracting mapped reads from {BAM}") mapped_reads = extract_mapped(proc=PROC) print(f"- Parsing forward fq file {IN_FWD}") @@ -299,12 +268,9 @@ def check_strip_mode(mode): print("- Cross referencing mapped reads in forward fq") fq_dict_fwd = get_mapped_reads(fqd_fwd, mapped_reads) # print(fq_dict_fwd) - print("- Preparing to write forward fq") - fq_write_fwd = prepare_fq_multi( - fq_dict_fwd, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) print(f"- Writing forward fq to {out_fwd}") - write_fq(write_list=fq_write_fwd, fname=out_fwd, - write_mode=write_mode, proc=PROC) + write_fq(fq_dict=fq_dict_fwd, fname=out_fwd, + write_mode=write_mode, strip_mode=strip_mode, proc=PROC) # REVERSE FILE if IN_REV: @@ -316,9 +282,6 @@ def check_strip_mode(mode): fqd_rev = parse_fq(IN_REV) print("- Cross referencing mapped reads in reverse fq") fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads) - print("- Preparing to 
write reverse fq") - fq_write_rev = prepare_fq_multi( - fq_dict_rev, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) print(f"- Writing reverse fq to {out_rev}") - write_fq(write_list=fq_write_rev, fname=out_rev, - write_mode=write_mode, proc=PROC) + write_fq(fq_dict=fq_dict_rev, fname=out_rev, + write_mode=write_mode, strip_mode=strip_mode, proc=PROC) diff --git a/main.nf b/main.nf index e9b05187d..a1b8324b7 100644 --- a/main.nf +++ b/main.nf @@ -1292,20 +1292,17 @@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq' + out_fwd = bam.baseName+'.stripped.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq' - out_rev = bam.baseName+'.stripped.rev.fq' + out_fwd = bam.baseName+'.stripped.fwd.fq.gz' + out_rev = bam.baseName+'.stripped.rev.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd - pigz -p ${task.cpus} $out_rev """ } From f4bba0039a2ec7f8a13eed6aa28ba63cb6966879 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 11:56:04 +0100 Subject: [PATCH 5/8] reduce resource requirements for strip_input_fastq --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 04606b41f..684def275 100644 --- a/conf/base.config +++ b/conf/base.config @@ -65,7 +65,7 @@ process { time = params.large_ref ? 
{ check_max(8.h * task.attempt, 'time') } : { check_max(2.h * task.attempt, 'time')} } withName: strip_input_fastq { - cpus = { check_max(8 * task.attempt, 'cpus') } + cpus = { check_max(4 * task.attempt, 'cpus') } memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName: malt { From 39412703dcaf471bd6b80c0b441d42c000a7d301 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 15:42:00 +0100 Subject: [PATCH 6/8] remove dockerdir --- dockerdir/Dockerfile | 6 ------ dockerdir/environment.yml | 40 --------------------------------------- 2 files changed, 46 deletions(-) delete mode 100644 dockerdir/Dockerfile delete mode 100644 dockerdir/environment.yml diff --git a/dockerdir/Dockerfile b/dockerdir/Dockerfile deleted file mode 100644 index 7bd109a7d..000000000 --- a/dockerdir/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM nfcore/base:1.7 - -LABEL description="Docker image containing all requirements for nf-core/eager pipeline" -COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-eager-2.1.0dev/bin:$PATH diff --git a/dockerdir/environment.yml b/dockerdir/environment.yml deleted file mode 100644 index be57fbf86..000000000 --- a/dockerdir/environment.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: nf-core-eager-2.1.0dev -channels: - - defaults - - bioconda - - conda-forge -dependencies: - - conda-forge::openjdk=8.0.144 - - bioconda::fastqc=0.11.8 - - bioconda::adapterremoval=2.3.1 - - bioconda::adapterremovalfixprefix=0.0.5 - - bioconda::bwa=0.7.17 - - bioconda::picard=2.21.4 - - bioconda::samtools=1.9 - - bioconda::dedup=0.12.5 - - bioconda::angsd=0.931 - - bioconda::circularmapper=1.93.4 - - bioconda::gatk4=4.1.4.1 - - bioconda::qualimap=2.2.2d - - bioconda::vcf2genome=0.91 - - bioconda::damageprofiler=0.4.9 - - bioconda::multiqc=1.8 - - bioconda::pmdtools=0.60 - - bioconda::bedtools=2.29.0 - - conda-forge::r-rmarkdown=1.18 - - conda-forge::libiconv=1.15 - - conda-forge::pigz=2.3.4 - - 
bioconda::sequencetools=1.2.2 - - bioconda::preseq=2.0.3 - - bioconda::fastp=0.20.0 - - bioconda::bamutil=1.0.14 - - bioconda::mtnucratio=0.7 - - pysam=0.15.3 - - python=3.7.1 - - bioconda::freebayes=1.3.1 - - bioconda::sexdeterrmine=1.1.1 - - bioconda::multivcfanalyzer=0.85.1 - - bioconda::hops=0.33 - - conda-forge::biopython=1.75 - - conda-forge::xopen=0.8.4 - #Missing Schmutzi,snpAD From 7518f3268dbf92932b471c26134e3f61c9e56a52 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 15:56:28 +0100 Subject: [PATCH 7/8] add PR 327 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 360c10b7e..1296d6b69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#266](https://github.com/nf-core/eager/issues/266) - Added sanity checks for input filetypes (i.e. only BAM files can be supplied if `--bam`) * [#237](https://github.com/nf-core/eager/issues/237) - Fixed and Updated script scrape_software_versions * [#322](https://github.com/nf-core/eager/pull/322) - Move extract map reads fastq compression to pigz +* [#327](https://github.com/nf-core/eager/pull/327) - Speed up strip_input_fastq process and make it more robust ### `Dependencies` From d04ab6521e91b6833ba4310bee8b297691565287 Mon Sep 17 00:00:00 2001 From: maxibor Date: Thu, 9 Jan 2020 13:57:54 +0100 Subject: [PATCH 8/8] stripped fastq output in its own dir --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7c107f5bf..0530d2203 100644 --- a/main.nf +++ b/main.nf @@ -1298,7 +1298,7 @@ if (params.run_bam_filtering) { process strip_input_fastq { label 'mc_medium' tag "${bam.baseName}" - publishDir "${params.outdir}/samtools/stripped_fastq", mode: 'copy' + publishDir "${params.outdir}/stripped_fastq", mode: 'copy' when: params.strip_input_fastq