From b76988e69f1fbbf8334deb94fd98e1f840465f42 Mon Sep 17 00:00:00 2001 From: maxibor Date: Sun, 22 Dec 2019 18:33:33 +0100 Subject: [PATCH 1/8] add dependencies for fastq parsing and parallel python gzip --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index a1eb6c206..be57fbf86 100644 --- a/environment.yml +++ b/environment.yml @@ -35,4 +35,6 @@ dependencies: - bioconda::sexdeterrmine=1.1.1 - bioconda::multivcfanalyzer=0.85.1 - bioconda::hops=0.33 + - conda-forge::biopython=1.75 + - conda-forge::xopen=0.8.4 #Missing Schmutzi,snpAD From 8f84a833ea836a8af8a213ff38f4d1d8abcb4e54 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 6 Jan 2020 11:17:46 +0100 Subject: [PATCH 2/8] refactor extract_map_reads --- bin/extract_map_reads.py | 307 +++++++++++++++++++++++++------------- dockerdir/Dockerfile | 6 + dockerdir/environment.yml | 40 +++++ main.nf | 11 +- 4 files changed, 252 insertions(+), 112 deletions(-) create mode 100644 dockerdir/Dockerfile create mode 100644 dockerdir/environment.yml diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py index 48358fe23..f21d9cfd6 100755 --- a/bin/extract_map_reads.py +++ b/bin/extract_map_reads.py @@ -3,9 +3,11 @@ import argparse import multiprocessing import pysam +from xopen import xopen from functools import partial -import gzip import sys +from Bio.SeqIO.QualityIO import FastqGeneralIterator +from io import StringIO def _get_args(): @@ -57,34 +59,36 @@ def _get_args(): return(bam, in_fwd, in_rev, out_fwd, out_rev, mode, proc) -def extract_mapped_chr(chr, bam): +def extract_mapped_chr(chr): """ Get mapped reads per chromosome - INPUT: - - chr(str): chromosome - - bam(str): bamfile path - OUTPUT: - - res(list): list of mapped reads (str) name per chromosome + Args: + chr(str): chromosome + bam(str): bamfile path + Returns: + res(list): list of mapped reads (str) name per chromosome """ res = [] - bamfile = pysam.AlignmentFile(bam, "rb") - reads = 
bamfile.fetch(chr, multiple_iterators=True) + reads = BAMFILE.fetch(chr, multiple_iterators=True) for read in reads: - res.append(read.query_name) + if read.is_unmapped == False: + if read.query_name.startswith("M_"): + read_name = read.query_name.replace( + "M_", "").split()[0].split("/")[0] + else: + read_name = read.query_name.split()[0].split("/")[0] + res.append(read_name) return(res) -def extract_mapped(bam, processes): - """ - Get mapped reads in parallel - INPUT: - - bam(str): bamfile path - OUTPUT: - - result(list) list of mapped reads name (str) +def extract_mapped(proc): + """Get mapped reads in parallel + Returns: + bamfile(str): path to bam alignment file + result(list): list of mapped reads name (str) """ try: - bamfile = pysam.AlignmentFile(bam, "rb") - chrs = bamfile.references + chrs = BAMFILE.references except ValueError as e: print(e) @@ -93,50 +97,50 @@ def extract_mapped(bam, processes): return([]) # Checking that nb_process is not > nb_chromosomes - elif len(chrs) < processes: - print( - f"""Requested {processes} processe(s), - but can only be parallelized on {len(chrs)} - processes with these data""") - processes = len(chrs) - - extract_mapped_chr_partial = partial(extract_mapped_chr, bam=bam) - p = multiprocessing.Pool(processes) - res = p.map(extract_mapped_chr_partial, chrs) - p.close() - p.join() - result = [i for ares in res for i in ares] + proc = min(proc, len(chrs)) + with multiprocessing.Pool(proc) as p: + res = p.map(extract_mapped_chr, chrs) + result = [i for ares in res for i in ares if len(i) > 0] return(result) def parse_fq(fq): - """ - Parse a FASTQ file - INPUT: - - fq(str): path to fastq file - OUTPUT: - - fqd(dict): dictionary with read names as keys, seq and quality as values + """Parse a FASTQ file + Args: + fq(str): path to fastq file + Returns: + fqd(dict): dictionary with read names as keys, seq and quality as values in a list """ - def get_fq_reads(allreads): - fqd = {} - myflag = True - for line in allreads: - line = 
line.decode('utf-8').rstrip() - if myflag == True: - instrument = line.split()[0].split(":")[0] - myflag = False - if line.startswith(instrument): - seqname = line[1:].split()[0] - fqd[seqname] = [] - continue - else: - fqd[seqname].append(line) - return(fqd) + read_dict = {} + for title, seq, qual in FastqGeneralIterator(allreads): + # NEED TO ONLY KEEP THE FIRST PART OF THE FASTQ READ NAME FOR CROSS + # REFERENCING WITH BAM FILE: ONLY THIS INFORMATION IS KEPT WHEN + # COLLAPSING READS WITH ADAPTERREMOVAL + + # Until fastq format 1.8 + # Split after slash + # @HWUSI-EAS100R:6:73:941:1973#0/1 + suf_title = "" + title_space = title.split() + if len(title_space) > 1: + title = title_space[0] + suf_title = f" {title_space[1]}" + + # From fastq format 1.8 + # Split after space + # @EAS139:136:FC706VJ:2:2104:15343:197393 1:Y:18:ATCACG + title_slash = title.split("/") + if len(title_slash) > 1: + title = title_slash[0] + suf_title = f"/{title_slash[1]}" + + read_dict[title] = [suf_title, seq, "+", qual] + return(read_dict) if fq.endswith('.gz'): - with gzip.open(fq, 'rb') as allreads: + with xopen(fq, 'r') as allreads: fqd = get_fq_reads(allreads) else: with open(fq, 'r') as allreads: @@ -145,83 +149,176 @@ def get_fq_reads(allreads): return(fqd) -def sort_mapped(fq_dict, mapped_reads): - """ - Sort mapped reads from dictionary of fastq reads - INPUT: - - fq_dict(dict) dictionary with read names as keys, seq and quality as values +def get_mapped_reads(fq_dict, mapped_reads): + """Sort mapped reads from dictionary of fastq reads + Args: + fq_dict(dict) dictionary with read names as keys, seq and quality as values in a list - - mapped_reads(list) list of mapped reads - OUTPUT: - - mfqd(dict) dictionary with mapped read names as keys, seq and quality as values - in a list - - fqd(dict) dictionary with unmapped read names as key, unmapped/mapped (u|m), + mapped_reads(list) list of mapped reads + Returns: + fqd(dict) dictionary with read names as key, unmapped/mapped (u|m), 
seq and quality as values in a list """ + + def intersection(list1, list2): + return(set(list1).intersection(list2)) + + def difference(list1, list2): + return(set(list1).difference(list2)) + fqd = {} - unmapped = [i for i in list(fq_dict.keys()) if i not in mapped_reads] - mapped = [i for i in list(fq_dict.keys()) if i in mapped_reads] - # print(unmap) - for r in unmapped: - fqd[r] = ['u']+fq_dict[r] - for r in mapped: - fqd[r] = ['m']+fq_dict[r] + all_reads = list(fq_dict.keys()) + mapped = intersection(all_reads, mapped_reads) + unmapped = difference(all_reads, mapped_reads) + + for rm in mapped: + fqd[rm] = ['m']+fq_dict[rm] + for ru in unmapped: + fqd[ru] = ['u']+fq_dict[ru] return(fqd) -def write_fq(fq_dict, fname, mode): +def prepare_fq(fq_dict_key, fq_dict, write_mode, strip_mode): + """Prepare fastq for writing + + Args: + fq_dict_key(str): dictionary key of fq_dict + fq_dict(dict): fq_dict with unmapped read names as keys, + unmapped/mapped (u|m), seq, and quality as values in a list + write_mode (str): 'rb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns """ - Write to fastq file - INPUT: - - fq_dict(dict) dictionary with unmapped read names as keys, - unmapped/mapped (u|m), seq, and quality as values in a list - - fname(string) Path to output fastq file - - mode(string) strip (remove read) or replace (replace read sequence) by Ns + wstring = "" + if strip_mode == 'strip': + # print("strip") + # if unmapped, write all the read lines + # print(fq_dict_key) + print(fq_dict[fq_dict_key]) + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, do not write the read lines + print(wstring) + else: + return("") + + else: + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" 
+ # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + + if write_mode == 'wb': + return(wstring.encode()) + else: + return(wstring) + + +def prepare_fq_multi(fq_dict, write_mode, strip_mode, proc): + """Multiprocess of prepare fastq for writing + + Args: + fq_dict(dict): fq_dict with unmapped read names as keys, + unmapped/mapped (u|m), seq, and quality as values in a list + write_mode (str): [description] + write_mode (str): 'wb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns + proc(int): number of processes + Returns: + res(list): list of fastq entries + """ + fq_dict_keys = list(fq_dict.keys()) + prep_fq = partial(prepare_fq, fq_dict=fq_dict, + write_mode=write_mode, strip_mode=strip_mode) + with multiprocessing.Pool(proc) as p: + res = p.map(prep_fq, fq_dict_keys) + return(res) + + +def write_fq(write_list, fname, write_mode, proc): + """Write to fastq file + Args: + write_list(list): list of fastq entries + fname(string) Path to output fastq file + write_mode (str): 'rb' or 'r' + proc(int) number of processes """ - with open(fname, 'w') as f: - for k in list(fq_dict.keys()): - if mode == 'strip': - if fq_dict[k][0] == 'u': - f.write(f"@{k}\n") - for i in fq_dict[k][1:]: - f.write(f"{i}\n") - elif fq_dict[k][0] == 'm': + if write_mode == 'wb': + with xopen(fname, mode='wb', threads=proc) as fw: + for entry in write_list: + if len(entry) == 0: + continue + else: + fw.write(entry) + else: + with open(fname, 'w') as fw: + for entry in write_list: + if len(entry) == 0: continue - elif mode == 'replace': - if fq_dict[k][0] == 'u': - f.write(f"@{k}\n") - for i in fq_dict[k][1:]: - f.write(f"{i}\n") - elif fq_dict[k][0] == 'm': - f.write(f"@{k}\n") - f.write(f"{'N'*len(fq_dict[k][1])}\n") 
- for i in fq_dict[k][2:]: - f.write(f"{i}\n") + else: + fw.write(entry) def check_strip_mode(mode): if mode.lower() not in ['replace', 'strip']: print(f"Mode must be {' or '.join(mode)}") + return(mode.lower()) if __name__ == "__main__": BAM, IN_FWD, IN_REV, OUT_FWD, OUT_REV, MODE, PROC = _get_args() if OUT_FWD == None: - out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq" + out_fwd = f"{IN_FWD.split('/')[-1].split('.')[0]}.r1.fq.gz" else: out_fwd = OUT_FWD - mapped_reads = extract_mapped(BAM, PROC) - fwd_dict = parse_fq(IN_FWD) - fwd_reads = sort_mapped(fwd_dict, mapped_reads) - write_fq(fwd_reads, out_fwd, MODE) + if out_fwd.endswith(".gz"): + write_mode = "wb" + else: + write_mode = "w" + + strip_mode = check_strip_mode(MODE) + BAMFILE = pysam.AlignmentFile(BAM, 'r') + + # FORWARD OR SE FILE + print(f"- Extracting mapped reads from {BAM}") + mapped_reads = extract_mapped(proc=PROC) + print(f"- Parsing forward fq file {IN_FWD}") + fqd_fwd = parse_fq(IN_FWD) + print("- Cross referencing mapped reads in forward fq") + fq_dict_fwd = get_mapped_reads(fqd_fwd, mapped_reads) + # print(fq_dict_fwd) + print("- Preparing to write forward fq") + fq_write_fwd = prepare_fq_multi( + fq_dict_fwd, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) + print(f"- Writing forward fq to {out_fwd}") + write_fq(write_list=fq_write_fwd, fname=out_fwd, + write_mode=write_mode, proc=PROC) + + # REVERSE FILE if IN_REV: if OUT_REV == None: - out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq" + out_rev = f"{IN_REV.split('/')[-1].split('.')[0]}.r2.fq.gz" else: out_rev = OUT_REV - rev_dict = parse_fq(IN_REV) - rev_reads = sort_mapped(rev_dict, mapped_reads) - write_fq(rev_reads, out_rev, MODE) + print(f"- Parsing reverse fq file {IN_REV}") + fqd_rev = parse_fq(IN_REV) + print("- Cross referencing mapped reads in reverse fq") + fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads) + print("- Preparing to write reverse fq") + fq_write_rev = prepare_fq_multi( + fq_dict_rev, 
write_mode=write_mode, strip_mode=strip_mode, proc=PROC) + print(f"- Writing reverse fq to {out_rev}") + write_fq(write_list=fq_write_rev, fname=out_rev, + write_mode=write_mode, proc=PROC) diff --git a/dockerdir/Dockerfile b/dockerdir/Dockerfile new file mode 100644 index 000000000..7bd109a7d --- /dev/null +++ b/dockerdir/Dockerfile @@ -0,0 +1,6 @@ +FROM nfcore/base:1.7 + +LABEL description="Docker image containing all requirements for nf-core/eager pipeline" +COPY environment.yml / +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/nf-core-eager-2.1.0dev/bin:$PATH diff --git a/dockerdir/environment.yml b/dockerdir/environment.yml new file mode 100644 index 000000000..be57fbf86 --- /dev/null +++ b/dockerdir/environment.yml @@ -0,0 +1,40 @@ +name: nf-core-eager-2.1.0dev +channels: + - defaults + - bioconda + - conda-forge +dependencies: + - conda-forge::openjdk=8.0.144 + - bioconda::fastqc=0.11.8 + - bioconda::adapterremoval=2.3.1 + - bioconda::adapterremovalfixprefix=0.0.5 + - bioconda::bwa=0.7.17 + - bioconda::picard=2.21.4 + - bioconda::samtools=1.9 + - bioconda::dedup=0.12.5 + - bioconda::angsd=0.931 + - bioconda::circularmapper=1.93.4 + - bioconda::gatk4=4.1.4.1 + - bioconda::qualimap=2.2.2d + - bioconda::vcf2genome=0.91 + - bioconda::damageprofiler=0.4.9 + - bioconda::multiqc=1.8 + - bioconda::pmdtools=0.60 + - bioconda::bedtools=2.29.0 + - conda-forge::r-rmarkdown=1.18 + - conda-forge::libiconv=1.15 + - conda-forge::pigz=2.3.4 + - bioconda::sequencetools=1.2.2 + - bioconda::preseq=2.0.3 + - bioconda::fastp=0.20.0 + - bioconda::bamutil=1.0.14 + - bioconda::mtnucratio=0.7 + - pysam=0.15.3 + - python=3.7.1 + - bioconda::freebayes=1.3.1 + - bioconda::sexdeterrmine=1.1.1 + - bioconda::multivcfanalyzer=0.85.1 + - bioconda::hops=0.33 + - conda-forge::biopython=1.75 + - conda-forge::xopen=0.8.4 + #Missing Schmutzi,snpAD diff --git a/main.nf b/main.nf index e9b05187d..1923632c4 100644 --- a/main.nf +++ b/main.nf @@ -1292,20 +1292,17 
@@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq' + out_fwd = bam.baseName+'.stripped.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq' - out_rev = bam.baseName+'.stripped.rev.fq' + out_fwd = bam.baseName+'.stripped.fwd.fq.gz' + out_rev = bam.baseName+'.stripped.rev.fq.gz' """ samtools index $bam - extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd - pigz -p ${task.cpus} $out_rev + extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} """ } From f09ecbf2a6802f874b781e7e2ef61ef457299d55 Mon Sep 17 00:00:00 2001 From: maxibor Date: Mon, 6 Jan 2020 13:09:15 +0100 Subject: [PATCH 3/8] back to pigz --- main.nf | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 1923632c4..e9b05187d 100644 --- a/main.nf +++ b/main.nf @@ -1292,17 +1292,20 @@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq.gz' + out_fwd = bam.baseName+'.stripped.fq' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} + pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq.gz' - out_rev = bam.baseName+'.stripped.rev.fq.gz' + out_fwd = bam.baseName+'.stripped.fwd.fq' + out_rev = bam.baseName+'.stripped.rev.fq' """ samtools index $bam - extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} + extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} + pigz -p ${task.cpus} $out_fwd + pigz -p ${task.cpus} $out_rev """ } From 
aa2a6a644192078affe429a508e3d19a0bbda213 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 10:57:50 +0100 Subject: [PATCH 4/8] prepare fastq at write time --- bin/extract_map_reads.py | 139 ++++++++++++++------------------------- main.nf | 9 +-- 2 files changed, 54 insertions(+), 94 deletions(-) diff --git a/bin/extract_map_reads.py b/bin/extract_map_reads.py index f21d9cfd6..563e7ed5c 100755 --- a/bin/extract_map_reads.py +++ b/bin/extract_map_reads.py @@ -179,94 +179,63 @@ def difference(list1, list2): return(fqd) -def prepare_fq(fq_dict_key, fq_dict, write_mode, strip_mode): - """Prepare fastq for writing - - Args: - fq_dict_key(str): dictionary key of fq_dict - fq_dict(dict): fq_dict with unmapped read names as keys, - unmapped/mapped (u|m), seq, and quality as values in a list - write_mode (str): 'rb' or 'r' - strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns - """ - wstring = "" - if strip_mode == 'strip': - # print("strip") - # if unmapped, write all the read lines - # print(fq_dict_key) - print(fq_dict[fq_dict_key]) - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, do not write the read lines - print(wstring) - else: - return("") - - else: - # if unmapped, write all the read lines - if fq_dict[fq_dict_key][0] == 'u': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - for i in fq_dict[fq_dict_key][2:]: - wstring += f"{i}\n" - # if mapped, write all the read lines, but replace sequence - # by N*(len(sequence)) - elif fq_dict[fq_dict_key][0] == 'm': - wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" - wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" - for i in fq_dict[fq_dict_key][3:]: - wstring += f"{i}\n" - - if write_mode == 'wb': - return(wstring.encode()) - else: - return(wstring) - - -def prepare_fq_multi(fq_dict, write_mode, strip_mode, proc): - """Multiprocess of prepare fastq for 
writing - +def write_fq(fq_dict, fname, write_mode, strip_mode, proc): + """Write to fastq file Args: fq_dict(dict): fq_dict with unmapped read names as keys, unmapped/mapped (u|m), seq, and quality as values in a list - write_mode (str): [description] - write_mode (str): 'wb' or 'r' - strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns - proc(int): number of processes - Returns: - res(list): list of fastq entries - """ - fq_dict_keys = list(fq_dict.keys()) - prep_fq = partial(prepare_fq, fq_dict=fq_dict, - write_mode=write_mode, strip_mode=strip_mode) - with multiprocessing.Pool(proc) as p: - res = p.map(prep_fq, fq_dict_keys) - return(res) - - -def write_fq(write_list, fname, write_mode, proc): - """Write to fastq file - Args: - write_list(list): list of fastq entries fname(string) Path to output fastq file write_mode (str): 'rb' or 'r' + strip_mode (str): strip (remove read) or replace (replace read sequence) by Ns proc(int) number of processes """ + fq_dict_keys = list(fq_dict.keys()) if write_mode == 'wb': with xopen(fname, mode='wb', threads=proc) as fw: - for entry in write_list: - if len(entry) == 0: - continue - else: - fw.write(entry) + for fq_dict_key in fq_dict_keys: + wstring = "" + if strip_mode == 'strip': + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + elif strip_mode == 'replace': + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + fw.write(wstring.encode()) else: with open(fname, 'w') 
as fw: - for entry in write_list: - if len(entry) == 0: - continue - else: - fw.write(entry) + for fq_dict_key in fq_dict_keys: + wstring = "" + if strip_mode == 'strip': + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + elif strip_mode == 'replace': + # if unmapped, write all the read lines + if fq_dict[fq_dict_key][0] == 'u': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + for i in fq_dict[fq_dict_key][2:]: + wstring += f"{i}\n" + # if mapped, write all the read lines, but replace sequence + # by N*(len(sequence)) + elif fq_dict[fq_dict_key][0] == 'm': + wstring += f"@{fq_dict_key+fq_dict[fq_dict_key][1]}\n" + wstring += f"{'N'*len(fq_dict[fq_dict_key][2])}\n" + for i in fq_dict[fq_dict_key][3:]: + wstring += f"{i}\n" + fw.write(wstring) def check_strip_mode(mode): @@ -291,7 +260,7 @@ def check_strip_mode(mode): strip_mode = check_strip_mode(MODE) BAMFILE = pysam.AlignmentFile(BAM, 'r') - # FORWARD OR SE FILE + # FORWARD OR SE FILE print(f"- Extracting mapped reads from {BAM}") mapped_reads = extract_mapped(proc=PROC) print(f"- Parsing forward fq file {IN_FWD}") @@ -299,12 +268,9 @@ def check_strip_mode(mode): print("- Cross referencing mapped reads in forward fq") fq_dict_fwd = get_mapped_reads(fqd_fwd, mapped_reads) # print(fq_dict_fwd) - print("- Preparing to write forward fq") - fq_write_fwd = prepare_fq_multi( - fq_dict_fwd, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) print(f"- Writing forward fq to {out_fwd}") - write_fq(write_list=fq_write_fwd, fname=out_fwd, - write_mode=write_mode, proc=PROC) + write_fq(fq_dict=fq_dict_fwd, fname=out_fwd, + write_mode=write_mode, strip_mode=strip_mode, proc=PROC) # REVERSE FILE if IN_REV: @@ -316,9 +282,6 @@ def check_strip_mode(mode): fqd_rev = parse_fq(IN_REV) print("- Cross referencing mapped reads in reverse fq") fq_dict_rev = get_mapped_reads(fqd_rev, mapped_reads) - print("- Preparing to 
write reverse fq") - fq_write_rev = prepare_fq_multi( - fq_dict_rev, write_mode=write_mode, strip_mode=strip_mode, proc=PROC) print(f"- Writing reverse fq to {out_rev}") - write_fq(write_list=fq_write_rev, fname=out_rev, - write_mode=write_mode, proc=PROC) + write_fq(fq_dict=fq_dict_rev, fname=out_rev, + write_mode=write_mode, strip_mode=strip_mode, proc=PROC) diff --git a/main.nf b/main.nf index e9b05187d..a1b8324b7 100644 --- a/main.nf +++ b/main.nf @@ -1292,20 +1292,17 @@ process strip_input_fastq { script: if (params.singleEnd) { - out_fwd = bam.baseName+'.stripped.fq' + out_fwd = bam.baseName+'.stripped.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -m ${params.strip_mode} -of $out_fwd -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd """ } else { - out_fwd = bam.baseName+'.stripped.fwd.fq' - out_rev = bam.baseName+'.stripped.rev.fq' + out_fwd = bam.baseName+'.stripped.fwd.fq.gz' + out_rev = bam.baseName+'.stripped.rev.fq.gz' """ samtools index $bam extract_map_reads.py $bam ${fq[0]} -rev ${fq[1]} -m ${params.strip_mode} -of $out_fwd -or $out_rev -p ${task.cpus} - pigz -p ${task.cpus} $out_fwd - pigz -p ${task.cpus} $out_rev """ } From f4bba0039a2ec7f8a13eed6aa28ba63cb6966879 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 11:56:04 +0100 Subject: [PATCH 5/8] reduce resource requirements for strip_input_fastq --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index 04606b41f..684def275 100644 --- a/conf/base.config +++ b/conf/base.config @@ -65,7 +65,7 @@ process { time = params.large_ref ? 
{ check_max(8.h * task.attempt, 'time') } : { check_max(2.h * task.attempt, 'time')} } withName: strip_input_fastq { - cpus = { check_max(8 * task.attempt, 'cpus') } + cpus = { check_max(4 * task.attempt, 'cpus') } memory = { check_max( 8.GB * task.attempt, 'memory' ) } } withName: malt { From 39412703dcaf471bd6b80c0b441d42c000a7d301 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 15:42:00 +0100 Subject: [PATCH 6/8] remove dockerdir --- dockerdir/Dockerfile | 6 ------ dockerdir/environment.yml | 40 --------------------------------------- 2 files changed, 46 deletions(-) delete mode 100644 dockerdir/Dockerfile delete mode 100644 dockerdir/environment.yml diff --git a/dockerdir/Dockerfile b/dockerdir/Dockerfile deleted file mode 100644 index 7bd109a7d..000000000 --- a/dockerdir/Dockerfile +++ /dev/null @@ -1,6 +0,0 @@ -FROM nfcore/base:1.7 - -LABEL description="Docker image containing all requirements for nf-core/eager pipeline" -COPY environment.yml / -RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-eager-2.1.0dev/bin:$PATH diff --git a/dockerdir/environment.yml b/dockerdir/environment.yml deleted file mode 100644 index be57fbf86..000000000 --- a/dockerdir/environment.yml +++ /dev/null @@ -1,40 +0,0 @@ -name: nf-core-eager-2.1.0dev -channels: - - defaults - - bioconda - - conda-forge -dependencies: - - conda-forge::openjdk=8.0.144 - - bioconda::fastqc=0.11.8 - - bioconda::adapterremoval=2.3.1 - - bioconda::adapterremovalfixprefix=0.0.5 - - bioconda::bwa=0.7.17 - - bioconda::picard=2.21.4 - - bioconda::samtools=1.9 - - bioconda::dedup=0.12.5 - - bioconda::angsd=0.931 - - bioconda::circularmapper=1.93.4 - - bioconda::gatk4=4.1.4.1 - - bioconda::qualimap=2.2.2d - - bioconda::vcf2genome=0.91 - - bioconda::damageprofiler=0.4.9 - - bioconda::multiqc=1.8 - - bioconda::pmdtools=0.60 - - bioconda::bedtools=2.29.0 - - conda-forge::r-rmarkdown=1.18 - - conda-forge::libiconv=1.15 - - conda-forge::pigz=2.3.4 - - 
bioconda::sequencetools=1.2.2 - - bioconda::preseq=2.0.3 - - bioconda::fastp=0.20.0 - - bioconda::bamutil=1.0.14 - - bioconda::mtnucratio=0.7 - - pysam=0.15.3 - - python=3.7.1 - - bioconda::freebayes=1.3.1 - - bioconda::sexdeterrmine=1.1.1 - - bioconda::multivcfanalyzer=0.85.1 - - bioconda::hops=0.33 - - conda-forge::biopython=1.75 - - conda-forge::xopen=0.8.4 - #Missing Schmutzi,snpAD From 7518f3268dbf92932b471c26134e3f61c9e56a52 Mon Sep 17 00:00:00 2001 From: maxibor Date: Tue, 7 Jan 2020 15:56:28 +0100 Subject: [PATCH 7/8] add PR 327 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 360c10b7e..1296d6b69 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -33,6 +33,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#266](https://github.com/nf-core/eager/issues/266) - Added sanity checks for input filetypes (i.e. only BAM files can be supplied if `--bam`) * [#237](https://github.com/nf-core/eager/issues/237) - Fixed and Updated script scrape_software_versions * [#322](https://github.com/nf-core/eager/pull/322) - Move extract map reads fastq compression to pigz +* [#327](https://github.com/nf-core/eager/pull/327) - Speed up strip_input_fastq process and make it more robust ### `Dependencies` From d04ab6521e91b6833ba4310bee8b297691565287 Mon Sep 17 00:00:00 2001 From: maxibor Date: Thu, 9 Jan 2020 13:57:54 +0100 Subject: [PATCH 8/8] stripped fastq output in its own dir --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7c107f5bf..0530d2203 100644 --- a/main.nf +++ b/main.nf @@ -1298,7 +1298,7 @@ if (params.run_bam_filtering) { process strip_input_fastq { label 'mc_medium' tag "${bam.baseName}" - publishDir "${params.outdir}/samtools/stripped_fastq", mode: 'copy' + publishDir "${params.outdir}/stripped_fastq", mode: 'copy' when: params.strip_input_fastq