In [None]:
# Execute the shared project configuration script in this kernel's namespace.
# The cells below use os, DATA, DATASET, RESULTS, CWLRUNNER, CWLTOOLS and
# check_errors_from_logs without defining them -- presumably they all come
# from init.py; TODO confirm.
%run ../config/init.py

## Setting workdir to `results/{{ cookiecutter.dataset_name }}/trimmomatic`

In [None]:
data_dir = os.path.join(DATA, DATASET)
result_dir = os.path.join(RESULTS, DATASET, 'trimmomatic')
if not os.path.exists(result_dir):
    os.mkdir(result_dir) 
os.chdir(result_dir)
{% if cookiecutter.sequencing_technology == 'paired-end' %}
samples = [ f.replace('_1.fastq.gz', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('_1.fastq.gz')]
{% else %}
samples = [ f.replace('.fastq.gz', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('.fastq.gz')]
{% endif %}

## Trimming samples with Trimmomatic

Adjust the Trimmomatic options according to the FastQC report. The default values used in this notebook are:

{% if cookiecutter.sequencing_technology == 'paired-end' %}
 * IlluminaClip: TruSeq3-PE.fa:2:30:10
{% else %}
 * IlluminaClip: TruSeq3-SE.fa:2:30:10
{% endif %}
 * Minlen: 25
 * Avgqual: 30
 * Leading: 30
 * Trailing: 30
 
For more information, see the [Trimmomatic documentation](http://www.usadellab.org/cms/?page=trimmomatic).

### More options from the Trimmomatic CWL workflow
```
$ cwl-runner https://gitlab.com/r78v10a07/cwl-workflow/raw/master/tools/trimmomatic/trimmomatic.cwl
/usr/bin/cwl-runner 1.0.20190228155703
usage: https://gitlab.com/r78v10a07/cwl-workflow/raw/master/tools/trimmomatic/trimmomatic.cwl
       --threads THREADS
       --end_mode END_MODE
       --reads1 READS1
       --reads1_out READS1_OUT
       [--reads2 READS2]
       [--reads1_out2 READS1_OUT2]
       [--reads2_out READS2_OUT]
       [--reads2_out2 READS2_OUT2]
       [--avgqual AVGQUAL] 
       [--crop CROP]       
       [--headcrop HEADCROP]
       [--illuminaClip ILLUMINACLIP]
       [--leading LEADING]
       [--maxinfo MAXINFO]
       [--minlen MINLEN]
       [--phred PHRED]
       [--tophred33]
       [--tophred64]
       [--trailing TRAILING]
```

In [None]:
# Edit these values accordingly with the FastQC report
{% if cookiecutter.sequencing_technology == 'paired-end' %}
ILLUMINACLIP = 'TruSeq3-PE.fa:2:30:10'
{% else %}
ILLUMINACLIP = 'TruSeq3-SE.fa:2:30:10'
{% endif %}
MINLEN = 25
AVGQUAL = 30
LEADING = 30
TRAILING = 30

cmd_header = '{0} {1}/trimmomatic/trimmomatic.cwl --threads 2 --phred=33 --illuminaClip={2} --minlen={3} --avgqual={4} --leading={5} --trailing={6} '.format(
        CWLRUNNER, CWLTOOLS, ILLUMINACLIP, MINLEN, AVGQUAL, LEADING, TRAILING)

log_suffix = 'trimming.log'
{% if cookiecutter.sequencing_technology == 'paired-end' %}
with open('commands_trimming', "w") as fin:
    for s in samples:
        r1 = os.path.join(DATA, DATASET, s + '_1.fastq.gz')
        r2 = os.path.join(DATA, DATASET, s + '_2.fastq.gz')
        if not os.path.exists(s + '_1.fastq.gz') or not os.path.exists('_2.fastq.gz'):
            fin.write('{0} --end_mode=PE --reads1 {1} --reads2 {2} --reads1_out {3}_1.fastq.gz --reads1_out2 {3}_1_OU.fastq.gz --reads2_out {3}_2.fastq.gz --reads2_out2 {3}_2_OU.fastq.gz > {3}_{4} 2>&1\n'.format(cmd_header, r1, r2, s, log_suffix))
{% else %}
with open('commands_trimming', "w") as fin:
    for s in samples:
        r = os.path.join(DATA, DATASET, s + '.fastq.gz')
        if not os.path.exists(s + '.fastq.gz'):
            fin.write('{0} --end_mode=SE --reads1 {1} --reads1_out {2}.fastq.gz > {2}_{3} 2>&1\n'.format(cmd_header, r, s, log_suffix))

{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_trimming | parallel -j 8
{% else %}
!sh commands_trimming    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

## Quality control of trimmed samples with FastQC

In [None]:
log_suffix = 'fastqc.log'
{% if cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands_fastqc', "w") as fin:
    for s in samples:
        if not os.path.exists(s + '_1_fastqc.html') or not os.path.exists(s + '_1_fastqc.zip') or\
            not os.path.exists(s + '_2_fastqc.html') or not os.path.exists(s + '_2_fastqc.zip'):
            fin.write('{0} --fastq {1}_1.fastq.gz --fastq {1}_2.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, s, log_suffix))
{% else %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} --fastq '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands_fastqc', "w") as fin:
    for s in samples:
        if not os.path.exists(s + '_fastqc.html') or not os.path.exists(s + '_fastqc.zip'):
            fin.write('{0} {1}.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, s, log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_fastqc | parallel -j 4
{% else %}
!sh commands_fastqc    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)
