In [None]:
%run ../config/init.py

### Creating results folders

In [None]:
data_dir = os.path.join(RESULTS, DATASET, 'trimmomatic')
result_dir = os.path.join(RESULTS, DATASET, 'alignments')
if not os.path.exists(result_dir):
    os.mkdir(result_dir) 
os.chdir(result_dir)
{% if cookiecutter.sequencing_technology == 'paired-end' %}
samples = [ f.replace('_1.fastq.gz', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('_1.fastq.gz')]
{% else %}
samples = [ f.replace('.fastq.gz', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('.fastq.gz')]
{% endif %}

### Processing samples
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}

In [None]:
log_suffix = 'alignment.log'

{% if cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/Alignments/star-alignment-PE.cwl --threads {{ cookiecutter.max_number_threads }} --genomeDir {2}  '.format(
        CWLRUNNER, CWLWORKFLOWS, ALIGNER_INDEX)
{% else %}
cmd_header = '{0} {1}/Alignments/star-alignment-SE.cwl --threads {{ cookiecutter.max_number_threads }} --genomeDir {2}  '.format(
        CWLRUNNER, CWLWORKFLOWS, ALIGNER_INDEX)
{% endif %}

with open('commands', "w") as fin:
    for s in samples:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
        r1 = os.path.join(data_dir, s + '_1.fastq.gz')
        r2 = os.path.join(data_dir, s + '_2.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads_1 {1} --reads_2 {2} > {3}_{4} 2>&1\n'.format(cmd_header, r1, r2, s, log_suffix))
{% else %}
        r = os.path.join(data_dir, s + '.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads_1 {1} > {2}_{3} 2>&1\n'.format(cmd_header, r, s, log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 1
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)
{% elif cookiecutter.ngs_data_type == 'ChIP-Seq' %}

In [None]:

READSQUALITY = 30
SUBSAMPLE_NREADS = 200000

log_suffix = 'alignment.log'
cmd_header = '{0} {1}/ChIP-Seq/chip-seq-alignment.cwl --threads {{ cookiecutter.max_number_threads }} --genome_index {2} --genome_prefix {3} --readsquality {4} --subsample_nreads {5} '.format(
        CWLRUNNER, CWLWORKFLOWS, ALIGNER_INDEX, os.path.basename(GENOME_FASTA), READSQUALITY, SUBSAMPLE_NREADS)

with open('commands', "w") as fin:
    for s in samples:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
        r1 = os.path.join(data_dir, s + '_1.fastq.gz')
        r2 = os.path.join(data_dir, s + '_2.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads {1} --reads {2} > {3}_{4} 2>&1\n'.format(cmd_header, r1, r2, s, log_suffix))
{% else %}
        r = os.path.join(data_dir, s + '.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads {1} > {2}_{3} 2>&1\n'.format(cmd_header, r, s, log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 1
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

## Generating BAM files per condition

Here we merge the BAM files of all samples that belong to the same condition into a single BAM file named `[condition]_sorted.bam`. Conditions with only one sample are skipped.

These files will be used, together with the output of IDR, for the differential binding analysis.

In [None]:
factors = pandas.read_csv(os.path.join(DATA, DATASET, 'factors.txt'), sep='\t')
log_suffix = 'merge.log'
cmd_header = '{0} {1}/samtools/samtools-merge.cwl '.format(CWLRUNNER, CWLTOOLS)

with open('commands_bam_merge', "w") as fin:
    for c in factors['condition'].unique():
        if not os.path.exists(c + '_sorted.bam'):
            ids = factors[factors['condition'] == c]['SampleID']
            if len(ids) > 1:
                cmd_options = '--out_bam {0}'.format(c + '_sorted.bam')
                for s in ids:
                    s = os.path.join(result_dir, s + '_sorted.bam')
                    cmd_options = '{0} --in_bam {1}'.format(cmd_options, s)
                fin.write('{0} {1} > {2}_{3} 2>&1\n'.format(cmd_header, cmd_options, c , log_suffix)) 
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_bam_merge | parallel -j 16
{% else %}
!sh commands_bam_merge    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

### Generating pooled tagAlign from replicates

In [None]:
log_suffix = 'R0.log'
log_files = []
with open('commands_pooled', "w") as fin:
    for c in factors['condition'].unique():
        ids = factors[factors['condition'] == c]['SampleID']
        if len(ids) > 1:
            cmd_header = 'zcat '
            for s in ids:
                s = os.path.join(result_dir, s + '_sorted.tagAlign.gz')
                cmd_header = '{0} {1}'.format(cmd_header, s)
            fin.write('{0} | gzip -n > {1}_R0.tagAlign.gz 2> {1}_{2}\n'.format(cmd_header, c , log_suffix))  
            log_files.append('{0}_{1}'.format(c , log_suffix))
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_pooled | parallel -j 16
{% else %}
!sh commands    
{% endif %}
if log_files:
    all_good = True
    for l in log_files:
        if os.stat(l).st_size != 0:
            print('Error in file: ' + l)
            all_good = False
    if all_good:
        print('Pooled tagAlign files created correctly')
{% elif cookiecutter.ngs_data_type == 'ChIP-exo' %}

In [None]:

READSQUALITY = 30
SUBSAMPLE_NREADS = 200000

log_suffix = 'alignment.log'
cmd_header = '{0} {1}/ChIP-Seq/chip-seq-alignment.cwl --threads {{ cookiecutter.max_number_threads }} --genome_index {2} --genome_prefix {3} --readsquality {4} --subsample_nreads {5} '.format(
        CWLRUNNER, CWLWORKFLOWS, ALIGNER_INDEX, os.path.basename(GENOME_FASTA), READSQUALITY, SUBSAMPLE_NREADS)

with open('commands', "w") as fin:
    for s in samples:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
        r1 = os.path.join(data_dir, s + '_1.fastq.gz')
        r2 = os.path.join(data_dir, s + '_2.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads {1} --reads {2} > {3}_{4} 2>&1\n'.format(cmd_header, r1, r2, s, log_suffix))
{% else %}
        r = os.path.join(data_dir, s + '.fastq.gz')
        if not os.path.exists(s + '_sorted.bam'):
            fin.write('{0} --reads {1} > {2}_{3} 2>&1\n'.format(cmd_header, r, s, log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 1
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

## Generating BAM files per condition

Here we merge the BAM files of all samples that belong to the same condition into a single BAM file named `[condition]_sorted.bam`. Conditions with only one sample are skipped.

These files will be used, together with the output of IDR, for the differential binding analysis.

In [None]:
factors = pandas.read_csv(os.path.join(DATA, DATASET, 'factors.txt'), sep='\t')
log_suffix = 'merge.log'
cmd_header = '{0} {1}/samtools/samtools-merge.cwl '.format(CWLRUNNER, CWLTOOLS)

with open('commands_bam_merge', "w") as fin:
    for c in factors['condition'].unique():
        if not os.path.exists(c + '_sorted.bam'):
            ids = factors[factors['condition'] == c]['SampleID']
            if len(ids) > 1:
                cmd_options = '--out_bam {0}'.format(c + '_sorted.bam')
                for s in ids:
                    s = os.path.join(result_dir, s + '_sorted.bam')
                    cmd_options = '{0} --in_bam {1}'.format(cmd_options, s)
                fin.write('{0} {1} > {2}_{3} 2>&1\n'.format(cmd_header, cmd_options, c , log_suffix)) 
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_bam_merge | parallel -j 16
{% else %}
!sh commands_bam_merge    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

### Generating pooled tagAlign from replicates

In [None]:
log_suffix = 'R0.log'
log_files = []
with open('commands_pooled', "w") as fin:
    for c in factors['condition'].unique():
        ids = factors[factors['condition'] == c]['SampleID']
        if len(ids) > 1:
            cmd_header = 'zcat '
            for s in ids:
                s = os.path.join(result_dir, s + '_sorted.tagAlign.gz')
                cmd_header = '{0} {1}'.format(cmd_header, s)
            fin.write('{0} | gzip -n > {1}_R0.tagAlign.gz 2> {1}_{2}\n'.format(cmd_header, c , log_suffix))  
            log_files.append('{0}_{1}'.format(c , log_suffix))
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_pooled | parallel -j 16
{% else %}
!sh commands    
{% endif %}
if log_files:
    all_good = True
    for l in log_files:
        if os.stat(l).st_size != 0:
            print('Error in file: ' + l)
            all_good = False
    if all_good:
        print('Pooled tagAlign files created correctly')
{% endif %}
