In [None]:
%run ../config/init.py

## Setting workdir to `data/{{ cookiecutter.dataset_name }}`

In [None]:
# Working directory of this dataset; every later cell reads and writes
# relative to it (FASTQ files, logs and the generated `commands` file).
data_dir = os.path.join(DATA, DATASET)
# makedirs + exist_ok creates missing parent directories too and does not fail
# when the directory already exists (no check-then-create race).
os.makedirs(data_dir, exist_ok=True)
os.chdir(data_dir)

## Loading sample file
A **factors.txt** file should be in the `data/{{ cookiecutter.dataset_name }}` directory.

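The `factors.txt` file is read as a tab-separated table and should have at least the following columns:  
`| SampleID | condition | replicate |`  
Columns:
* `SampleID`: sample identifier; it is used as the prefix of the sample's FASTQ and log file names (and as the accession when the data is downloaded from SRA)
* `condition`: experimental condition of the sample (e.g. `Treatment`, `Normal`)
* `replicate`: replicate number of the sample within its condition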

Example:

| SampleID | condition | replicate |
| --- | --- | --- |
| SRR8308716 | Treatment | 1 |
| SRR8308717 | Treatment | 2 |
| SRR8308720 | Normal | 1 |
| SRR8308721 | Normal | 2 |


In [None]:
# Sample sheet: a tab-separated table with one row per sample.
factors_file = os.path.join(DATA, DATASET, 'factors.txt')
factors = pandas.read_csv(factors_file, sep='\t')

# Fail early, with a readable message, when a documented column is absent;
# otherwise the problem only shows up later as a bare KeyError while the
# per-sample commands are being generated.
required_columns = ('SampleID', 'condition', 'replicate')
missing_columns = [column for column in required_columns if column not in factors.columns]
if missing_columns:
    raise ValueError('{0} is missing required column(s): {1}'.format(
        factors_file, ', '.join(missing_columns)))
factors

{% if cookiecutter.is_data_in_SRA == 'y' %}

## Retrieving data from the BioProject ID using fastq-dump

In [None]:
log_suffix = 'download.log'
{% if cookiecutter.sequencing_technology is defined and cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2} --split-files'.format(
        CWLRUNNER, CWLWORKFLOWS, 4)

{% if cookiecutter.create_demo == 'y' %}
cmd_header = '{0} -X {{ cookiecutter.number_spots}} '.format(cmd_header)
{% endif %}

with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_1.fastq.gz') or not os.path.exists(r['SampleID'] + '_2.fastq.gz'):
            fin.write('{0} --accession {1} > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% else %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2}'.format(
        CWLRUNNER, CWLWORKFLOWS, 4)

{% if cookiecutter.create_demo is defined and cookiecutter.create_demo == 'y' %}
cmd_header = '{0} -X {{ cookiecutter.number_spots}} '.format(cmd_header)
{% endif %}

with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '.fastq.gz'):
            fin.write('{0} --accession {1} > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 4
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(data_dir, log_suffix)
{% else %}

## Pre-processing data

In [None]:
log_suffix = 'fastqc.log'
{% if cookiecutter.sequencing_technology is defined and cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_1_fastqc.html') or not os.path.exists(r['SampleID'] + '_1_fastqc.zip') or\
            not os.path.exists(r['SampleID'] + '_2_fastqc.html') or not os.path.exists(r['SampleID'] + '_2_fastqc.zip'):
            fin.write('{0} --fastq {1}_1.fastq.gz --fastq {1}_2.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% else %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} --fastq '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_fastqc.html') or not os.path.exists(r['SampleID'] + '_fastqc.zip'):
            fin.write('{0} {1}.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 4
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(data_dir, log_suffix)
{% endif %}
