In [None]:
%run ../config/init.py

## Setting workdir to `data/{{ cookiecutter.dataset_name }}`

In [None]:
# Build the dataset path DATA/DATASET and pass it to working_dir().
# NOTE(review): working_dir() comes from config/init.py; per the heading above it
# presumably switches the working directory and returns the path - confirm there.
data_dir = working_dir(os.path.join(DATA, DATASET))

## Loading sample table file
The sample table file, **sample_table.csv**, should be in the folder `data/{{ cookiecutter.dataset_name }}`.

The "sample_table.csv" file should have at least the following columns:  
`sample_name,file,condition,replicate`  
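Columns:

- `sample_name`: unique sample identifier (the SRA run accession when the data is retrieved from SRA).
- `file`: FASTQ file name(s) for the sample; paired-end mates are separated by `|`. It may be left empty when the data is retrieved from SRA, in which case this notebook fills it in.
- `condition`: experimental condition of the sample.
- `replicate`: replicate number of the sample.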

Example:

```
sample_name,file,condition,replicate
SRR2126784,,PRE_NACT,1
SRR2126785,,PRE_NACT,1
SRR2126786,,PRE_NACT,1
```

In [None]:
# Load the sample table. keep_default_na=False makes pandas keep empty cells
# as '' instead of converting them to NaN.
sample_table_file = os.path.join(DATA, DATASET, 'sample_table.csv')
sample_table = pandas.read_csv(sample_table_file, keep_default_na=False)

# Fail early, with a clear message, when a documented column is absent instead
# of hitting an obscure KeyError in a later cell.
required_columns = ['sample_name', 'file', 'condition', 'replicate']
missing_columns = [c for c in required_columns if c not in sample_table.columns]
if missing_columns:
    raise ValueError('{} is missing required column(s): {}'.format(
        sample_table_file, ', '.join(missing_columns)))

sample_table.head()
{% if cookiecutter.is_data_in_SRA == 'y' %}

## Retrieving data using fastq-dump

In [None]:
log_file = 'download.log'

{% if cookiecutter.sequencing_technology is defined and cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2} --split-files --ncbi_config {3} '.format(
    CWLRUNNER, CWLWORKFLOWS, 2, NCBI_DIR)
{% else %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2} --ncbi_config {3} '.format(
    CWLRUNNER, CWLWORKFLOWS, 2, NCBI_DIR)
{% endif %}

{% if cookiecutter.create_demo == 'y' %}
cmd_header += ' -X {{ cookiecutter.number_spots}} '
{% endif %}
samples_cmd = ''
for s in sample_table['sample_name'].unique():
{% if cookiecutter.sequencing_technology is defined and cookiecutter.sequencing_technology == 'paired-end' %}
    if not os.path.exists(s + '_1.fastq.gz') or not os.path.exists(s + '_2.fastq.gz'):
        samples_cmd += '--accession {} '.format(s)
{% else %}
    if not os.path.exists(s + '.fastq.gz'):
        samples_cmd += '--accession {} '.format(s)
{% endif %}
cmd_header += '{} > {} 2>&1 &'.format(samples_cmd, log_file)
run_command(cmd_header)

### Checking command output
Re-run the next cell until it prints: **Run completed**

In [None]:
# Inspect the download log (log_file is 'download.log' here); re-run this cell
# until it reports "Run completed".
check_cwl_command_log(log_file)

### Add files to the sample table

In [None]:
{% if cookiecutter.sequencing_technology is defined and cookiecutter.sequencing_technology == 'paired-end' %}
for i, r in sample_table.iterrows():
    if not r['file']:
        r1 = r['sample_name'] + '_1.fastq.gz'
        r2 = r['sample_name'] + '_2.fastq.gz'
        sample_table.at[i, 'file'] = '{}|{}'.format(r1,r2)
{% else %}
for i, r in sample_table.iterrows():
    if not r['file']:
        sample_table.at[i, 'file'] =  r['sample_name'] + '.fastq.gz'
{% endif %}
sample_table.to_csv(sample_table_file, index=None)

{% else %}

## Pre-processing QC 

In [None]:
# Queue every FASTQ file listed in the sample table that has no FastQC HTML
# report yet, and launch the fastqc-parallel CWL workflow on the queue.
log_file = 'fastqc.log'

fastqc_yml = {
    'threads': 2,
    'fastqs': []
}

queued = set()
for file_field in sample_table['file']:
    # One cell may list several files separated by '|'.
    for f in file_field.split('|'):
        # Skip blank entries (an empty cell would otherwise queue data_dir
        # itself as a File) and files already queued from another row.
        if not f or f in queued:
            continue
        fastqc_report = f.replace('.fastq.gz', '_fastqc.html')
        # The report is looked up relative to the current working directory.
        if not os.path.exists(fastqc_report):
            queued.add(f)
            fastqc_yml['fastqs'].append({'class': 'File', 'path': os.path.join(data_dir, f)})

# Launch only when at least one file still needs a report.
if fastqc_yml['fastqs']:
    write_to_yaml(fastqc_yml, 'fastqc.yml')
    # stdout/stderr go to log_file; the trailing '&' detaches the job so the
    # log can be polled from the next cell.
    cmd_header = '{0} {1}/pre-processing/fastqc-parallel.cwl fastqc.yml > {2} 2>&1 & '.format(
            CWLRUNNER, CWLWORKFLOWS, log_file)
    run_command(cmd_header)

### Checking command output
Re-run the next cell until it prints: **Run completed**

In [None]:
# Inspect the FastQC log (log_file is 'fastqc.log' here); re-run this cell
# until it reports "Run completed".
check_cwl_command_log(log_file)
{% endif %}