In [None]:
# Run the shared project configuration script in this kernel's namespace.
# The cells below use names they never define themselves (os, pandas, DATA,
# DATASET, CWLRUNNER, CWLWORKFLOWS, CWLTOOLS, check_errors_from_logs), so
# they are presumably provided by this script -- verify against init.py.
%run ../config/init.py

## Setting workdir to `data/{{ cookiecutter.dataset_name }}`

In [None]:
# Per-dataset working directory; later cells write the 'commands' file and
# look for FASTQ/report files relative to it.
data_dir = os.path.join(DATA, DATASET)
# makedirs with exist_ok=True also creates a missing parent directory and is
# safe to re-run (no exists()/mkdir() race), unlike a bare os.mkdir.
os.makedirs(data_dir, exist_ok=True)
os.chdir(data_dir)

{% if cookiecutter.is_data_in_SRA == 'y' %}
## Loading sample file SRA Run table
This example table is for BioProject:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
PRJNA508970
{% else %}
PRJNA339968
{% endif %}
Save the **SraRunTable.txt** file from the NCBI Run Selector and place it in:  
`data/{{ cookiecutter.dataset_name }}`

In [None]:
# Location of the run table inside the dataset directory.
sraruntable_file = os.path.join(DATA, DATASET, 'SraRunTable.txt')
# The table is parsed as tab-separated text.
sra_df = pandas.read_csv(sraruntable_file, delimiter='\t')
# Preview the first five rows.
sra_df.head(5)

### Filtering the SRA Run table and creating the "factors.txt" file

The "factors.txt" file should have the columns:  
`| id | SampleID | condition | replicate |`  
Columns:

The values in parentheses are taken from the example BioProject named above:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
 * id: A sample name (GSM3505829)
 * SampleID: SRA run id (SRR8308716)
 * condition: Sample condition to analyze (Wild-type). At least two distinct conditions are required across all samples.
 * replicate: Replicate number (1)

Example:

| id | SampleID | condition | replicate |
| --- | --- | --- | --- |
| GSM3505829 | SRR8308716 | Wild-type | 1 |
| GSM3505830 | SRR8308717 | Wild-type | 2 |
| GSM3505833 | SRR8308720 | Dp(16Lipi-Zbtb21)1TybEmcf | 1 |
| GSM3505834 | SRR8308721 | Dp(16Lipi-Zbtb21)1TybEmcf | 2 |
{% else %}
 * id: A sample name (classical01)
 * SampleID: SRA run id (SRR4053795)
 * condition: Sample condition to analyze (classical). At least two distinct conditions are required across all samples.
 * replicate: Replicate number (1)

Example:

| id | SampleID | condition | replicate |
| --- | --- | --- | --- |
| classical01 | SRR4053795 | classical | 1 |
| classical02 | SRR4053796 | classical | 1 |
| nonclassical01 | SRR4053802 | nonclassical | 1 |
| nonclassical02 | SRR4053803 | nonclassical | 1 |
{% endif %}

Modify the cell below to extract the columns needed for your project from **SraRunTable.txt**.

In [None]:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
factors = sra_df[['Sample_Name', 'Run', 'treatment']]
factors = factors.rename(index=str, columns={'Run': 'SampleID', 'Sample_Name':'id', 'treatment': 'condition'})
factors['condition'] = factors['condition'].str.replace('[^a-zA-Z]', '')
factors['replicate'] = 1
{% else %}
factors = sra_df[['Sample_Name', 'Run']]
factors = factors.rename(index=str, columns={'Run': 'SampleID', 'Sample_Name':'id'})
factors['condition'] = factors['id'].str[:-2]
factors['condition'] = factors['condition'].str.replace('[^a-zA-Z]', '')
factors['replicate'] = 1 
{% endif %}
factors_file = os.path.join(DATA, DATASET, 'factors.txt')
factors.to_csv(factors_file, index=None, sep='\t')
factors

## Retrieving data from the BioProject ID using fastq-dump

In [None]:
log_suffix = 'download.log'
{% if cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2} --split-files'.format(
        CWLRUNNER, CWLWORKFLOWS, 4)

{% if cookiecutter.create_demo == 'y' %}
cmd_header = '{0} -X {{ cookiecutter.number_spots}} '.format(cmd_header)
{% endif %}

with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_1.fastq.gz') or not os.path.exists(r['SampleID'] + '_2.fastq.gz'):
            fin.write('{0} --accession {1} > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% else %}
cmd_header = '{0} {1}/sra/download_quality_control.cwl --threads {2}'.format(
        CWLRUNNER, CWLWORKFLOWS, 4)

{% if cookiecutter.create_demo == 'y' %}
cmd_header = '{0} -X {{ cookiecutter.number_spots}} '.format(cmd_header)
{% endif %}

with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '.fastq.gz'):
            fin.write('{0} --accession {1} > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 4
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(data_dir, log_suffix)
{% else %}

## Loading data from "factors.txt" file
First copy your data to:  `data/{{ cookiecutter.dataset_name }}` 

Then, create a "factors.txt" file. 

The "factors.txt" file should have the columns:  
`| id | SampleID | condition | replicate |`  
Columns:

{% if cookiecutter.sequencing_technology == 'paired-end' %}
For example:
 * id: A sample name (classical01)
 * SampleID: Sample id (classical01_L000_R1). The files in `data/{{ cookiecutter.dataset_name }}` should be named: classical01_L000_R1_1.fastq.gz and classical01_L000_R1_2.fastq.gz
 * condition: Sample condition to analyze (classical). At least two distinct conditions are required across all samples.
 * replicate: Replicate number (1)

Example:

| id | SampleID | condition | replicate |
| --- | --- | --- | --- |
| classical01 | classical01_L000_R1 | classical | 1 |
| classical02 | classical01_L000_R2 | classical | 1 |
| nonclassical01 | nonclassical01_L000_R1 | nonclassical | 1 |
| nonclassical02 | nonclassical01_L000_R2 | nonclassical | 2 |
{% else %}
For example:
 * id: A sample name (classical01)
 * SampleID: Sample id (classical01_L000_R1). The file in `data/{{ cookiecutter.dataset_name }}` should be named: classical01_L000_R1.fastq.gz
 * condition: Sample condition to analyze (classical). At least two distinct conditions are required across all samples.
 * replicate: Replicate number (1)

Example:

| id | SampleID | condition | replicate |
| --- | --- | --- | --- |
| classical01 | classical01_L000_R1 | classical | 1 |
| classical01 | classical01_L000_R2 | classical | 1 |
| nonclassical01 | nonclassical01_L000_R1 | nonclassical | 1 |
| nonclassical01 | nonclassical01_L000_R2 | nonclassical | 2 |
{% endif %}

In [None]:
# Location of the user-provided sample sheet inside the dataset directory.
factors_file = os.path.join(DATA, DATASET, 'factors.txt')
# The sample sheet is parsed as tab-separated text.
factors = pandas.read_csv(factors_file, delimiter='\t')
# Preview the first five rows.
factors.head(5)

## Pre-processing data

In [None]:
log_suffix = 'fastqc.log'
{% if cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_1_fastqc.html') or not os.path.exists(r['SampleID'] + '_1_fastqc.zip') or\
            not os.path.exists(r['SampleID'] + '_2_fastqc.html') or not os.path.exists(r['SampleID'] + '_2_fastqc.zip'):
            fin.write('{0} --fastq {1}_1.fastq.gz --fastq {1}_2.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% else %}
cmd_header = '{0} {1}/fastqc/fastqc.cwl --threads {2} --fastq '.format(
        CWLRUNNER, CWLTOOLS, 4)
with open('commands', "w") as fin:
    for i,r in factors.iterrows():
        if not os.path.exists(r['SampleID'] + '_fastqc.html') or not os.path.exists(r['SampleID'] + '_fastqc.zip'):
            fin.write('{0} {1}.fastq.gz > {1}_{2} 2>&1\n'.format(cmd_header, r['SampleID'], log_suffix))
{% endif %}
{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j 4
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(data_dir, log_suffix)
{% endif %}
