In [None]:
%run ../config/init.py
import itertools

### Creating results folders
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}

In [None]:
# Inputs: the read-count matrix written by the quantification step and the
# tab-separated factors table of the dataset.
data_dir = os.path.join(RESULTS, DATASET, 'quantification')
matrix_file = os.path.join(data_dir, 'ExonReads.tsv')
factors_file = os.path.join(DATA, DATASET, 'factors.txt')

# Outputs: everything below is written into the 'dga' folder, which is
# created on first run and becomes the working directory.
result_dir = os.path.join(RESULTS, DATASET, 'dga')
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
os.chdir(result_dir)

# Load the factors table and show it as the cell output.
factors = pandas.read_csv(factors_file, sep='\t')
factors

### Creating comparisons
The next cell builds a list with all pairwise combinations of the **conditions** found in the factors table.  
 
If you only want to explore a subset of comparisons, replace that cell with a manually defined **comparisons** list, for example:
```
comparisons = [
    ['cond1', 'cond2'],
    ['cond2', 'cond3']
]
``` 

In [None]:
# Every unordered pair of distinct values in the 'condition' column,
# each stored as a two-element list.
comparisons = [
    list(pair)
    for pair in itertools.combinations(factors['condition'].unique(), 2)
]
comparisons

### Executing Deseq2 and EdgeR workflows

In [None]:
log_suffix = 'dga.log'

cmd_deseq2_header = '{0} {1}/R/deseq2-2conditions.cwl --factor {2} --matrix {3} --gene_column Gene_Chr_Start --sample_column SampleID --fc {4} --fdr {5}'.format(
        CWLRUNNER, CWLTOOLS, factors_file, matrix_file, fc, fdr)
cmd_edgeR_header = '{0} {1}/R/edgeR-2conditions.cwl --factor {2} --matrix {3} --gene_column Gene_Chr_Start --sample_column SampleID --fc {4} --fdr {5} --gene_length_column ExonLength '.format(
        CWLRUNNER, CWLTOOLS, factors_file, matrix_file, fc, fdr)

with open('commands_dga', "w") as fin:
    for c in comparisons:
        if not os.path.exists('condition_{0}_vs_{1}_deseq2.csv'.format(c[0], c[1])):
            fin.write('{0} --condition1 {1} --condition2 {2} > {1}_vs_{2}_deseq2_{3} 2>&1\n'.format(cmd_deseq2_header, c[0], c[1], log_suffix))
        if not os.path.exists('condition_{0}_vs_{1}_edgeR.csv'.format(c[0], c[1])):
            fin.write('{0} --condition1 {1} --condition2 {2} > {1}_vs_{2}_edgeR_{3} 2>&1\n'.format(cmd_edgeR_header, c[0], c[1], log_suffix))

{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands_dga | parallel -j {{ cookiecutter.max_number_threads }}
{% else %}
!sh commands_dga    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

### Merging DESeq2 and EdgeR results and creating volcano and heatmap plots

In [None]:
log_suffix_union = 'union.log'
    
cmd_volcano_header = '{0} {1}/R/volcano_plot.cwl --fdr {2} --fc {3} '.format(CWLRUNNER, CWLTOOLS, fdr, fc)

cmd_heatmap_header = '{0} {1}/R/dga_heatmaps.cwl --factor {2} --matrix {3} --gene_column Gene_Chr_Start --sample_column SampleID --fdr {4} --fc {5}'.format(
        CWLRUNNER, CWLTOOLS, factors_file, matrix_file, fdr, fc)
    
with open('commands_union_plots', "w") as fin:
    for c in comparisons:
        deseq2_df = pandas.read_csv('condition_{0}_vs_{1}_deseq2.csv'.format(c[0], c[1]))
        edgeR_df = pandas.read_csv('condition_{0}_vs_{1}_edgeR.csv'.format(c[0], c[1]))
        df = edgeR_df.merge(deseq2_df, left_on='Gene_Chr_Start', right_on='Gene_Id')
        df = df.drop(['Gene_Chr_Start'], axis=1)
        union_df = df[(df['FDR'] <= fdr) & (abs(df['logFC']) >= fc) & (df['padj'] <= fdr) & (abs(df['log2FoldChange']) >= fc)]
        unifying_data = []
        for i, r in df.iterrows():
            if abs(r['logFC']) == r['logFC']:
                logFC = min(float(r['logFC']), float(r['log2FoldChange']))
            else:
                logFC = max(float(r['logFC']), float(r['log2FoldChange']))
            unifying_data.append({
                'Gene_Id': r['Gene_Id'],
                'FDR': max(float(r['FDR']), float(r['padj'])),
                'logFC': logFC
            })
        unifying_data_df = pandas.DataFrame(unifying_data)
        unifying_data_df = unifying_data_df[['Gene_Id', 'logFC', 'FDR']]
        if len(unifying_data_df) > 0:
            unifying_file = 'condition_{0}_vs_{1}_union.csv'.format(c[0], c[1])
            unifying_data_df.to_csv(unifying_file, index=None)
    
            over_df = unifying_data_df[(unifying_data_df['FDR'] <= fdr)&(unifying_data_df['logFC'] >= fc)]
            over_df.to_csv('condition_{0}_vs_{1}_union_over-expressed.csv'.format(c[0], c[1]), index=None)
            under_df = unifying_data_df[(unifying_data_df['FDR'] <= fdr)&(unifying_data_df['logFC'] <= -1.0 * fc)]
            under_df.to_csv('condition_{0}_vs_{1}_union_under-expressed.csv'.format(c[0], c[1]), index=None)
    
            print("Genes with FDR <= %.2f and logFC >= %.2f: %d" % (fdr, fc, len(over_df)))
            print("Genes with FDR <= %.2f and logFC <= -%.2f: %d" % (fdr, fc, len(under_df)))
    
            fin.write('{0} --data {1} --out condition_{2}_vs_{3}_union_volcano.pdf > {2}_vs_{3}_volcano_{4} 2>&1\n'.format(
                cmd_volcano_header, unifying_file,c[0], c[1], log_suffix_union))
            fin.write('{0} --dga_data {1} --out_expression {2} --out_correlation {3} --out_pca {4} > {5}_vs_{6}_heatmap_{7} 2>&1\n'.format(
                cmd_heatmap_header, unifying_file,
                'condition_{0}_vs_{1}_union_expression_heatmap.pdf'.format(c[0], c[1]),
                'condition_{0}_vs_{1}_union_correlation_heatmap.pdf'.format(c[0], c[1]),
                'condition_{0}_vs_{1}_union_pca.pdf'.format(c[0], c[1]),
                     c[0], c[1], log_suffix_union))
    
!cat commands_union_plots | parallel -j {{ cookiecutter.max_number_threads }}

check_errors_from_logs(result_dir, log_suffix_union)
{% elif cookiecutter.ngs_data_type == 'ChIP-exo' %} 

In [None]:
# Tab-separated factors table of the dataset.
factors = pandas.read_csv(os.path.join(DATA, DATASET, 'factors.txt'), sep='\t')

# Collect the names of all BED files found anywhere below the peak-calling
# folder. Only the base names are kept, not the sub-folder they were found in.
data_dir = os.path.join(RESULTS, DATASET, 'peak_calling')
os.chdir(data_dir)
files = []
for _root, _subdirs, names in os.walk('./'):
    files.extend(name for name in names if name.endswith('.bed'))

# Motif results go into the 'motif' folder, created on first run; it becomes
# the working directory.
result_dir = os.path.join(RESULTS, DATASET, 'motif')
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
os.chdir(result_dir)

### DNA binding motif finding
To use MEME, download the motif databases from http://meme-suite.org/doc/download.html

The motif database archive (file `motif_databases.X.X.tgz`) should be uncompressed into the `data/meme` folder, resulting 
in a structure like this:

```
    data
    ├── meme
    │   └── motif_databases
    │       ├── ARABD
    │       ├── CIS-BP
    │       ├── CISBP-RNA
    │       ├── ECOLI
    │       ├── EUKARYOTE
    │       ├── FLY
    │       ├── HUMAN
    │       ├── JASPAR
    │       ├── MALARIA
    │       ├── MIRBASE
    │       ├── MOUSE
    │       ├── PROKARYOTE
    │       ├── PROTEIN
    │       ├── RNA
    │       ├── TFBSshape
    │       ├── WORM
    │       └── YEAST
    └── PRJNA238004
```

In this example we will be using the *E. coli* databases:

* SwissRegulon_e_coli.meme
* dpinteract.meme

If you work with a different organism, edit the variable `MEME_DB_CATEGORY = 'ECOLI'` and the `meme_db` list accordingly. 

In [None]:
MEME_DB_CATEGORY = 'ECOLI'

log_suffix = 'meme.log'
meme_db_path = os.path.join(DATA,'meme', 'motif_databases', MEME_DB_CATEGORY)
meme_db = [
    'SwissRegulon_e_coli.meme',
    'dpinteract.meme'
]
for d in meme_db:
    os.chdir(result_dir)
    print('MEME DB: ' + d)

    db = os.path.join(meme_db_path, d)
    cmd_header = '{0} {1}/ChIP-Seq/meme-motif.cwl --nmotifs 10 --memedb {2} --genome {3} --bed'.format(
        CWLRUNNER, CWLWORKFLOWS, db, GENOME_FASTA)
    
    if not os.path.exists(d.replace('.meme','')):
        os.mkdir(d.replace('.meme',''))
    os.chdir(d.replace('.meme',''))    
    
    with open('commands_meme', "w") as fin:
        for f in files:
            fin.write('{0} {1} > {2}_{3} 2>&1 \n'.format(cmd_header, os.path.join(data_dir, f), 
                      f.replace('.bed',''), log_suffix))
        
    !cat commands_meme | parallel -j {{ cookiecutter.max_number_threads }}

    check_errors_from_logs(os.path.join(result_dir,d.replace('.meme','')), log_suffix)   
{% elif cookiecutter.ngs_data_type == 'ChIP-Seq' %} 

In [None]:
# Inputs: the alignments folder and the peak-calling folder of the dataset.
# NOTE(review): the ChIP-exo branch of this notebook reads peaks from
# 'peak_calling' (underscore) - confirm that 'peak-calling' is the folder
# name used by the ChIP-Seq pipeline.
bam_dir = os.path.join(RESULTS, DATASET, 'alignments')
bed_dir = os.path.join(RESULTS, DATASET, 'peak-calling')

# Outputs go into the 'diffbind' folder, created on first run; it becomes the
# working directory.
result_dir = os.path.join(RESULTS, DATASET, 'diffbind')
if not os.path.exists(result_dir):
    os.mkdir(result_dir)
os.chdir(result_dir)

# Load the tab-separated factors table and show it as the cell output.
factors_path = os.path.join(DATA, DATASET, 'factors.txt')
factors = pandas.read_csv(factors_path, sep='\t')
factors

### Generating comparison array

In [None]:
# All pairwise combinations of the distinct values in the 'condition' column,
# each converted from a tuple to a list.
condition_pairs = itertools.combinations(factors['condition'].unique(), 2)
comparisons = list(map(list, condition_pairs))
comparisons

### Processing samples

In [None]:
cmd_header = '{0} {1}/R/DiffBind.cwl --bamDir {2} --bedDir {3} --factor samples.tsv'.format(
    CWLRUNNER, CWLTOOLS, bam_dir, bed_dir)
for c in comparisons:
    minMembers = min(len(factors[factors['condition'] == c[0]]), len(factors[factors['condition'] == c[1]]))
    if minMembers >= 2:
        os.chdir(result_dir)
        comp = '{0}_vs_{1}'.format(c[0], c[1])
        if not os.path.exists(comp):
            os.mkdir(comp)
        os.chdir(comp)
        df = factors[(factors['condition'] == c[0]) | (factors['condition'] == c[1])]
        df = df.rename(index=str, columns={'id': 'id', 'condition': 'Condition', 'replicate':'Replicate'})
        df.to_csv('samples.tsv', index=None, sep='\t')
        with open('commands', "w") as fin:
            fin.write('{0} --minMembers {1} > diffbind.log 2>&1\n'.format(cmd_header, minMembers))
        print('Processing {0} vs {1}'.format(c[0], c[1]))
        !cat commands | parallel -j 16
        check_errors_from_logs(os.path.join(result_dir, comp), '.log')
      
{% endif %}
