# Project Report
 
__Project Name__: {{ cookiecutter.project_name }}  
__Dataset Name__: {{ cookiecutter.dataset_name }}  
__Author__: {{ cookiecutter.author_name }}  
__Email__: {{ cookiecutter.email }}  
__Dataset__: {{ cookiecutter.dataset_name }}  
__Created Date__: {% now 'local', '%d/%m/%Y' %}  

## Outline

{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}
This is a Differential Gene Expression Analysis workflow for RNA-Seq data
{%- elif cookiecutter.ngs_data_type == 'ChIP-Seq' -%}
This is a Differential Binding Analysis workflow for ChIP-Seq data
{%- elif cookiecutter.ngs_data_type == 'ChIP-exo' -%}
This is a Differential Binding Analysis workflow for ChIP-exo data
{% endif %}

Our workflow is based on a set of Jupyter Notebooks and [CWL workflows](https://gitlab.com/r78v10a07/cwl-workflow/). 
The workflows execute the analysis using the following tools: 

 * FastQC, for pre-processing quality control
 * Trimmomatic, for reads trimming
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}
 * STAR, for reads alignment
 * RSeQC, for alignment quality control
 * TPMCalculator, for mRNA abundance quantification
 * DESeq2, for differential gene expression analysis
 * edgeR, for differential gene expression analysis
{%- elif cookiecutter.ngs_data_type == 'ChIP-Seq' -%}
 * BWA, for reads alignment
 * Phantompeakqualtools, for ChIP-Seq quality control
 * MACS2, for peak calling
 * IDR, for Irreproducible Discovery Rate (IDR) peak classification
 * Homer, for peak annotation
 * DiffBind, for differential binding analysis
{%- elif cookiecutter.ngs_data_type == 'ChIP-exo' -%}
 * BWA, for reads alignment
 * Phantompeakqualtools, for ChIP-Seq quality control
 * MACE, for peak calling
 * MEME, for motif finding
{% endif %} 


### Workflow  steps

 1. [Pre-processing QC](#1.-Pre-processing-QC)
 2. [Trimming](#2.-Trimming)
 3. [Alignment](#3.-Alignment)
     - [Alignment Quality Control](#3.1.-Alignment-QC)
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %} 
 4. [Quantification](#4.-Quantification)
 5. [Differential Gene Expression Analysis](#5.-Differential-Gene-Expression-Analysis)
 6. [GO enrichment](#6.-GO-enrichment)
{%- elif cookiecutter.ngs_data_type == 'ChIP-Seq' -%}
 4. [Peak Calling](#4.-Peak-Calling)
     - [Irreproducible Discovery Rate](#4.1.-Irreproducible-Discovery-Rate) 
 5. [Differential binding Detection](#5.-Differential-binding-Detection)
{%- elif cookiecutter.ngs_data_type == 'ChIP-exo' -%}
 4. [Peak Calling](#4.-Peak-Calling)
 5. [DNA Motif finding](#5.-DNA-Motif-finding)
{% endif %}
 
### Requirements

 1. Python 3.6+
    - numpy
    - scipy
    - pandas
    - seaborn
    - matplotlib
    - jupyter
    - cwltool
    - jupyterngsplugin
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}
    - networkx
    - goenrichment
{% endif %}
 2. CWL Tools definition files: [cwl-workflow](https://gitlab.com/r78v10a07/cwl-workflow/)       

In [None]:
%run ../config/init.py

from jupyterngsplugin.markdown.fastqc import fastqc_table, fastqc_trimmomatic_table
from jupyterngsplugin.markdown.alignment import alignment_table
from jupyterngsplugin.markdown.alignment import reads_distribution_plot

{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}
from jupyterngsplugin.markdown.rnaseq.quantification import tpmcalculator_distribution_plot
from jupyterngsplugin.markdown.rnaseq.dga import dga_table, dga_gene_list_union
from jupyterngsplugin.markdown.rnaseq.go import go_plots_table, go_html_table
from jupyterngsplugin.markdown.rseqc import rseqc_table
from jupyterngsplugin.markdown.rseqc import rseqc_plot_table
{%- elif cookiecutter.ngs_data_type == 'ChIP-Seq' -%}
from jupyterngsplugin.markdown.phantompeakqualtools import qc_table
from jupyterngsplugin.markdown.chipseq import peak_calling_table_with_qc
from jupyterngsplugin.markdown.chipseq import idr_table
from jupyterngsplugin.markdown.chipseq import diffbind_table
{%- elif cookiecutter.ngs_data_type == 'ChIP-exo' -%}
from jupyterngsplugin.markdown.phantompeakqualtools import qc_table
from jupyterngsplugin.markdown.chipexo import peak_calling_table_with_qc
from jupyterngsplugin.markdown.chipexo import meme_motif_table
{% endif %}
from jupyterngsplugin.markdown.utils import hide_code_str
from jupyterngsplugin.utils.notebook import save_2_html
HTML(hide_code_str())

## 1. Pre-processing QC

In [None]:
# Collect the sample names: every *.fastq.gz file found anywhere under the
# dataset directory, with the '.fastq.gz' part removed, in sorted order.
data_dir = os.path.join(DATA, DATASET)
os.chdir(data_dir)
samples = sorted(
    f.replace('.fastq.gz', '')
    for ds, dr, fs in os.walk('./')
    for f in fs
    if f.endswith('.fastq.gz')
)

# Back in the notebooks directory, link to the pre-processing QC notebook
# (spaces in the file name are percent-encoded for the href).
os.chdir(NOTEBOOKS)
name = '01 - Pre-processing QC'
str_msg = ''.join([
    '<a href="', name.replace(' ', '%20'),
    '.ipynb" target="_blank">', name, '</a>\n',
])
display(Markdown(str_msg))

# FastQC report table; the dataset directory is passed for both path
# arguments of fastqc_table.
display(Markdown('#### FastQC report\n'))
samples_data, str_msg = fastqc_table(samples, data_dir, data_dir)
display(Markdown(str_msg))
del str_msg

## 2. Trimming

In [None]:
# Link to the trimming notebook, then show the table built by
# fastqc_trimmomatic_table from the 'trimmomatic' results directory.
os.chdir(NOTEBOOKS)
name = '02 - Samples trimming'
str_msg = ''.join([
    '<a href="', name.replace(' ', '%20'),
    '.ipynb" target="_blank">', name, '</a>\n\n',
])
display(Markdown(str_msg))

# samples_data is passed in and rebound to the first returned value.
samples_data, str_msg = fastqc_trimmomatic_table(
    samples_data,
    samples,
    os.path.join(RESULTS, DATASET, 'trimmomatic'),
)

display(Markdown(str_msg))
del str_msg

{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}

## 3. Alignment

In [None]:
os.chdir(NOTEBOOKS)
name = '03 - Alignments'
str_msg = '<a href="' + name.replace(' ', '%20') + '.ipynb" target="_blank">' + name + '</a>\n'
str_msg += '### Reference genome\n**{{ cookiecutter.genome_name }}**\n\n'
display(Markdown(str_msg))
samples = [ f.replace('_sorted.bam', '') for ds,dr,fs in os.walk(os.path.join(RESULTS, DATASET, 'alignments')) for f in fs if f.endswith('_sorted.bam')]
{% if cookiecutter.sequencing_technology == 'paired-end' %}
method = 'STAR_paired'
{% else %}
method = 'STAR_single'
{% endif %}
samples_data, str_msg = alignment_table(samples_data, samples, 
                                        os.path.join(RESULTS, DATASET, 'alignments'), method)
display(Markdown(str_msg))
del str_msg

reads_distribution_plot(samples_data, samples,(18,6), method)
plt.show()
plt.close()

## 3.1. Alignment-QC

In [None]:
# Heading followed by the table returned by rseqc_table, which is given the
# sample list and the 'quantification' results directory.
os.chdir(NOTEBOOKS)
display(Markdown('### RSeQC BAM Stats\n\n'))
str_msg = rseqc_table(
    samples,
    os.path.join(RESULTS, DATASET, 'quantification'),
)
display(Markdown(str_msg))
del str_msg

In [None]:
# Heading followed by the table returned by rseqc_plot_table; img_size is
# passed for both of its size arguments.
img_size = 250
display(Markdown('### RSeQC PDF plots\n\n'))

str_msg = rseqc_plot_table(
    samples,
    os.path.join(RESULTS, DATASET, 'quantification'),
    img_size,
    img_size,
)
display(Markdown(str_msg))
del str_msg

## 4. Quantification

In [None]:
# Link to the quantification notebook.
os.chdir(NOTEBOOKS)
name = '04 - Quantification'
str_msg = ''.join([
    '<a href="', name.replace(' ', '%20'),
    '.ipynb" target="_blank">', name, '</a>\n\n',
])
display(Markdown(str_msg))
del str_msg

# One tpmcalculator_distribution_plot call per column, all reading from the
# 'quantification' results directory with the same file suffix.
display(Markdown("### Exon TPM and reads distribution per sample"))
output_suffix = "_sorted_genes.out"
columns = ['ExonTPM', 'ExonReads']
for c in columns:
    tpmcalculator_distribution_plot(
        c,
        os.path.join(RESULTS, DATASET, 'quantification'),
        output_suffix,
        (10, 12),
    )
plt.show()
plt.close()

## 5. Differential Gene Expression Analysis

In [None]:
os.chdir(NOTEBOOKS)
# Maps the tool key used in result file names to its display label.
tools = {
    'deseq2': 'Deseq2',
    'edgeR': 'EdgeR',
    'union': 'Union',
}

# Condition names are recovered from the result files of the first tool key:
# 'condition_' and the '_<tool>.csv' part are removed from each file name.
t = next(iter(tools))
result_dir = os.path.join(RESULTS, DATASET, 'dga')
conditions = sorted(
    f.replace('condition_', '').replace('_' + t + '.csv', '')
    for d, ds, files in os.walk(result_dir)
    for f in files
    if '_' + t + '.csv' in f
)

img_size = 350
display(Markdown('### Plots\n\n'))
str_msg = dga_table(conditions, tools, result_dir, img_size, img_size)
display(Markdown(str_msg))
del str_msg

# fc and fdr are not assigned in this notebook; presumably they come from
# the init script run in the first cell (verify).
str_msg = ''.join([
    '### Cutoff:\n\n',
    "logFC: {:.3f}".format(fc), '\n\n',
    "FDR {:.3e}".format(fdr), '\n\n',
])
display(Markdown(str_msg))

str_msg = dga_gene_list_union(conditions, result_dir, '{{ cookiecutter.organism}}')
display(Markdown(str_msg))
del str_msg

## 6. GO enrichment

In [None]:
# GO enrichment output for the conditions found by the DGA cell: the table
# from go_plots_table first, then the one from go_html_table, both reading
# from the 'go' results directory.
os.chdir(NOTEBOOKS)
img_size = 350
result_dir = os.path.join(RESULTS, DATASET, 'go')

str_msg = go_plots_table(conditions, result_dir, img_size, img_size)
display(Markdown(str_msg))

str_msg = go_html_table(conditions, result_dir)
display(Markdown(str_msg))
del str_msg
{%- elif cookiecutter.ngs_data_type == 'ChIP-Seq' -%}

## 3. Alignment

In [None]:
# Link to the alignment notebook and show the reference genome name that
# the template fills in.
os.chdir(NOTEBOOKS)
name = '03 - Alignments'
str_msg = '<a href="' + name.replace(' ', '%20') + '.ipynb" target="_blank">' + name + '</a>\n'
str_msg += '### Reference genome\n**{{ cookiecutter.genome_name }}**\n\n'
display(Markdown(str_msg))
# Sample IDs come from the SampleID column of the tab-separated factors.txt
# file in the dataset directory.
factors = pandas.read_csv(os.path.join(DATA, DATASET, 'factors.txt'), sep='\t')
samples = factors['SampleID']
# The method label is chosen at template-render time from the configured
# sequencing technology.
{% if cookiecutter.sequencing_technology == 'paired-end' %}
method = 'BWA_paired'
{% else %}
method = 'BWA_single'
{% endif %}
# samples_data is passed in and rebound to the first returned value.
samples_data, str_msg =  alignment_table(samples_data, samples, os.path.join(RESULTS, DATASET, 'alignments'), method) 
display(Markdown(str_msg))
del str_msg

## 3.1. Alignment-QC

In [None]:
os.chdir(NOTEBOOKS)
# NOTE(review): this repeats the alignment_table call of the "3. Alignment"
# cell with the same arguments; confirm the duplicate table is intended.
samples_data, str_msg = alignment_table(
    samples_data,
    samples,
    os.path.join(RESULTS, DATASET, 'alignments'),
    method,
)
display(Markdown(str_msg))
del str_msg

### Distribution of Reads

In [None]:
# Draw the reads distribution for the samples and the alignment method set
# in the alignment cell, then show and close the current figure.
reads_distribution_plot(
    samples_data,
    samples,
    (18, 6),
    method,
)
plt.show()
plt.close()

## 3.2. Alignment QC

### Phantompeakqualtools

See https://github.com/kundajelab/phantompeakqualtools for more detailed info about the Phantompeakqualtools parameters

In [None]:
# Table returned by qc_table for the samples, built from the 'alignments'
# results directory; 450 is passed for both size arguments.
str_msg = qc_table(
    samples,
    os.path.join(RESULTS, DATASET, 'alignments'),
    450,
    450,
)
display(Markdown(str_msg))
del str_msg

## 4. Peak Calling
### Phantompeakqualtools savp PDF plots with MACS2 identified peaks for pooled samples

In [None]:
# Peak calling table: peak_calling_table_with_qc receives the factors table
# plus the 'alignments' and 'peak-calling' results directories.
os.chdir(NOTEBOOKS)
img_size = 250

peak_calling_path = os.path.join(RESULTS, DATASET, 'peak-calling')
alignment_path = os.path.join(RESULTS, DATASET, 'alignments')
str_msg = peak_calling_table_with_qc(
    factors,
    alignment_path,
    peak_calling_path,
    img_size,
    img_size,
)
display(Markdown(str_msg))
del str_msg

## 4.1. Irreproducible Discovery Rate

In [None]:
# Table returned by idr_table for the factors table, built from the 'idr'
# results directory; img_size is passed for both size arguments.
os.chdir(NOTEBOOKS)
img_size = 450
str_msg = idr_table(
    factors,
    os.path.join(RESULTS, DATASET, 'idr'),
    img_size,
    img_size,
)
display(Markdown(str_msg))
del str_msg

## 5. Differential binding Detection

In [None]:
# Show one "Condition" section per sub-directory found while walking the
# 'diffbind' results directory; underscores in the directory name are shown
# as spaces in the heading.
os.chdir(NOTEBOOKS)
img_size = 450
result_dir = os.path.join(RESULTS, DATASET, 'diffbind')
for dirname, dirnames, filenames in os.walk(result_dir):
    for subdirname in dirnames:
        str_msg = '### Condition: ' + subdirname.replace('_', ' ') + '\n\n'
        str_msg += diffbind_table(os.path.join(dirname, subdirname), img_size, img_size)

        display(Markdown(str_msg))
        # Delete the message here rather than after the loop: when the walk
        # finds no sub-directory, str_msg is never bound and a trailing
        # `del str_msg` would raise NameError.
        del str_msg
os.chdir(NOTEBOOKS)
{%- elif cookiecutter.ngs_data_type == 'ChIP-exo' -%}

## 3. Alignment

In [None]:
# Link to the alignment notebook and show the reference genome name that
# the template fills in.
os.chdir(NOTEBOOKS)
name = '03 - Alignments'
str_msg = '<a href="' + name.replace(' ', '%20') + '.ipynb" target="_blank">' + name + '</a>\n'
str_msg += '### Reference genome\n**{{ cookiecutter.genome_name }}**\n\n'
display(Markdown(str_msg))
# Sample IDs come from the SampleID column of the tab-separated factors.txt
# file in the dataset directory.
factors = pandas.read_csv(os.path.join(DATA, DATASET, 'factors.txt'), sep='\t')
samples = factors['SampleID']
# The method label is chosen at template-render time from the configured
# sequencing technology.
{% if cookiecutter.sequencing_technology == 'paired-end' %}
method = 'BWA_paired'
{% else %}
method = 'BWA_single'
{% endif %}
# samples_data is passed in and rebound to the first returned value.
samples_data, str_msg =  alignment_table(samples_data, samples, os.path.join(RESULTS, DATASET, 'alignments'), method) 
display(Markdown(str_msg))
del str_msg

### Distribution of Reads

In [None]:
# Draw the reads distribution for the samples and the alignment method set
# in the alignment cell, then show and close the current figure.
reads_distribution_plot(
    samples_data,
    samples,
    (18, 6),
    method,
)
plt.show()
plt.close()

## 3.1. Alignment-QC

### Phantompeakqualtools

See https://github.com/kundajelab/phantompeakqualtools for more detailed info about the Phantompeakqualtools parameters

In [None]:
# Table returned by qc_table for the samples, built from the 'alignments'
# results directory; 450 is passed for both size arguments.
str_msg = qc_table(
    samples,
    os.path.join(RESULTS, DATASET, 'alignments'),
    450,
    450,
)
display(Markdown(str_msg))
del str_msg

## 4. Peak Calling

### Phantompeakqualtools savp PDF plots with MACE identified peaks

In [None]:
os.chdir(NOTEBOOKS)
img_size = 250

# The factors table is read again here, so this cell does not depend on the
# alignment cell having been run.
factors = pandas.read_csv(
    os.path.join(DATA, DATASET, 'factors.txt'),
    sep='\t',
)
# NOTE(review): this branch reads 'peak_calling' (underscore) while the
# ChIP-Seq branch of the template reads 'peak-calling' (hyphen); confirm
# which directory name the workflow actually writes.
str_msg = peak_calling_table_with_qc(
    factors,
    os.path.join(RESULTS, DATASET, 'alignments'),
    os.path.join(RESULTS, DATASET, 'peak_calling'),
    img_size,
    img_size,
)
display(Markdown(str_msg))
del str_msg

## 5. DNA Motif finding
### MEME motifs for MACE identified peaks

In [None]:
os.chdir(NOTEBOOKS)
img_size = 250
# The factors table is read again here, so this cell does not depend on an
# earlier cell having been run.
factors = pandas.read_csv(
    os.path.join(DATA, DATASET, 'factors.txt'),
    sep='\t',
)
# Table returned by meme_motif_table, built from the 'motif' results
# directory; img_size is passed for both size arguments.
str_msg = meme_motif_table(
    factors,
    os.path.join(RESULTS, DATASET, 'motif'),
    img_size,
    img_size,
)
display(Markdown(str_msg))
del str_msg
{% endif %}

In [None]:
# Switch to the notebooks directory and pass this report's file name to
# save_2_html (imported from jupyterngsplugin.utils.notebook).
os.chdir(NOTEBOOKS)
save_2_html("00 - Project Report.ipynb")