# Project Report
 
__Project Name__: {{ cookiecutter.project_name }}  
__Dataset Name__: {{ cookiecutter.dataset_name }}  
__Author__: {{ cookiecutter.author_name }}  
__Email__: {{ cookiecutter.email }}  
__Dataset__: {{ cookiecutter.dataset_name }}  
__Created Date__: {% now 'local', '%d/%m/%Y' %}  

## Outline

This is a Differential Gene Expression Analysis workflow for RNA-Seq data.

Our workflow is based on a set of Jupyter Notebooks and [CWL workflows](https://github.com/ncbi/cwl-ngs-workflows-cbb). 
The workflows execute the analysis using the following tools:

 * FastQC, for pre-processing quality control
 * Trimmomatic, for reads trimming
 * STAR, for reads alignment
 * RSeQC, for alignment quality control
 * TPMCalculator, for mRNA abundance quantification
 * DESeq2, for differential gene expression analysis (DGA)
 * edgeR, for DGA

### Workflow steps

 1. [Pre-processing QC](#1.-Pre-processing-QC)
 2. [Trimming](#2.-Trimming)
 3. [Alignments and Quantification](#3.-Alignments-and-Quantification)
     - [Alignment Quality Control](#3.1.-Alignment-QC)
     - [Quantification](#3.2.-Quantification)
 4. [Differential Gene Expression Analysis](#4.-Differential-Gene-Expression-Analysis)
 5. [GO enrichment](#5.-GO-enrichment)
 
### Requirements

 1. PM4NGS
 2. Poppler (https://poppler.freedesktop.org/)
 3. Full list of requirements: *requirements/conda-env-dependencies.yaml*
{% if cookiecutter.use_docker == 'y' %}
 4. Docker
{% endif %}   

In [None]:
%run ../config/init.py
from pm4ngs.jupyterngsplugin.markdown.fastqc import fastqc_table, fastqc_trimmomatic_table
from pm4ngs.jupyterngsplugin.markdown.alignment import alignment_table
from pm4ngs.jupyterngsplugin.markdown.alignment import reads_distribution_plot
from pm4ngs.jupyterngsplugin.markdown.rnaseq.quantification import tpmcalculator_distribution_plot
from pm4ngs.jupyterngsplugin.markdown.rnaseq.dga import dga_table, dga_gene_list_intersection
from pm4ngs.jupyterngsplugin.markdown.rnaseq.go import go_plots_table, go_html_table
from pm4ngs.jupyterngsplugin.markdown.rseqc import rseqc_table
from pm4ngs.jupyterngsplugin.markdown.rseqc import rseqc_plot_table
from pm4ngs.jupyterngsplugin.markdown.utils import hide_code_str
from pm4ngs.jupyterngsplugin.markdown.utils import info_table
from pm4ngs.jupyterngsplugin.utils.notebook import save_2_html
HTML(hide_code_str())

## 1. Pre-processing QC

In [None]:
# Section 1: render the FastQC summary for the raw reads.

# Sample names are the *.fastq.gz file names (at any depth below the dataset
# directory) with the extension removed, in sorted order.
data_dir = os.path.join(DATA, DATASET)
os.chdir(data_dir)
samples = sorted(
    fastq_name.replace('.fastq.gz', '')
    for _, _, file_names in os.walk('./')
    for fastq_name in file_names
    if fastq_name.endswith('.fastq.gz')
)

# Return to the notebooks directory before rendering.
os.chdir(NOTEBOOKS)
display(Markdown(info_table('01 - Pre-processing QC', data_dir)))
display(Markdown('#### FastQC report\n'))

# The dataset directory is passed for both directory arguments, as before.
# samples_data is kept: the trimming and alignment cells extend it.
samples_data, fastqc_md = fastqc_table(samples, data_dir, data_dir)
display(Markdown(fastqc_md))
del fastqc_md

## 2. Trimming

In [None]:
# Section 2: render the post-trimming summary.
os.chdir(NOTEBOOKS)
result_dir = os.path.join(RESULTS, DATASET, 'trimmomatic')
display(Markdown(info_table('02 - Samples trimming', result_dir)))

# Takes the samples_data built by the pre-processing cell and rebinds it to
# the value returned here, alongside the Markdown for this section.
samples_data, trimming_md = fastqc_trimmomatic_table(
    samples_data,
    samples,
    result_dir,
)
display(Markdown(trimming_md))
del trimming_md

## 3. Alignments and Quantification

In [None]:
os.chdir(NOTEBOOKS)
result_dir = os.path.join(RESULTS, DATASET, 'alignments')
display(Markdown(info_table('03 - Alignments and Quantification', result_dir)))
display(Markdown('### Reference genome\n**{{ cookiecutter.genome_name }}**\n\n'))

samples = [ f.replace('_sorted.bam', '') for ds,dr,fs in os.walk(os.path.join(RESULTS, DATASET, 'alignments')) for f in fs if f.endswith('_sorted.bam')]
{% if cookiecutter.sequencing_technology == 'paired-end' %}
method = 'STAR_paired'
{% else %}
method = 'STAR_single'
{% endif %}
samples_data, str_msg = alignment_table(samples_data, samples, 
                                        os.path.join(RESULTS, DATASET, 'alignments'), method)
display(Markdown(str_msg))
del str_msg

reads_distribution_plot(samples_data, samples,(18,6), method)
plt.show()
plt.close()

## 3.1. Alignment-QC

In [None]:
# Section 3.1 (a): render the RSeQC BAM statistics table.
os.chdir(NOTEBOOKS)
display(Markdown('### RSeQC BAM Stats\n\n'))

# The alignments path is rebuilt here rather than read from result_dir, so
# the cell does not depend on which cell last assigned that variable.
rseqc_md = rseqc_table(
    samples,
    os.path.join(RESULTS, DATASET, 'alignments'),
)
display(Markdown(rseqc_md))
del rseqc_md

In [None]:
# Section 3.1 (b): render the RSeQC plots table.
# img_size is passed for both size arguments of rseqc_plot_table.
img_size = 350
display(Markdown('### RSeQC PDF plots\n\n'))

rseqc_plots_md = rseqc_plot_table(
    samples,
    os.path.join(RESULTS, DATASET, 'alignments'),
    img_size,
    img_size,
)
display(Markdown(rseqc_plots_md))
del rseqc_plots_md

## 3.2. Quantification

In [None]:
# Section 3.2: plot the distribution of each quantification column.
os.chdir(NOTEBOOKS)
display(Markdown("### Exon TPM and reads distribution per sample"))

output_suffix = "_sorted_genes.out"
columns = ['ExonTPM', 'ExonReads']
alignments_dir = os.path.join(RESULTS, DATASET, 'alignments')

# One plotting call per column; plt.show() runs once, after the loop.
for column in columns:
    tpmcalculator_distribution_plot(
        column,
        alignments_dir,
        output_suffix,
        (10, 12),
    )
plt.show()
plt.close()

## 4. Differential Gene Expression Analysis

In [None]:
# Section 4: render the DGA plots, the cutoffs in use and the gene list
# intersection.
os.chdir(NOTEBOOKS)
result_dir = os.path.join(RESULTS, DATASET, 'dga')
display(Markdown(info_table('04 - DGA', result_dir)))

# Result-file key -> label; passed to dga_table below.
tools = {
    'deseq2': 'Deseq2',
    'edgeR': 'EdgeR',
    'intersection': 'Intersection',
}

# Condition names are recovered from the result files of the first tool in
# the mapping ('deseq2'): 'condition_' and the '_<tool>.csv' part are removed
# from every file name that contains that suffix.
first_tool = next(iter(tools))
csv_suffix = '_' + first_tool + '.csv'
conditions = sorted(
    file_name.replace('condition_', '').replace(csv_suffix, '')
    for _, _, file_names in os.walk(result_dir)
    for file_name in file_names
    if csv_suffix in file_name
)

img_size = 350
display(Markdown('### Plots\n\n'))
plots_md = dga_table(conditions, tools, result_dir, img_size, img_size)
display(Markdown(plots_md))
del plots_md

# fc and fdr are not assigned in this notebook's cells.
# NOTE(review): presumably they are set by config/init.py -- confirm.
cutoff_md = (
    '### Cutoff:\n\n'
    + "logFC: {:.3f}".format(fc) + '\n\n'
    + "FDR {:.3e}".format(fdr) + '\n\n'
)
display(Markdown(cutoff_md))
del cutoff_md

intersection_md = dga_gene_list_intersection(
    conditions,
    result_dir,
    '{{ cookiecutter.organism}}',
)
display(Markdown(intersection_md))
del intersection_md

## 5. GO enrichment

In [None]:
# Section 5: render the GO enrichment plots followed by the GO HTML table.
# Uses `conditions`, which is built by the DGA cell.
os.chdir(NOTEBOOKS)
result_dir = os.path.join(RESULTS, DATASET, 'go')
display(Markdown(info_table('05 - GO enrichment', result_dir)))

img_size = 350
go_plots_md = go_plots_table(conditions, result_dir, img_size, img_size)
display(Markdown(go_plots_md))
del go_plots_md

go_table_md = go_html_table(conditions, result_dir)
display(Markdown(go_table_md))
del go_table_md

In [None]:
# Switch to the notebooks directory and hand this notebook's file name to
# save_2_html.
# NOTE(review): presumably this writes an HTML copy of the report -- confirm
# against pm4ngs.jupyterngsplugin.utils.notebook.
os.chdir(NOTEBOOKS)
save_2_html("00 - Project Report.ipynb")