In [None]:
%run ../config/init.py

### Creating results folders

In [None]:
data_dir = os.path.join(RESULTS, DATASET, 'alignments')
{% if cookiecutter.ngs_data_type == 'RNA-Seq' %}
result_dir = os.path.join(RESULTS, DATASET, 'quantification')
if not os.path.exists(result_dir):
    os.mkdir(result_dir) 
os.chdir(result_dir)
{% if cookiecutter.sequencing_technology == 'paired-end' %}
samples = [ f.replace('_sorted.bam', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('.bam')]
{% else %}
samples = [ f.replace('_sorted.bam', '') for ds,dr,fs in os.walk(data_dir) for f in fs if f.endswith('.bam')]
{% endif %}

### Processing samples

In [None]:
{% if cookiecutter.sequencing_technology == 'paired-end' %}
cmd_header = '{0} {1}/RNA-Seq/rnaseq-tpmcalculator.cwl -p --gtf {2} -q 255 -r {3} --genome_name {4} --sorted_bam'.format(
        CWLRUNNER, CWLWORKFLOWS, GENOME_GTF, GENOME_BED, GENOME_NAME)

{% else %}
cmd_header = '{0} {1}/RNA-Seq/rnaseq-tpmcalculator.cwl --gtf {2} -q 255 -r {3} --genome_name {4} --sorted_bam'.format(
        CWLRUNNER, CWLWORKFLOWS, GENOME_GTF, GENOME_BED, GENOME_NAME)
{% endif %}
log_suffix = 'quantification.log'
with open('commands', "w") as fin:
    for s in samples:
        r = os.path.join(data_dir, s + '_sorted.bam')
        if not os.path.exists(s + '_sorted_genes.out.gz'):
            fin.write('{0} {1} > {2}_{3} 2>&1\n'.format(cmd_header, r, s, log_suffix))

{% if cookiecutter.use_gnu_parallel == 'y' %}
!cat commands | parallel -j {{ cookiecutter.max_number_threads }}
{% else %}
!sh commands    
{% endif %}
check_errors_from_logs(result_dir, log_suffix)

### Creating TPM and read-count matrices for genes

In [None]:
# Builds one genes x samples matrix per metric (ExonTPM, ExonReads) from the
# per-sample '*_sorted_genes.out[.gz]' tables found under the working directory,
# keeps each matrix in data[<metric>] and writes it to '<metric>.tsv'.
data = {}
columns = ['ExonTPM', 'ExonReads']
key_columns = ['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength']
output_suffix = "_sorted_genes.out"

# Only names that END with the suffix (optionally followed by '.gz') are taken,
# and the directory is kept so files below the working directory can be opened.
# Sorting makes the sample column order reproducible.
files = sorted(os.path.join(root, name)
               for root, _dirs, names in os.walk('./')
               for name in names
               if name.endswith((output_suffix, output_suffix + '.gz')))
if not files:
    raise RuntimeError('No *' + output_suffix + '[.gz] files found under ' + os.getcwd())

# Parse every file once and reuse the table for both metrics.
tables = []
for path in files:
    name = os.path.basename(path)
    # Get sample name removing the suffix and check if the output is compressed
    output_suffix_real = output_suffix + '.gz' if name.endswith('.gz') else output_suffix
    s = name[:-len(output_suffix_real)]
    # Gene_Id and Chr are forced to text: they are concatenated into a string
    # key below, which fails if pandas infers a numeric type (e.g. Chr 1, 2, 3).
    table = pandas.read_csv(path, sep='\t', usecols=key_columns + columns,
                            dtype={'Gene_Id': str, 'Chr': str})
    tables.append((s, table))

for column in columns:
    print(column)
    merged = None
    for s, table in tables:
        # One column per sample, named after the sample.
        sample_df = table[key_columns + [column]].rename(columns={column: s})
        if merged is None:
            merged = sample_df
        else:
            # Outer join keeps genes that are missing from some samples.
            merged = merged.merge(sample_df, on=key_columns, how='outer')
    print('Data columns: ' + str(len(merged.columns)))
    print('Data rows: ' + str(len(merged)))

    # Printing TSV matrices
    # Gene_Id is replaced by a leading Gene_Chr_Start identifier column.
    gene_key = merged['Gene_Id'] + '_' + merged['Chr'] + '_' + merged['Start'].map(str)
    merged = merged.drop(['Gene_Id'], axis=1)
    merged.insert(0, 'Gene_Chr_Start', gene_key)
    data[column] = merged
    # Genes absent from a sample are written as 0.
    data[column].to_csv(column + '.tsv', sep='\t', index=False, na_rep='0')

### Plotting Exon TPM and read count per sample

In [None]:
# Draws one horizontal box plot per metric with one box per sample.
# Relies on the `data` dict filled by the matrix cell: data[<metric>] must hold
# one column per sample name.
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_sorted_genes.out"
files = sorted(name
               for _root, _dirs, names in os.walk('./')
               for name in names
               if output_suffix in name)
for column in columns:
    plt.figure(figsize=(10, 12))
    matrix = data[column]
    toPlot = []
    for name in files:
        # Sample name = file name without the (optionally gzipped) output suffix.
        suffix = output_suffix + '.gz' if name.endswith('.gz') else output_suffix
        sample = name.replace(suffix, '')
        if sample not in matrix:
            # No column for this sample in the matrix: nothing to plot.
            continue
        # Long format: one [value, sample] row per gene.
        toPlot.extend([value, sample] for value in matrix[sample])
    d = pandas.DataFrame(toPlot, columns=[column, 'Sample'])
    ax = sns.boxplot(y='Sample', x=column, data=d, orient="h", palette="Set2")
{% endif %}