In [None]:
# Execute the shared project configuration script inside this kernel.
# The cells below use names they never define themselves (os, pandas, RESULTS,
# DATASET, working_dir, run_command, ...), so those are presumably provided by
# this script -- confirm against ../config/init.py.
%run ../config/init.py

### Creating results folders

In [None]:
# Folders of this dataset inside the results tree.
dataset_results = os.path.join(RESULTS, DATASET)
data_dir = os.path.join(dataset_results, 'trimmomatic')
# working_dir() is a project helper; presumably it creates the folder and
# makes it the current directory -- confirm in the config script.
result_dir = working_dir(os.path.join(dataset_results, 'alignments'))

# Sample sheet. keep_default_na=False keeps empty fields as '' instead of NaN.
sample_table_file = os.path.join(DATA, DATASET, 'sample_table.csv')
sample_table = pandas.read_csv(
    sample_table_file,
    keep_default_na=False,
)
sample_table.head()

### Creating genome indexes if they don't exist

In [None]:
if not os.path.exists(GENOME):
    working_dir(GENOME)
    !curl -o {GENOME_NAME}.tar.gz https://ftp.ncbi.nlm.nih.gov/pub/pm4ngs/resources/{GENOME_NAME}.tar.gz
    !tar xzfv {GENOME_NAME}.tar.gz --strip 1
    !rm {GENOME_NAME}.tar.gz
    working_dir(ALIGNER_INDEX)
    log_file = 'star_index.log'
    index_cmd = '{0} {1}/star/star-index.cwl --runThreadN {2} --genomeDir . '.format(CWLRUNNER, CWLTOOLS, 32)
    index_cmd += '--genomeFastaFiles ../genome.fa  --sjdbGTFfile ../genes.gtf > index.log 2>&1 &'
    run_command(index_cmd)

In [None]:
# Report on the index build only when its log is present in the current folder
# (it is not when the index cell above was skipped).
index_log = 'index.log'
if os.path.exists(index_log):
    check_cwl_command_log(index_log)

### Processing samples

In [None]:
working_dir(result_dir)
log_file = 'alignment.log'

alignment_yml = {
{% if cookiecutter.organism == 'human' %}
    'ramMaxRSeQC': 32000,
    'ramMaxSTAR': 32000,
{% else %}
    'ramMaxRSeQC': 16000,
    'ramMaxSTAR': 16000,
{% endif %}
{% if cookiecutter.sequencing_technology == 'paired-end' %}
    'p': True,
{% endif %}
    'threads': 16,
    'genomeDir': {'class': 'Directory', 'path': ALIGNER_INDEX },
    'genome_bed': {'class': 'File', 'path': GENOME_BED },
    'genome_gtf': {'class': 'File', 'path': GENOME_GTF },
    'q': 255,
    'reads': []
}

{% if cookiecutter.sequencing_technology == 'paired-end' %}
for i, r in sample_table.iterrows():
    files = r['file'].split('|')
    r1 = os.path.join(DATA, DATASET, files[0])
    r2 = os.path.join(DATA, DATASET, files[1])
    if not os.path.exists(files[0].replace('_1.fastq.gz', '_genes.out.gz')):
        alignment_yml['reads'].append([
            {'class': 'File', 'path': r1},
            {'class': 'File', 'path': r2}])
{% else %}
for i, r in sample_table.iterrows():
    f = os.path.join(DATA, DATASET, r['file'])
    if not os.path.exists(r['file'].replace('.fastq.gz', '_genes.out.gz')):
        alignment_yml['reads'].append([
            {'class': 'File', 'path': f}])
{% endif %}

if alignment_yml['reads']:
    write_to_yaml(alignment_yml, 'alignment.yml')  
    cmd_header = '{} {}/RNA-Seq/rnaseq-alignment-quantification.cwl alignment.yml > {} 2>&1 &'.format(
        CWLRUNNER, CWLWORKFLOWS, log_file)
    run_command(cmd_header)
    

### Checking command output
Execute the next cell repeatedly until it prints: **Run completed**

In [None]:
# Inspect the log the alignment workflow writes to (log_file is set in the
# "Processing samples" cell). The workflow runs in the background, so this
# cell may need to be executed several times before the run is reported done.
check_cwl_command_log(log_file)

### Creating TPM and read-count matrices for genes

In [None]:
# Build one matrix per metric (rows: genes, columns: samples) from the
# per-sample gene tables found under the current folder, keep the matrices in
# `data` for the plotting cell and write each one to '<metric>.tsv'.
data = {}
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_sorted_genes.out.gz"
# Columns identifying a gene; every sample table is merged on them
key_columns = ['Gene_Id', 'Chr', 'Start', 'End', 'ExonLength']

# Keep the folder of every match: os.walk() also descends into sub-folders,
# where a bare file name could not be opened from the current folder.
# Sorting makes the sample column order reproducible between runs.
files = sorted(os.path.join(folder, name)
               for folder, _, names in os.walk('./')
               for name in names if name.endswith(output_suffix))

# Parse each table once instead of once per metric. Gene and chromosome
# identifiers are read as text so that numeric chromosome names behave like
# any other in the merge keys and in the string concatenation below.
tables = [(os.path.basename(f).replace(output_suffix, ''),
           pandas.read_csv(f, sep='\t', dtype={'Gene_Id': str, 'Chr': str}))
          for f in files]

for column in columns:
    print(column)
    data[column] = pandas.DataFrame()
    for s, table in tables:
        # Gene keys plus this metric, with the metric column named after the sample
        df = table[key_columns + [column]].rename(columns={column: s})
        if data[column].empty:
            data[column] = df
        else:
            # Outer join keeps genes that are missing from some samples
            data[column] = data[column].merge(df, on=key_columns, how='outer')
    print('Data columns: ' + str(len(data[column].columns)))
    print('Data rows: ' + str(len(data[column])))

    if not tables:
        # Without input there is no 'Gene_Id' column to build the row key from
        print('No *' + output_suffix + ' files found; ' + column + '.tsv was not written')
        continue

    # Printing TSV matrices
    # Replace Gene_Id by a combined '<gene>_<chr>_<start>' key in the first column
    matrix = data[column]
    gene_key = matrix['Gene_Id'] + '_' + matrix['Chr'] + '_' + matrix['Start'].map(str)
    matrix = matrix.drop(['Gene_Id'], axis=1)
    matrix.insert(0, 'Gene_Chr_Start', gene_key)
    data[column] = matrix
    # Genes absent from a sample are written as 0
    data[column].to_csv(column + '.tsv', sep='\t', index=False, na_rep='0')

### Plotting Exon TPM and read count per sample

In [None]:
# One figure per metric: a horizontal box plot with one box per sample, drawn
# from the matrices held in `data`.
columns = ['ExonTPM', 'ExonReads']
output_suffix = "_sorted_genes.out.gz"
files = sorted(name
               for _, _, names in os.walk('./')
               for name in names
               if name.endswith(output_suffix))
# Sample name = file name without the output suffix
samples = [name.replace(output_suffix, '') for name in files]
for column in columns:
    plt.figure(figsize=(10, 12))
    matrix = data[column]
    # Long format: one [value, sample] row per matrix row and sample; samples
    # that have no column in the matrix are skipped
    toPlot = [[value, sample]
              for sample in samples if sample in matrix
              for value in matrix[sample]]
    d = pandas.DataFrame(toPlot, columns=[column, 'Sample'])
    ax = sns.boxplot(y='Sample', x=column, data=d, orient="h", palette="Set2")
