### Overview intron fraction
- Plot the fraction of intronic (unspliced) vs exonic (spliced) transcripts for the foursu and total libraries
- Add replicate heatmap to show reproducibility between the libraries for foursu and total libraries

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import *
from utilities import load_dataset, filter_low_exp

%load_ext autoreload
%autoreload 2

In [None]:
# Load the per-gene abundance tables used throughout this notebook.
# Both experiments store their quantification summary under the same relative path.
quant_relpath = ('gene_quantification', 'summary_abundance_by_gene_filtered.csv')

# Experiment 1: table read via load_dataset together with the brain4sU passed-genes file
infile1 = os.path.join(results_dir, *quant_relpath)
df = load_dataset(infile1, '../Figures/summary_files/brain4sU_passed.csv')

# Experiment 2 (used for the reproducibility heatmaps): the same abundance file is
# loaded twice, once with the mock and once with the foursu passed-genes file
infile2 = os.path.join(results_dir_inctest, *quant_relpath)
df2a = load_dataset(infile2, '../Figures/summary_files/brainInc_mock_passed.csv')
df2b = load_dataset(infile2, '../Figures/summary_files/brainInc_foursu_passed.csv')

# Keep the rows of the mock-loaded table whose index labels are also present
# in the foursu-loaded table
all_passed_genes_inc = df2a.index.intersection(df2b.index)
df2 = df2a.loc[all_passed_genes_inc].copy()

# Directory that receives the figures written by this notebook
outdir = '../Figures/Overview'
os.makedirs(outdir, exist_ok=True)

#### Plot the fraction of intronic and exonic transcripts in each library type

In [None]:
#Calculate the fraction of the intronic and exonic transcripts
# Sum every numeric column per library (RNAtype x replicate).
# numeric_only=True is passed explicitly: pandas >= 2.0 changed the default of
# groupby(...).sum() to numeric_only=False, which would also try to "sum"
# (i.e. concatenate or fail on) the non-numeric columns of df.
sum_df = df.groupby(['RNAtype', 'replicate']).sum(numeric_only=True)
# Express the summed TPM values as a percentage of one million
sum_df['intronic_percent'] = (sum_df['intronic_tpm_recalc']*100)/1e6
sum_df['exonic_percent'] = (sum_df['exonic_tpm_recalc']*100)/1e6

In [None]:
#Plot fraction intronic transcripts for library type input or pulldown
# Stacked bars: one bar per replicate, intronic percent at the bottom and exonic
# percent stacked on top, with the input group on the left and the pulldown
# group on the right.
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)

bar_width = 0.9
num_reps = 3
# x positions of the bars, one per replicate; the second group is shifted by 4
# so that a gap separates it from the first group
bar_p1 = np.arange(num_reps)
bar_p2 = bar_p1 + 4
# stacking order, bottom to top
plot_order = ['intronic_percent', 'exonic_percent']

#https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.set_prop_cycle.html?highlight=set_prop_cycle#matplotlib.axes.Axes.set_prop_cycle
# (currently not used for the tick labels below)
xlabels = ['total #1', 'total #2', 'total #3', 'foursu #1', 'foursu #2', 'foursu #3']
sample_l = ['input', 'pd']
for sample, bar_l in zip(sample_l, [bar_p1, bar_p2]):
    # reset the color cycle so that both groups use the same two colors
    ax.set_prop_cycle(color = selected_colors[0:2][::-1])
    # Height already occupied in each bar. Kept as an ndarray from the start so
    # that the accumulation below is plain element-wise addition instead of
    # depending on how `list += ndarray` is dispatched.
    running_bottom = np.zeros(num_reps)
    this_df = sum_df.loc[pd.IndexSlice[sample, :]]
    for rna in plot_order:
        values = this_df[rna].values
        ax.bar(bar_l,
               values,
               label = rna,
               alpha = 0.9,
               bottom = running_bottom,
               width = bar_width)
        # build a new array rather than modifying the one just handed to ax.bar
        running_bottom = running_bottom + values

current_handles, current_labels = ax.get_legend_handles_labels()

pretty_names = {'intronic_percent': 'unspliced', 'exonic_percent':'spliced'}

#get rid of redundancy in legend plotting
# every entry was registered once per sample group; keep the first group's only
legend_len = len(current_handles) // len(sample_l)
new_labels = [pretty_names[i] for i in current_labels[0:legend_len]]

ax.legend(current_handles[0:legend_len], new_labels, bbox_to_anchor=(0.5, 1.05), loc=8,
          ncol = 2, fontsize = 8)

ax.set_ylabel('% of transcripts')
# label each bar with its replicate number
ax.set_xticks(np.append(bar_p1, bar_p2))
ax.set_xticklabels(['1', '2', '3', '1', '2', '3'])

# group captions, placed below the axis (y is in data coordinates)
ax.text(1, -25, 'total RNA', horizontalalignment='center', verticalalignment='center',
            fontsize = 8)
ax.text(5, -25, '4sU'r'$^{+}$' ' RNA', horizontalalignment='center', verticalalignment='center',
            fontsize = 8)

plt.savefig('%s.%s' % (os.path.join(outdir, 'frac_intron'), out_fmt), dpi = out_dpi)

#### Look at the inter-replicate reproducibility

In [None]:
# Restrict the reproducibility plots to genes that pass the cutoff in all libraries
# (i.e. 10 counts in each library: filter_co=10 on 'summed_est_counts', with npass
# set to the number of samples).
# Dropouts (genes present in only some conditions) reduce the correlation considerably;
# this way we are showing the correlation over the same group of genes in each library.
# NOTE(review): both calls below receive the same outname; if filter_low_exp writes
# to that location the second call would overwrite the first - confirm.

# Arguments shared by both filtering calls
low_exp_kwargs = dict(filter_col='summed_est_counts', filter_co=10,
                      outname=os.path.join(outdir, 'test'))

#Exp1 (Brain4sU)
n_samp1 = df['sample'].nunique(dropna=False)
res_file1 = os.path.join(results_dir, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
passed_genes1_all = filter_low_exp(res_file1, npass=n_samp1, **low_exp_kwargs)

#Exp2 (BrainInc)
n_samp2 = df2['sample'].nunique(dropna=False)
res_file2 = os.path.join(results_dir_inctest, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
passed_genes2_all = filter_low_exp(res_file2, npass=n_samp2, **low_exp_kwargs)

In [None]:
# Keep only the rows whose index label is among the genes returned by
# filter_low_exp for the corresponding experiment
in_passed1 = df.index.isin(passed_genes1_all)
df = df[in_passed1].copy()
in_passed2 = df2.index.isin(passed_genes2_all)
df2 = df2[in_passed2].copy()

In [None]:
def get_correlation(df, sort_d=None):
    '''
    Get the squared Pearson correlation (r^2) between libraries.

    Each library is named '<exptype> r<replicate>'. The 'summed_tpm_recalc'
    values are pivoted into a gene x library table, shifted by a pseudocount
    (the smallest positive value in the table) and log10-transformed; the
    pairwise Pearson correlation of the columns is then squared.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns 'exptype', 'replicate' and
        'summed_tpm_recalc'; 'gene' must be available as a column after
        reset_index(). An 'expname' column is added to (or overwritten in)
        this frame in place.
    sort_d : dict, optional
        Maps each exptype label to a sort key that determines the order of
        the libraries in the result. If None or empty, the column order
        produced by the pivot is kept.

    Returns
    -------
    pandas.DataFrame
        Square library x library matrix of r^2 values.

    Raises
    ------
    ValueError
        If the pivoted table contains no positive value, so that no
        pseudocount can be derived.
    '''
    # Build the library names column-wise rather than with a row-wise apply
    df['expname'] = [f'{exptype} r{rep}' for exptype, rep in zip(df['exptype'], df['replicate'])]
    pdf = df.reset_index().pivot(index = 'gene', columns = 'expname', values = 'summed_tpm_recalc')
    cols = pdf.columns.tolist()
    if sort_d:
        # Strip only the trailing ' r<replicate>' suffix; splitting at the first
        # ' r' would truncate exptype labels that themselves contain ' r'
        cols = sorted(cols, key=lambda name: sort_d[name.rsplit(' r', 1)[0]])
    pdf = pdf[cols].copy()
    # Add pseudocount of the lowest value to allow log transformation
    values = pdf.to_numpy()
    positive = values[values > 0]  # NaN compares False and is excluded
    if positive.size == 0:
        raise ValueError('no positive summed_tpm_recalc value to use as pseudocount')
    lowest_nonzero = positive.min()
    pdf = np.log10(pdf + lowest_nonzero)
    corrMatrix_f = pdf.corr(method = 'pearson')
    return corrMatrix_f**2

# Experiment 1: label the libraries by RNA type and order total before 4sU
rnatype_labels = {'input':'total', 'pd':'4sU'}
df['exptype'] = df['RNAtype'].map(rnatype_labels)
corrMat1 = get_correlation(df, sort_d={'total':1, '4sU':2})

# Experiment 2: keep the rows with RNAtype 'input' only, label them by
# condition and order them as listed in condition_order
condition_labels = {'0mock':'0 min mock', '60mock':'60 min -4sU', '60foursu':'60 min +4sU'}
condition_order = {'0 min mock':1, '60 min -4sU':2, '60 min +4sU':3}
df2 = df2[df2['RNAtype'] == 'input'].copy()
df2['exptype'] = df2['condition'].map(condition_labels)
corrMat2 = get_correlation(df2, sort_d=condition_order)

In [None]:
# Plot the correlation between the replicates for both the incubation control libraries and the 20 min 4sU input and pulldown libraries
# https://seaborn.pydata.org/examples/many_pairwise_correlations.html
fig = plt.figure(figsize=(dfig*2, dfig), constrained_layout=True)

# One row of 34 narrow grid columns: columns 0-1 are left empty, each heatmap
# spans 15 columns and the colorbar takes the last 2
ncols = 34
gs = fig.add_gridspec(ncols=ncols, nrows=1, wspace=0)
ax1 = fig.add_subplot(gs[2:17])
ax2 = fig.add_subplot(gs[17:32])
cbar_ax = fig.add_subplot(gs[32:])

# Shared lower color limit: the smallest value of both matrices, taken down to
# a multiple of 0.1 by floor division, so that both heatmaps use one color scale
vmin = min([corrMat1.min().min(), corrMat2.min().min()])//0.1 * 0.1
# Add mask to avoid plotting the top part
# (k=1 masks strictly above the diagonal; the diagonal itself stays visible)
mask1 = np.triu(np.ones_like(corrMat1, dtype=bool), k=1)
mask2 = np.triu(np.ones_like(corrMat2, dtype=bool), k=1)

# corrMat1 / corrMat2 come from get_correlation, which returns the squared
# Pearson correlation, so the colorbar is labelled r^2 rather than r
cbar_label = r'correlation (Pearson $r^{2}$)'

# Plot the incubation control on the left and the 20 min libraries on the right
ax1 = sns.heatmap(corrMat2, mask=mask2, annot=False, ax=ax1, cmap = 'magma', vmin=vmin, vmax=1, cbar=False)
ax2 = sns.heatmap(corrMat1, mask=mask1, annot=False, ax=ax2, cmap = 'magma', vmin=vmin, vmax=1, cbar_ax=cbar_ax,
                  cbar_kws={'label': cbar_label})

# Re-apply the label to reposition it (y offset and left alignment)
cbar_ax.set_ylabel(cbar_label, y=-0.1, ha='left')

for ax in [ax1, ax2]:
    ax.set_aspect('equal')
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(length = 0)
plt.savefig('%s.%s' % (os.path.join(outdir, 'reprod_hm_pearson'), out_fmt), dpi = out_dpi)