### Overview intron fraction
- Plot the fraction of intronic (unspliced) vs. spliced reads for the foursu (4sU) and total RNA libraries

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import extract_gene_vals
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Load the filtered per-gene abundance table.
# NOTE(review): load_dataset is a project helper; the second argument is
# presumably the list of samples that passed QC -- confirm in utilities.py.
infile = os.path.join(
    results_dir,
    'gene_quantification',
    'summary_abundance_by_gene_filtered.csv',
)
df = load_dataset(infile, '../Figures/summary_files/brain4sU_passed.csv')

# The experiment type is the part of the sample name before the first underscore.
df['exptype'] = [name.split('_')[0] for name in df['sample']]

# Directory that receives the figures of this notebook.
outdir = '../Figures/Overview'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Calculate the percentage of intronic and exonic transcripts per library.
# Sum every numeric column over all genes of one (exptype, replicate) library.
# numeric_only=True is stated explicitly: older pandas dropped non-numeric
# columns (e.g. 'sample') silently, whereas pandas >= 2.0 would try to sum them.
sum_df = df.groupby(['exptype', 'replicate']).sum(numeric_only=True)

# Convert the summed TPM values to a percentage of the library total.
# Assumes the recalculated TPMs of each library add up to 1e6 -- TODO confirm.
sum_df['intronic_percent'] = (sum_df['intronic_tpm_recalc'] * 100) / 1e6
sum_df['exonic_percent'] = (sum_df['exonic_tpm_recalc'] * 100) / 1e6

In [None]:
#Plot fraction intronic transcripts for library type input or pulldown
# Stacked bars: one bar per replicate, one group of bars per library type,
# each bar split into the categories listed in plot_order.
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)

bar_width = 0.9
num_reps = 3
# x positions of the bars, plotting by replicate; the second group is shifted
# by num_reps + 1 so that one empty slot separates the two groups
bar_p1 = np.arange(num_reps)
bar_p2 = bar_p1 + num_reps + 1
plot_order = ['intronic_percent', 'exonic_percent']

#https://matplotlib.org/api/_as_gen/matplotlib.axes.Axes.set_prop_cycle.html?highlight=set_prop_cycle#matplotlib.axes.Axes.set_prop_cycle
#need to reset the color cycle between the two groups so that both use the same colors
xlabels = ['total #1', 'total #2', 'total #3', 'foursu #1', 'foursu #2', 'foursu #3']
sample_l = ['input', 'pd']
for sample, bar_l in zip(sample_l, [bar_p1, bar_p2]):
    ax.set_prop_cycle(color = selected_colors[0:2][::-1])
    # Explicit NumPy accumulator for the stacking offset, instead of relying on
    # `list += ndarray` turning a list into an array.
    running_bottom = np.zeros(num_reps)
    # All replicates of this library type (first level of the sum_df index)
    this_df = sum_df.loc[pd.IndexSlice[sample, :]]
    for rna in plot_order:
        values = this_df[rna].values
        ax.bar(bar_l,
               values,
               label = rna,
               alpha = 0.9,
               bottom = running_bottom,
               width = bar_width,
               # 'none' is the documented way to draw no edge; an empty string
               # is not a valid color specification
               edgecolor = 'none')
        running_bottom = running_bottom + values

current_handles, current_labels = ax.get_legend_handles_labels()

pretty_names = {'intronic_percent': 'unspliced', 'exonic_percent':'spliced'}

#get rid of redundancy in legend plotting: every category was added once per
#library type, so keep only the entries of the first group
legend_len = len(current_handles) // len(sample_l)
new_labels = [pretty_names[i] for i in current_labels[0:legend_len]]

ax.legend(current_handles[0:legend_len], new_labels, bbox_to_anchor=(0.5, 1.05), loc=8,
          ncol = 2, fontsize = 8)

ax.set_ylabel('% of transcripts')
# Tick every bar with its replicate number (1..num_reps, repeated for both groups)
rep_labels = [str(rep + 1) for rep in range(num_reps)]
ax.set_xticks(np.append(bar_p1, bar_p2))
ax.set_xticklabels(rep_labels * len(sample_l))

# Group labels, centered under each group of bars; y is given in data units
ax.text(bar_p1.mean(), -25, 'total RNA', horizontalalignment='center', verticalalignment='center',
        fontsize = 8)
ax.text(bar_p2.mean(), -25, r'4sU$^{+}$ RNA', horizontalalignment='center', verticalalignment='center',
        fontsize = 8)

plt.savefig('%s.%s' % (os.path.join(outdir, 'frac_intron'), out_fmt), dpi = out_dpi)

In [None]:
# Display the per-library summed table (including the percent columns) for inspection
sum_df