#### Overview histograms
- Plot histograms of the synthesis, processing, and decay rates

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset
from plotting_fxns import update_old_ids
from plotting_fxns import get_boxtop, add_stars
from plotting_fxns import PrettyBox

import pickle
import glob
# res_dir = '/Users/mkthompson/Desktop/Davislab/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/'
# Load the FlyBase ID conversion tables used by update_old_ids.
# NOTE(review): pickle.load can execute arbitrary code from the file, so these
# tables must come from a trusted source.
# The context managers close each file handle as soon as its table is loaded.
with open('../../resources/id_conversion/dmel628_idconvert.p', 'rb') as handle:
    con_d = pickle.load(handle)
with open('../../resources/id_conversion/dmel641_idconvert.p', 'rb') as handle:
    con_d41 = pickle.load(handle)

%load_ext autoreload
%autoreload 2

In [None]:
# Load the INSPEcT rate estimates and keep only the genes that passed filtering.
outdir = '../Figures/Overview'
os.makedirs(outdir, exist_ok=True)
rate_df = pd.read_csv('../Figures/summary_files/INSPEcT_rates.csv', index_col='gene')

# NOTE(review): results_dir is not defined in the visible cells; presumably it
# comes from the star import of plot_helpers -- confirm.
infile = os.path.join(results_dir, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
df = load_dataset(infile, '../Figures/summary_files/brain4sU_passed.csv')
passed_genes = df.index.unique()

# Restrict the rate table to the passed genes.
is_passed = rate_df.index.isin(passed_genes)
rate_df = rate_df[is_passed].copy()

# log10-transform the rates and abundance, then drop every gene for which any
# of the three values is missing or infinite after the transform.
log_cols = ['syn_rate', 'deg_rate', 'total_tpm']
log_rate_df = rate_df[log_cols].apply(np.log10)
log_rate_df = log_rate_df.replace([np.inf, -np.inf], np.nan)
log_rate_df = log_rate_df.dropna(how='any')

# Re-attach the biotype annotation by gene index.
log_rate_df = pd.merge(log_rate_df, rate_df[['biotype']], left_index=True, right_index=True)

In [None]:
# Overlaid histograms of the log10 synthesis and decay rates.
# (Might look better if we get rid of the bars between the bins.)
# The stray `from cgitb import text` auto-import was removed: nothing used it,
# and the cgitb module no longer exists in Python 3.13.

# Colors for synthesis and degradation; deg_color is reused by a later cell.
syn_color = selected_colors[2]
deg_color = selected_colors[1]

fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
# Pass ax explicitly so both histograms land on this figure's axes rather than
# on whatever axes happens to be current.
sns.histplot(x='syn_rate', data=log_rate_df, color=syn_color, label='synth\n(TPM / min)', element='step', ax=ax)
sns.histplot(x='deg_rate', data=log_rate_df, color=deg_color, label='decay\n(1 / min)', element='step', ax=ax)
ax.set_ylabel('number of genes')
ax.set_xlabel(r'log$_{10}$ rate')
ax.set_xlim(-5, 3)
ax.legend(loc='upper right', bbox_to_anchor=(1.2, 1.05), fontsize=6)
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=2.0))

# Report sigma values for the log-transformed rates.
# The non-transformed sd is not meaningful since the values are highly skewed.
sigma_synth = log_rate_df['syn_rate'].std()
sigma_deg = log_rate_df['deg_rate'].std()

# Positions are in axes coordinates (transform=ax.transAxes).
ax.text(1.1, 0.5, r'$\sigma$ = %1.2f' % sigma_synth, color=syn_color, transform=ax.transAxes, ha='right', va='center')
ax.text(1.1, 0.4, r'$\sigma$ = %1.2f' % sigma_deg, color=deg_color, transform=ax.transAxes, ha='right', va='center')

fig.savefig('%s.%s' % (os.path.join(outdir, 'rate_histograms'), out_fmt), dpi=out_dpi)

print(f'synth sigma {sigma_synth}')
print(f'deg sigma {sigma_deg}')

# Open question: should the rates themselves also be capped? Seems like it.

In [None]:
# Plot the half-lives, with the highest half-lives capped at 1000 min.
cap = 1000

fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)

sns.histplot(x='halflife_capped', data=rate_df, color=deg_color, element='step', ax=ax)
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=250))

# Mark the median of the 'halflife' column with a dashed line and a text label.
median_t12 = rate_df['halflife'].median()
ax.axvline(x=median_t12, color='k', linestyle='--', alpha=0.5)
ax.text(median_t12+50, 600, 'median %1.0f min' % median_t12, color='k', alpha=0.5)

# Relabel the tick at the cap as ">= cap". The tick positions are pinned before
# the labels are set: set_xticklabels on its own attaches labels by position, so
# they drift off their ticks as soon as the locator returns a different tick set.
# set_xticks may widen the view to include every tick, so the limits are restored.
xlim = ax.get_xlim()
ticks = ax.get_xticks()
labels = [r'$\geq$%s' % cap if t == cap else int(t) for t in ticks]
ax.set_xticks(ticks)
ax.set_xticklabels(labels)
ax.set_xlim(xlim)

ax.set_xlabel('half-life (min)')
ax.set_ylabel('number of genes')
fig.savefig('%s.%s' % (os.path.join(outdir, 'halflife_histogram2'), out_fmt), dpi=out_dpi)

In [None]:
# Box plots of the log10 synthesis and decay rates for mRNAs versus non-coding RNAs.
this_df = pd.melt(log_rate_df, id_vars=['biotype'], value_vars=['syn_rate', 'deg_rate'])
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
ax = PrettyBox(y='value', x='variable', hue='biotype', hue_order=['protein_coding', 'ncRNA'], fliersize=0, data=this_df, ax=ax)
ax.set_ylim(-3.5, 2)
ax.set_xlabel('')
ax.set_ylabel(r'log$_{10}$ rate')
ax.set_xticklabels(['synth', 'decay'])

coding = log_rate_df[log_rate_df['biotype'] == 'protein_coding']
noncoding = log_rate_df[log_rate_df['biotype'] == 'ncRNA']

# Test for a difference in either direction. The alternative is spelled out
# because the default of mannwhitneyu differs between SciPy releases (older
# releases return a one-sided p-value when it is omitted).
_, p1 = stats.mannwhitneyu(coding['syn_rate'], noncoding['syn_rate'], alternative='two-sided')
_, p2 = stats.mannwhitneyu(coding['deg_rate'], noncoding['deg_rate'], alternative='two-sided')

print('diff. in synth ncRNA pvalue %s' % p1)
print('diff. in decay ncRNA pvalue %s' % p2)

# Put the significance stars just above the taller of the two boxes in each pair.
h1 = max(get_boxtop(log_rate_df, col1='biotype', val1=biotype, val_col='syn_rate')
         for biotype in ('protein_coding', 'ncRNA'))
h2 = max(get_boxtop(log_rate_df, col1='biotype', val1=biotype, val_col='deg_rate')
         for biotype in ('protein_coding', 'ncRNA'))

add_stars(-0.2, 0.2, h1+0.1, 0.1, p1, ax)
add_stars(0.8, 1.2, h2+0.1, 0.1, p2, ax)

handles, labels = ax.get_legend_handles_labels()
# Whenever you call ax.legend(), it resets the aesthetics from seaborn.
ax.legend(handles=handles, labels=['coding', 'non-coding'], bbox_to_anchor=(1, 1), loc='upper right', borderaxespad=0)
fig.savefig('%s.%s' % (os.path.join(outdir, 'ncRNA_rates'), out_fmt), dpi=out_dpi)

In [None]:
# Display the rate table rows annotated as ncRNA.
rate_df[rate_df['biotype'] == 'ncRNA'].copy()

In [None]:
# Make a boxplot of the non-coding RNA half-lives to show the spread and highlight some unusually stable ones
# NOTE(review): a later cell in this notebook repeats this figure with a cap of
# 1000 and saves to the same 'ncRNA_halflives' path, so this cell's output is
# overwritten on a top-to-bottom run. This cell may be a stale version -- confirm.
this_df = rate_df.query("biotype == 'ncRNA'").copy()
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
# Only the first of the two grid columns is used for the axes
gs = fig.add_gridspec(ncols = 2, nrows = 1)
ax = fig.add_subplot(gs[0])

ax = PrettyBox(data=this_df, y='halflife_capped', ax=ax, fliersize=0, color=color_dict['purple'])
h_nc = get_boxtop(this_df, col1='biotype', val1='ncRNA', val_col='halflife_capped')

#a hack to get seaborn to jitter the fliers: the box is drawn with fliersize=0
#and the points above h_nc are drawn separately as a swarm
flier_df = this_df.query("halflife_capped > @h_nc")
ax = sns.swarmplot(data=flier_df, y='halflife_capped', ax=ax, color=color_dict['purple'], size=2)

# Restyle the points as open circles (edge takes the old face color).
# Assumes ax.collections[0] is the swarm's point collection -- TODO confirm
ax.collections[0].set_ec(ax.collections[0].get_fc())
ax.collections[0].set_lw(0.5)
ax.collections[0].set_fc('none')

#Choose some annotated ncRNAs to label (gene_name -> label text)
to_label = {'lncRNA:roX1':'roX1', 'lncRNA:noe':'noe', '7SLRNA:CR42652':'7SL', '7SLRNA:CR32864':'7SL',
            'RNaseMRP:RNA':'RNAse MRP', 'lncRNA:cherub':'cherub'}

#allow labels to go down or up a bit to make room for others
#(added to the y position of the label text, in data units)
to_label_offset = {'lncRNA:cherub':-20, '7SLRNA:CR32864':-50, '7SLRNA:CR42652':-100}
to_label_df = this_df.query("gene_name in @to_label").copy()
to_label_df['name'] = to_label_df['gene_name'].map(to_label)
#find x, y position of the pts to label: each gene is matched to the swarm
#point whose y value is closest to its halflife_capped
offsets = ax.collections[0].get_offsets()
to_label_df['pt_idx'] = to_label_df['halflife_capped'].apply(lambda x: (np.abs(offsets[:, 1] - x)).argmin())
to_label_df[['x_pos', 'y_pos']] = offsets[to_label_df['pt_idx']]

#for the genes with t1/2 = 500 min, we can't resolve which are which, just take the three rightmost ones to label
# NOTE(review): this is an exact float comparison against 500, and xpos3[0..2]
# raises IndexError when fewer than three points match. Other cells in this
# notebook use a cap of 1000 -- confirm which cap halflife_capped really has.
yvals = offsets[:,1]
xpos = offsets[yvals == 500][:,0]
xpos.sort()
# Descending x: the rightmost point comes first
xpos3 = xpos[::-1][0:3]
to_label_df.loc[to_label_df['gene_name'] == 'RNaseMRP:RNA', 'x_pos'] = xpos3[0]
to_label_df.loc[to_label_df['gene_name'] == '7SLRNA:CR32864', 'x_pos'] = xpos3[1]
to_label_df.loc[to_label_df['gene_name'] == '7SLRNA:CR42652', 'x_pos'] = xpos3[2]
# max_x and big_gap are not used by any active line in this cell
max_x = max(offsets[:,0])
small_gap = 0.0
big_gap = 0.3
# Draw one label per gene: the line starts at the point (x1, y) and the text
# sits at x = 1, shifted vertically by the gene's entry in to_label_offset
for i in range(len(to_label_df)):
    x, y = to_label_df.iloc[i][['x_pos', 'y_pos']]
    lab_string = to_label_df.iloc[i]['name']
    x1 = x+small_gap
#     x2 = x+big_gap
    x2 = 1
    if to_label_df.iloc[i]['gene_name'] in to_label_offset:
        y2 = y + to_label_offset[to_label_df.iloc[i]['gene_name']]
        
    else:
        y2 = y
    ax.annotate(lab_string, (x1,y), (x2,y2), va='center', ha='left', arrowprops=dict(arrowstyle='-', lw=0.75))
ax.set_xticks([])
ax.set_xlabel('ncRNAs (n=%s)' % len(this_df))
ax.set_ylabel('half-life (min)')
#change 500 -> >= 500
# NOTE(review): the labels are attached by position to whatever ticks the
# current locator produces; the tick positions themselves are not fixed here
ticks = ax.get_yticks()
dic = {500: r'$\geq$''500'}
labels = [int(ticks[i]) if t not in dic.keys() else dic[t] for i,t in enumerate(ticks)]
ax.set_yticklabels(labels)
# ax.get_legend().remove()
# ax.get_legend_handles_labels()
plt.savefig('%s.%s' % (os.path.join(outdir, 'ncRNA_halflives'), out_fmt), dpi = out_dpi)

In [None]:
# Show the first rows of the labelled-gene table, including the matched point positions.
to_label_df.head(n=5)

In [None]:
# Box plot of the non-coding RNA half-lives. Points above the box top are drawn
# as a swarm, and a few unusually stable, annotated ncRNAs are labelled.

# Value relabelled as ">= cap" on the y axis.
# NOTE(review): assumes halflife_capped saturates at this value -- confirm.
halflife_cap = 1000

this_df = rate_df.query("biotype == 'ncRNA'").copy()
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
# Only the first of the two grid columns is used for the axes.
gs = fig.add_gridspec(ncols=2, nrows=1)
ax = fig.add_subplot(gs[0])

ax = PrettyBox(data=this_df, y='halflife_capped', ax=ax, fliersize=0, color=color_dict['purple'])
h_nc = get_boxtop(this_df, col1='biotype', val1='ncRNA', val_col='halflife_capped')

# A hack to get seaborn to jitter the fliers: the box is drawn with fliersize=0
# and the points above h_nc are drawn separately as a swarm.
flier_df = this_df.query("halflife_capped > @h_nc")
ax = sns.swarmplot(data=flier_df, y='halflife_capped', ax=ax, color=color_dict['purple'], size=2)

# Restyle the swarm points as open circles (edge takes the old face color).
# Assumes ax.collections[0] is the swarm's point collection -- TODO confirm.
swarm_pts = ax.collections[0]
swarm_pts.set_ec(swarm_pts.get_fc())
swarm_pts.set_lw(0.5)
swarm_pts.set_fc('none')

# Choose some annotated ncRNAs to label (gene_name -> label text).
to_label = {'lncRNA:roX1':'roX1', 'lncRNA:noe':'noe', '7SLRNA:CR42652':'7SL', '7SLRNA:CR32864':'7SL',
            'RNaseMRP:RNA':'RNAse MRP', 'lncRNA:cherub':'cherub'}

# Allow labels to go down or up a bit to make room for others (data units).
to_label_offset = {'lncRNA:cherub':-200, 'lncRNA:roX1':-100, '7SLRNA:CR32864':-100, '7SLRNA:CR42652':-200}
to_label_df = this_df.query("gene_name in @to_label").copy()
to_label_df['name'] = to_label_df['gene_name'].map(to_label)

# Find the x, y position of the points to label: each gene is matched to the
# swarm point whose y value is closest to its halflife_capped.
# The array is only read here, never sorted or modified in place, because
# get_offsets can hand back the collection's own data.
xy = np.asarray(swarm_pts.get_offsets(), dtype=float)
to_label_df['pt_idx'] = to_label_df['halflife_capped'].apply(lambda t12: np.abs(xy[:, 1] - t12).argmin())
to_label_df[['x_pos', 'y_pos']] = xy[to_label_df['pt_idx'].to_numpy()]

# Genes sitting at the cap share one y value, so the nearest-point match cannot
# tell them apart. Give them the rightmost capped points instead, one each.
# The previous version sorted each (x, y) row in place and took the last three
# points of the collection, which are not the rightmost capped points.
# np.isclose is used because the plotted y values may not equal the cap exactly.
at_cap = np.isclose(xy[:, 1], halflife_cap)
rightmost_x = np.sort(xy[at_cap, 0])[::-1][:3]
capped_genes = ['RNaseMRP:RNA', '7SLRNA:CR32864', '7SLRNA:CR42652']
# zip stops early when fewer than three capped points exist; a gene that is not
# at the cap keeps its nearest-point position.
for gene, x_right in zip(capped_genes, rightmost_x):
    gene_at_cap = (to_label_df['gene_name'] == gene) & np.isclose(to_label_df['y_pos'], halflife_cap)
    to_label_df.loc[gene_at_cap, 'x_pos'] = x_right

# Draw one label per gene: the line starts at the point and the text sits at
# x = 1, shifted vertically by the gene's entry in to_label_offset (if any).
label_x = 1
for gene, lab_string, x, y in zip(to_label_df['gene_name'], to_label_df['name'],
                                  to_label_df['x_pos'], to_label_df['y_pos']):
    y_text = y + to_label_offset.get(gene, 0)
    ax.annotate(lab_string, (x, y), (label_x, y_text), va='center', ha='left', arrowprops=dict(arrowstyle='-', lw=0.75))

ax.set_xticks([])
ax.set_xlabel('ncRNAs (n=%s)' % len(this_df))
ax.set_ylabel('half-life (min)')

# Relabel the tick at the cap as ">= cap". The tick positions are pinned before
# the labels are set so the labels cannot drift off their ticks. set_yticks may
# widen the view to include every tick, so the limits are restored afterwards.
ylim = ax.get_ylim()
ticks = ax.get_yticks()
labels = [r'$\geq$%s' % halflife_cap if t == halflife_cap else int(t) for t in ticks]
ax.set_yticks(ticks)
ax.set_yticklabels(labels)
ax.set_ylim(ylim)

fig.savefig('%s.%s' % (os.path.join(outdir, 'ncRNA_halflives'), out_fmt), dpi=out_dpi)

### Now look at synthesis and degradation rates for RNAs which are localized near synapses in other systems

#### Localization to neurites, from Kugelgen review
- Localized genes determined by taking genes which have significant enrichment (p<0.1) in at least 3 studies

#### Do we need to account for background somehow? For example, does the review report only coding genes? Or is there a way to get the full set of genes that they analyzed, independent of localization?

In [None]:
#Look at overlap with Kugelgen neurite enrichment review:
#Output the ENSMUS symbols for DIOPT tool
# nmj_dir = '/Users/mkthompson/Desktop/Davislab/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/'
# neur_file = os.path.join(nmj_dir, 'neural_loc/Kugelgen_enriched.csv')
# neur_df = pd.read_csv(neur_file)
# outfile = '../../resources/neural_loc/Kugelgen_enriched_ensmus.csv'
# neur_df[neur_df['Datasets with significant neurite enrichment (p<0.1)']>= 3]['gene_id'].to_csv(outfile, index=False, header=False)


In [None]:
# Fly homologs of the neurite-enriched genes (Kugelgen review, p<0.1 in at least 3 studies).
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# '../../resources/...' paths used in other cells; the notebook only runs on a
# machine that has this exact directory.
neurite_file = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/neural_loc/diopt_neurite_enriched.csv'
neurite_df = pd.read_csv(neurite_file)

# Keep the fly gene IDs whose DIOPT score is above 4, then pass them through
# update_old_ids with the con_d conversion table.
has_good_score = neurite_df['DIOPT Score'] > 4
neurite_fbgs = neurite_df.loc[has_good_score, 'Fly Species Gene ID'].values
neurite_new_fbgs = update_old_ids(neurite_fbgs, con_d)

In [None]:
# Add boolean flags to log_rate_df marking genes found in each localization set.
gene_ids = log_rate_df.index
log_rate_df['neurite loc'] = gene_ids.isin(neurite_new_fbgs)

gl_df = pd.read_csv('../../resources/glial_studies/glia-protrusion-localised-id-interest.txt', sep='\t')
glial_ids = gl_df['dmel_gene_id']
log_rate_df['glial protrusion'] = gene_ids.isin(glial_ids)

In [None]:
# All but one of the neurite-localized genes are protein coding (the other one
# is a pseudogene):
# log_rate_df.query('`neurite loc`')['biotype'].value_counts()
# All of the glial protrusion genes are protein coding:
# log_rate_df.query('`glial protrusion`')['biotype'].value_counts()
# To compare like with like, the glial and neural localization comparisons are
# restricted to protein-coding genes only.
is_coding = log_rate_df['biotype'] == 'protein_coding'
coding_rate_df = log_rate_df[is_coding].copy()

In [None]:
# Box plots of synthesis rate, decay rate, and total RNA for neurite-localized
# versus other protein-coding genes.
# The whisker is not exactly at 1.5*IQR, which is why the stars can sit too high:
# https://stackoverflow.com/questions/49139299/whisker-is-defined-as-1-5-iqr-how-could-two-whikers-in-plot-from-python-seabor
flag_col = 'neurite loc'
rate_vars = ['syn_rate', 'deg_rate', 'total_tpm']

fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
gs = fig.add_gridspec(ncols=3, nrows=1)
this_df = pd.melt(coding_rate_df, id_vars=[flag_col], value_vars=rate_vars)
is_total = this_df['variable'] == 'total_tpm'

# For each variable: the p-value for flagged vs unflagged genes, and the height
# of the taller of the two boxes (used to place the stars).
pvals, tops = {}, {}
for var in rate_vars:
    in_var = this_df['variable'] == var
    flagged = this_df.loc[in_var & this_df[flag_col], 'value'].values
    unflagged = this_df.loc[in_var & ~this_df[flag_col], 'value'].values
    # The alternative is spelled out because the default of mannwhitneyu differs
    # between SciPy releases (older releases return a one-sided p-value).
    _, pvals[var] = stats.mannwhitneyu(flagged, unflagged, alternative='two-sided')
    tops[var] = max(get_boxtop(this_df, col1=flag_col, val1=is_flagged, col2='variable', val2=var, val_col='value')
                    for is_flagged in (True, False))
syn_p, deg_p, tot_p = (pvals[var] for var in rate_vars)
syn_h, deg_h, tot_h = (tops[var] for var in rate_vars)

# Left panel (two thirds of the width): synthesis and decay rates.
ax1 = fig.add_subplot(gs[0, :2])
ax1 = PrettyBox(y='value', x='variable', hue=flag_col, data=this_df[~is_total], fliersize=0, ax=ax1)
ax1.set_ylim(-4, 4)
ax1.set_xlabel('')
ax1.set_ylabel(r'log$_{10}$ rate')
ax1.set_xticklabels(['synth', 'decay'])
# Kind of strange that syn_h appears to be higher above the IQR than the deg_h.
add_stars(-0.2, 0.2, syn_h+0.1, 0.1, syn_p, ax1)
add_stars(0.8, 1.2, deg_h+0.1, 0.1, deg_p, ax1)

# Right panel: total RNA, with its own y axis drawn on the right-hand side.
ax2 = fig.add_subplot(gs[0, 2:])
ax2 = PrettyBox(y='value', x='variable', hue=flag_col, data=this_df[is_total], fliersize=0, ax=ax2)
ax2.set_ylim(-2.5, 4)
ax2.set_ylabel(r'log$_{10}$ TPM')
ax2.set_xlabel('')
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position('right')
ax2.spines['left'].set_visible(False)
ax2.spines['right'].set_visible(True)
ax2.get_legend().remove()
ax2.set_xticklabels(['total RNA'])
add_stars(-0.2, 0.2, tot_h+0.1, 0.1, tot_p, ax2)
fig.savefig('%s.%s' % (os.path.join(outdir, 'neurite_loc'), out_fmt), dpi=out_dpi)

In [None]:
# Box plots of synthesis rate, decay rate, and total RNA for glial-protrusion
# versus other protein-coding genes.
# The whisker is not exactly at 1.5*IQR, which is why the stars can sit too high:
# https://stackoverflow.com/questions/49139299/whisker-is-defined-as-1-5-iqr-how-could-two-whikers-in-plot-from-python-seabor
id_var = 'glial protrusion'
rate_vars = ['syn_rate', 'deg_rate', 'total_tpm']

fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
gs = fig.add_gridspec(ncols=3, nrows=1)
this_df = pd.melt(coding_rate_df, id_vars=[id_var], value_vars=rate_vars)
is_total = this_df['variable'] == 'total_tpm'

# For each variable: the p-value for flagged vs unflagged genes, and the height
# of the taller of the two boxes (used to place the stars).
pvals, tops = {}, {}
for var in rate_vars:
    in_var = this_df['variable'] == var
    flagged = this_df.loc[in_var & this_df[id_var], 'value'].values
    unflagged = this_df.loc[in_var & ~this_df[id_var], 'value'].values
    # The alternative is spelled out because the default of mannwhitneyu differs
    # between SciPy releases (older releases return a one-sided p-value).
    _, pvals[var] = stats.mannwhitneyu(flagged, unflagged, alternative='two-sided')
    tops[var] = max(get_boxtop(this_df, col1=id_var, val1=is_flagged, col2='variable', val2=var, val_col='value')
                    for is_flagged in (True, False))
syn_p, deg_p, tot_p = (pvals[var] for var in rate_vars)
syn_h, deg_h, tot_h = (tops[var] for var in rate_vars)

# Left panel (two thirds of the width): synthesis and decay rates.
ax1 = fig.add_subplot(gs[0, :2])
ax1 = PrettyBox(y='value', x='variable', hue=id_var, data=this_df[~is_total], fliersize=0, ax=ax1)
ax1.set_ylim(-4, 4)
ax1.set_xlabel('')
ax1.set_ylabel(r'log$_{10}$ rate')
ax1.set_xticklabels(['synth', 'decay'])
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper right', borderaxespad=0, title='glial\nprotrusion')
# Kind of strange that syn_h appears to be higher above the IQR than the deg_h.
add_stars(-0.2, 0.2, syn_h+0.1, 0.1, syn_p, ax1)
add_stars(0.8, 1.2, deg_h+0.1, 0.1, deg_p, ax1)

# Right panel: total RNA, with its own y axis drawn on the right-hand side.
ax2 = fig.add_subplot(gs[0, 2:])
ax2 = PrettyBox(y='value', x='variable', hue=id_var, data=this_df[is_total], fliersize=0, ax=ax2)
ax2.set_ylim(-2.5, 4)
ax2.set_ylabel(r'log$_{10}$ TPM')
ax2.set_xlabel('')
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position('right')
ax2.spines['left'].set_visible(False)
ax2.spines['right'].set_visible(True)
ax2.get_legend().remove()
ax2.set_xticklabels(['total RNA'])
add_stars(-0.2, 0.2, tot_h+0.1, 0.1, tot_p, ax2)
fig.savefig('%s.%s' % (os.path.join(outdir, 'glial_loc'), out_fmt), dpi=out_dpi)

##### Also make a version of this plot which excludes ribosomal proteins because they might bias the decay rates
- Could the higher total RNA levels, synthesis rates, and decay rates be a reflection of detection bias?
- i.e. Higher expressed genes could be more likely to be detected as neurite enriched?

In [None]:
# Flag the ribosomal protein (RP) genes and build a copy of the coding-gene table without them.
# NOTE(review): res_dir is an absolute, user-specific path; the notebook only
# runs on a machine that has this exact directory.
res_dir = '/Users/mk/Desktop/Davislab_old/3.10_brain_ss/brain_figures/resources/'
rp_file = os.path.join(res_dir, 'cytorp_dmel632.txt')
# The RP IDs are read from the first column of a headerless, tab-separated file.
rp_table = pd.read_csv(rp_file, sep='\t', header=None)
rp_ids = set(rp_table[0].tolist())

# NOTE(review): the names say 'dmel332' while the RP file is 'dmel632' and con_d
# is loaded from the dmel628 conversion table -- confirm that the releases match.
dmel332_ids = update_old_ids(coding_rate_df.index, con_d)
coding_rate_df['dmel332_ids'] = dmel332_ids
is_rp = coding_rate_df['dmel332_ids'].isin(rp_ids)
coding_rate_df['RP'] = is_rp
coding_rate_df_norp = coding_rate_df[~is_rp].copy()

In [None]:
# Box plots of synthesis rate, decay rate, and total RNA for neurite-localized
# versus other protein-coding genes, with ribosomal protein genes excluded.
# The whisker is not exactly at 1.5*IQR, which is why the stars can sit too high:
# https://stackoverflow.com/questions/49139299/whisker-is-defined-as-1-5-iqr-how-could-two-whikers-in-plot-from-python-seabor
flag_col = 'neurite loc'
rate_vars = ['syn_rate', 'deg_rate', 'total_tpm']

fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
gs = fig.add_gridspec(ncols=3, nrows=1)
this_df = pd.melt(coding_rate_df_norp, id_vars=[flag_col], value_vars=rate_vars)
is_total = this_df['variable'] == 'total_tpm'

# For each variable: the p-value for flagged vs unflagged genes, and the height
# of the taller of the two boxes (used to place the stars).
pvals, tops = {}, {}
for var in rate_vars:
    in_var = this_df['variable'] == var
    flagged = this_df.loc[in_var & this_df[flag_col], 'value'].values
    unflagged = this_df.loc[in_var & ~this_df[flag_col], 'value'].values
    # The alternative is spelled out because the default of mannwhitneyu differs
    # between SciPy releases (older releases return a one-sided p-value).
    _, pvals[var] = stats.mannwhitneyu(flagged, unflagged, alternative='two-sided')
    tops[var] = max(get_boxtop(this_df, col1=flag_col, val1=is_flagged, col2='variable', val2=var, val_col='value')
                    for is_flagged in (True, False))
syn_p, deg_p, tot_p = (pvals[var] for var in rate_vars)
syn_h, deg_h, tot_h = (tops[var] for var in rate_vars)

# Left panel (two thirds of the width): synthesis and decay rates.
ax1 = fig.add_subplot(gs[0, :2])
ax1 = PrettyBox(y='value', x='variable', hue=flag_col, data=this_df[~is_total], fliersize=0, ax=ax1)
ax1.set_ylim(-4, 4)
ax1.set_xlabel('')
ax1.set_ylabel(r'log$_{10}$ rate')
ax1.set_xticklabels(['synth', 'decay'])
# Kind of strange that syn_h appears to be higher above the IQR than the deg_h.
add_stars(-0.2, 0.2, syn_h+0.1, 0.1, syn_p, ax1)
add_stars(0.8, 1.2, deg_h+0.1, 0.1, deg_p, ax1)

# Right panel: total RNA, with its own y axis drawn on the right-hand side.
ax2 = fig.add_subplot(gs[0, 2:])
ax2 = PrettyBox(y='value', x='variable', hue=flag_col, data=this_df[is_total], fliersize=0, ax=ax2)
ax2.set_ylim(-2.5, 4)
ax2.set_ylabel(r'log$_{10}$ TPM')
ax2.set_xlabel('')
ax2.yaxis.tick_right()
ax2.yaxis.set_label_position('right')
ax2.spines['left'].set_visible(False)
ax2.spines['right'].set_visible(True)
ax2.get_legend().remove()
ax2.set_xticklabels(['total RNA'])
add_stars(-0.2, 0.2, tot_h+0.1, 0.1, tot_p, ax2)
fig.savefig('%s.%s' % (os.path.join(outdir, 'neurite_loc_norp2'), out_fmt), dpi=out_dpi)