#### CTS stability
- Plot stability of RNAs enriched in specific cell types.
- Use the L3 brain single-cell sequencing data from the Cocanougher 2022 paper.
- Show the fraction of TFs, RBPs, etc. in this dataset. Unfortunately, we cannot do GO analysis on these genes because we do not know the background set.

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict
import warnings

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox, get_group_Ns, enrich_table
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Load the RNA stability data and flag each gene's membership in the gene sets of interest
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

# Stability measurements: only the degradation rate and stability percentile are needed here
rate_df = load_dataset(
    '../Figures/summary_files/INSPEcT_rates.csv',
    '../Figures/summary_files/brain4sU_passed.csv',
)
rate_df = rate_df[['deg_rate', 'stab_percentile']].copy()

# Gene ID lists: Flybase transcription factors, GO slim based (m)RNA-binding genes,
# and the glial protrusion-localised genes of interest
TF_file = '../Figures/genesets/all_TFs.csv'
RBPs_go_file = '../Figures/GO/ens/RNA binding.txt'
mRBPs_go_file = '../Figures/GO/ens/mRNA binding.txt'
gial_file = '../../resources/glial_studies/glia-protrusion-localised-id-interest.txt'

tfs = set(pd.read_csv(TF_file, header=None)[0])
rbps = set(pd.read_csv(RBPs_go_file, header=None)[0])
mrbps = set(pd.read_csv(mRBPs_go_file, header=None)[0])
glial = set(pd.read_csv(gial_file, sep='\t')['dmel_gene_id'])
# RNA-binding genes that are not also in the mRNA-binding set
rbps_only = rbps - mrbps

# One boolean column per gene set, keyed on the gene IDs in the rate_df index
membership = {
    'TF': tfs,
    'RBP_all': rbps,
    'mRBP': mrbps,
    'RBP': rbps_only,
    'glial': glial,
}
for column, gene_ids in membership.items():
    rate_df[column] = rate_df.index.isin(gene_ids)

gene_groups = ['TF', 'RBP', 'mRBP', 'glial']


In [None]:
# Report the number of gene IDs in each gene set.
# Author's note: mrbps is a complete subset of rbps; rbps_only holds the rbps which are not mrbps.
geneset_sizes = (
    ('tfs', tfs),
    ('rbps', rbps),
    ('mrbps', mrbps),
    ('rbps_only', rbps_only),
    ('glial', glial),
)
for label, gene_ids in geneset_sizes:
    print(f'num {label}', len(gene_ids))

In [None]:
# Load the cell-type-specific gene tables; the first CSV column becomes the index
def _read_marker_table(filename):
    """Read one marker table from outdir, using its first column as the index."""
    return pd.read_csv(os.path.join(outdir, filename), index_col=0)


cts_df = _read_marker_table('corrales_celltypes.csv')
prog_df = _read_marker_table('dillon_progenitor.csv')
# Drop the rows labelled as low quality from the progenitor table
prog_df = prog_df[prog_df['celltype'] != 'low quality']
neuron_df = _read_marker_table('dillon_neuron.csv')
glia_df = _read_marker_table('dillon_glia.csv')

In [None]:
# For each of the three atlas tables, print how many cell types it has and their names
for atlas_df in (prog_df, neuron_df, glia_df):
    atlas_celltypes = atlas_df['celltype'].unique()
    print(f'n celltypes {len(atlas_celltypes)}\n{atlas_celltypes}\n')

In [None]:
# Count the annotated genes, the cell-type-specific (CTS) genes and the CTS TFs,
# then print how the sets overlap.
# NOTE(review): gffutils_db is not defined in any visible cell -- presumably it comes
# from the `from plot_helpers import *` star import; confirm.
db = gffutils.FeatureDB(gffutils_db)
ngenes = sum(1 for _ in db.features_of_type('gene'))

cts = set(cts_df.index.unique())
cts_tfs = cts & tfs

# Would be interesting to represent this visually somehow?
n_cts, n_cts_tfs, n_tfs = len(cts), len(cts_tfs), len(tfs)
print('num CTS RNAs', n_cts)
print('num CTS TFs', n_cts_tfs)
print('num TFs', n_tfs)
print('frac CTS_TF/TF', n_cts_tfs / n_tfs)
print('frac TF/CTS', n_cts_tfs / n_cts)
print('frac CTS/genome', n_cts / ngenes)
print('frac TF/genome', n_tfs / ngenes)

In [None]:
# Tall swarm plot of stability percentile per cell type (cts_df), with TF mRNAs highlighted
from plotting_fxns import enrich_heatmap, sc_swarmplot

hstart = 0.08
h = 0.95 - hstart
fig = plt.figure(figsize=(dfig*1.7, dfig*4))
ax = fig.add_axes((0.31, hstart, 0.29, h))

# Left-merge on the gene index, then keep only the genes that received a cell type
merged_df = rate_df.merge(cts_df, left_index=True, right_index=True, how='left')
big_df = merged_df.dropna(subset=['celltype'])

swarm_kwargs = dict(
    data=big_df,
    all_genes=rate_df,
    x='stab_percentile',
    y='celltype',
    hue='TF',
    hue_name='TF mRNAs',
    x_lab='stability percentile',
    y_lab='cell type (num genes)',
    ax=ax,
    palette=[color_dict['grey'], 'navy'],
    s=2.5,
    fig=fig,
)

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax, res_tf, p_tf = sc_swarmplot(**swarm_kwargs)

plt.savefig(f"{os.path.join(outdir, 'cts_tf_swarm')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Same swarm plot with RBP mRNAs highlighted, followed by a separate enrichment heatmap
# covering both the RBP and mRBP classes.
# The figure is a little wider than the TF version to accommodate the 4-column heatmap.
hstart = 0.08
h = 0.95 - hstart
fig = plt.figure(figsize=(dfig*2, dfig*4))
ax = fig.add_axes((0.26, hstart, 0.29, h))

swarm_kwargs = dict(
    data=big_df,
    all_genes=rate_df,
    x='stab_percentile',
    y='celltype',
    hue='RBP',
    hue_name='RBP mRNAs',
    x_lab='stability percentile',
    y_lab='cell type (num genes)',
    ax=ax,
    palette=[color_dict['grey'], 'navy'],
    s=2.5,
    fig=fig,
    enrich_hm=False,  # the heatmap is drawn by the enrich_heatmap call below instead
)

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax, res_rbp, p_rbp = sc_swarmplot(**swarm_kwargs)

# res_rbp is overwritten with the heatmap result; p_rbp still comes from the swarm plot call
rbp_classes = ['RBP', 'mRBP']
res_rbp = enrich_heatmap(
    data=big_df,
    all_genes=rate_df,
    x='stab_percentile',
    y='celltype',
    hue=rbp_classes,
    y_lab1='fraction of genes',
    y_lab2=r'-log$_{10}$ p-value',
    hstart=hstart,
    lstart=0.61,
    fig=plt.gcf(),
    xticklabs1=['RBP', 'mRBP'],
    xticklabs2=['RBP', 'mRBP'],
    hm_width=0.09,
)

plt.savefig(f"{os.path.join(outdir, 'cts_rbp_swarm')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Short display labels for cell type names, matching the format of the ones
# from Corrales et al. Names without an entry here are left unchanged by the caller.
name_convert = {
    'immature neurons': 'Immat N',
    'new-born neurons': 'Newborn N',
    'cortex/chiasm glia': 'Cortex/chiasm\nGl',
    'perineural glia': 'Perineural\nGl',
    'astrocytes/neuropil glia': 'Astrocytes/\nneuropil Gl',
    'subperineural glia': 'Subperineural\nGl',
    'cholinergic': 'Chol N',
    'unannotated': 'Unann N',
    'GABAergic': 'GABA N',
    'Glutamatergic': 'Glut N',
    'undifferentiated': 'Undiff N',
    'motor neurons': 'Motor N',
    'kenyon cells gamma': r'KC $\gamma$',
    'monoaminergic': 'Mono N',
    'peptidergic': 'Pept N',
    'octopaminergic': 'Octop N',
    'neurosecretory cells': 'Neurosec\ncells',
    'Quiescent NBs': 'Quiescent\nNBs',
}

In [None]:
# Pool the progenitor, neuron and glia tables, then build a copy whose cell type
# names are replaced by the short labels in name_convert (unmapped names are kept as-is)
dillon_df = pd.concat([prog_df, neuron_df, glia_df])
short_names = dillon_df['celltype'].map(lambda name: name_convert.get(name, name))
# Drop-then-assign keeps 'celltype' as the last column
dillon_df2 = dillon_df.drop(columns='celltype').assign(celltype=short_names)

In [None]:
# Display the original (unconverted) cell type names present in the pooled dillon_df table
dillon_df['celltype'].unique()

In [None]:
# Swarm plot of stability percentile for the pooled dillon_df2 cell types, TF mRNAs highlighted.
# All atlases are shown together -- easiest to make.
# TODO: add more descriptive labels, like 'neuron' after some of the labels
from plotting_fxns import enrich_heatmap, sc_swarmplot

hstart = 0.08
h = 0.95 - hstart
fig = plt.figure(figsize=(dfig*1.85, dfig*5.18))
ax = fig.add_axes((0.31, hstart, 0.29, h))

# Left-merge on the gene index, then keep only the genes that received a cell type
merged_dillon = rate_df.merge(dillon_df2, left_index=True, right_index=True, how='left')
big_df_dillon = merged_dillon.dropna(subset=['celltype'])

swarm_kwargs = dict(
    data=big_df_dillon,
    all_genes=rate_df,
    x='stab_percentile',
    y='celltype',
    hue='TF',
    hue_name='TF mRNAs',
    x_lab='stability percentile',
    y_lab='cell type (num genes)',
    ax=ax,
    palette=[color_dict['grey'], 'navy'],
    s=2.5,
    fig=fig,
)

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax, res_tf_dillon, p_tf_dillon = sc_swarmplot(**swarm_kwargs)

plt.savefig(f"{os.path.join(outdir, 'dillon_tf_swarm')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Box plot of RNA stability percentile for TF vs. non-TF genes, split by whether the
# gene is cell-type-specific (CTS), with Mann-Whitney U significance stars.
from plotting_fxns import get_boxtop, add_stars, get_box_coords

celltypes = cts_df['celltype'].unique()
# A gene counts as CTS when its ID appears in the cts_df index
rate_df['CTS'] = rate_df.index.isin(cts_df.index)

fig = plt.figure(figsize=(dfig*1.7, dfig*1), constrained_layout=True)
ax = PrettyBox(data=rate_df, x='TF', y='stab_percentile', hue='CTS', fliersize=0, width=0.4)
ax.set_xlabel('')
new_labels = get_group_Ns(rate_df, 'TF', hue='CTS', ticklabels=['non-TF', 'TF'])
ax.set_xticklabels(new_labels)
# Move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)

handles, labels = ax.get_legend_handles_labels()
legend = ax.legend(handles, ['ubiquitous', 'cell type specific'], bbox_to_anchor=(1, 0.5),
                   loc='center left', title='RNA class')
# Prevent the legend box from turning black. The colour has to be set on the legend that
# is actually drawn: every ax.legend() call builds a new legend, so styling an earlier
# one (as was done before) is discarded.
legend.get_frame().set_fc((1.0, 1.0, 1.0, 0.8))

# Each entry: (CTS value, x index of the non-TF box, x index of the TF box, star offset).
# First comparison: non-CTS, TF vs. non-TF
# Second comparison: CTS, TF vs. non-TF
comparisons = [
    (False, 0, 2, 2),
    (True, 1, 3, 10),
]
pvals = []
for is_cts, x_left, x_right, offset in comparisons:
    cts_term = 'CTS' if is_cts else '~CTS'
    non_tf_rates = rate_df.query(f'{cts_term} & ~TF')['deg_rate']
    tf_rates = rate_df.query(f'{cts_term} & TF')['deg_rate']
    # State the alternative explicitly: the default differs between SciPy versions
    _, pval = stats.mannwhitneyu(non_tf_rates, tf_rates, alternative='two-sided')
    pvals.append(pval)

    # Start the bracket above the taller of the two boxes being compared
    box_top = max(
        get_boxtop(rate_df, col1='CTS', val1=is_cts, col2='TF', val2=is_tf, val_col='stab_percentile')
        for is_tf in (False, True)
    )
    add_stars(x_left, x_right, starty=box_top+offset, height=2.5, p=pval, ax=ax)

p1, p2 = pvals
ax.set_ylim(-2, 100)

print('diff. ubiq tf vs non-tf %s' % p1)
print('diff. cts tf vs. non-tf %s' % p2)

plt.savefig('%s.%s' % (os.path.join(outdir, 'stab_cts_TFs'), out_fmt), dpi=out_dpi)

In [None]:
# Build the table used to plot RBP and mRBP stability side by side.
#
# 'RBP' here means RNA-binding genes that are NOT mRNA-binding (rbps_only), so the RBP
# and mRBP flags never overlap and every gene falls into exactly one category. The
# previous version duplicated the mRBP rows; because the duplicated genes also stayed
# in the main table with RBP=False and mRBP=False, each mRBP gene was plotted a second
# time as 'other' and counted twice in the non-RBP group of the statistical tests.
# NOTE(review): this changes the 'other' box and the p-values -- confirm it is the intent.
rate_df3 = rate_df[['deg_rate', 'stab_percentile', 'mRBP', 'RBP', 'CTS']].copy()
# Guarantee real boolean columns so that `~RBP` / `~mRBP` in DataFrame.query negate correctly
rate_df3[['mRBP', 'RBP']] = rate_df3[['mRBP', 'RBP']].astype(bool)

# Check that each gene belongs to only one category
if (rate_df3['mRBP'] & rate_df3['RBP']).any():
    raise ValueError('RBP and mRBP flags overlap; every gene must belong to a single category')

# Collapse the two boolean flags into one label column; genes with neither flag are 'other'
rate_df3['category'] = np.select(
    [rate_df3['mRBP'].to_numpy(), rate_df3['RBP'].to_numpy()],
    ['mRBP', 'RBP'],
    default='other',
)

In [None]:
# Box plot of RNA stability percentile for other / RBP / mRBP genes, split by whether the
# gene is cell-type-specific (CTS), with Mann-Whitney U significance stars.
from plotting_fxns import get_boxtop, add_stars, get_box_coords

fig = plt.figure(figsize=(dfig*2, dfig*1.35), constrained_layout=True)

category_order = ['other', 'RBP', 'mRBP']
ax = PrettyBox(data=rate_df3, x='category', y='stab_percentile', order=category_order,
               hue='CTS', fliersize=0, width=0.4)
ax.set_xlabel('')
new_labels = get_group_Ns(rate_df3, 'category', hue='CTS', order=['other', 'RBP', 'mRBP'],
                          ticklabels=['other', 'RBP', 'mRBP'])
ax.set_xticklabels(new_labels)
# Move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)

handles, labels = ax.get_legend_handles_labels()
# The legend sits to the right of the axes: placed above the axes, its top gets
# clipped off, even with constrained_layout.
legend = ax.legend(handles, ['ubiquitous', 'cell type specific'], bbox_to_anchor=(1, 0.5),
                   loc='center left', title='RNA class')
# Prevent the legend box from turning black. The colour has to be set on the legend that
# is actually drawn: every ax.legend() call builds a new legend, so styling an earlier
# one (as was done before) is discarded.
legend.get_frame().set_fc((1.0, 1.0, 1.0, 0.8))

# Note that for the comparison between other and mRBPs -- the other group is also depleted
# of RBPs in the graph but for the stats, we will compare all non-mRBPs to mRBPs.
#
# Each entry: (CTS value, gene class column, x index of left box, x index of right box,
# star offset). The offsets are staggered so the four brackets do not overlap.
comparisons = [
    (False, 'RBP', 0, 2, 2),    # First comparison: non-CTS, RBP vs. non-RBP
    (False, 'mRBP', 0, 4, 10),  # Second comparison: non-CTS, mRBP vs. non-mRBP
    (True, 'RBP', 1, 3, 18),    # Third comparison: CTS, RBP vs. non-RBP
    (True, 'mRBP', 1, 5, 26),   # Fourth comparison: CTS, mRBP vs. non-mRBP
]
pvals = []
for is_cts, gene_class, x_left, x_right, offset in comparisons:
    cts_term = 'CTS' if is_cts else '~CTS'
    outside_rates = rate_df3.query(f'{cts_term} & ~{gene_class}')['deg_rate']
    inside_rates = rate_df3.query(f'{cts_term} & {gene_class}')['deg_rate']
    # State the alternative explicitly: the default differs between SciPy versions
    _, pval = stats.mannwhitneyu(outside_rates, inside_rates, alternative='two-sided')
    pvals.append(pval)

    # Start the bracket above the taller of the two groups being compared
    box_top = max(
        get_boxtop(rate_df3, col1='CTS', val1=is_cts, col2=gene_class, val2=in_class,
                   val_col='stab_percentile')
        for in_class in (False, True)
    )
    add_stars(x_left, x_right, starty=box_top+offset, height=2.5, p=pval, ax=ax)

p1, p2, p3, p4 = pvals
ax.set_ylim(-2, 100)

print('diff. ubiq other vs RBP %s' % p1)
print('diff. ubiq other vs. mRBP %s' % p2)
print('diff. cts other vs. RBP %s' % p3)
print('diff. cts other vs. mRBP %s' % p4)

plt.savefig('%s.%s' % (os.path.join(outdir, 'stab_cts_RBPs'), out_fmt), dpi=out_dpi)

In [None]:
# Number of rows in rate_df3 flagged as both cell-type-specific and mRBP
rate_df3.query('CTS & mRBP').shape[0]


In [None]:
# Median stability percentile for all genes and for the CTS / TF subsets
stab_pct = rate_df['stab_percentile']
is_cts = rate_df['CTS']
is_tf = rate_df['TF']

median_groups = [
    ('all genes', stab_pct),  # overall percentile median = 50, as expected
    ('CTS genes', stab_pct[is_cts]),
    ('non-CTS genes', stab_pct[~is_cts]),
    ('TF genes', stab_pct[is_tf]),
    ('non-CTS TF genes', stab_pct[is_tf & ~is_cts]),
    ('CTS & TF genes', stab_pct[is_tf & is_cts]),
]
for group_label, group_values in median_groups:
    print('stability of %s %s' % (group_label, group_values.median()))

In [None]:
# Write out the 10% least stable and 10% most stable CTS genes, plus a background list.
# The cutoff is 10% of the CTS genes, not of all genes: previously I tried the top and
# bottom 10% of stability overall, but this found only 26 genes in the bottom 10% of genes.
# Use as bg set, all the genes with stability measurements
bg_genes = rate_df.index

geneset_outdir = os.path.join(outdir, 'genesets')
os.makedirs(geneset_outdir, exist_ok=True)

cts_genes = rate_df.query('CTS')
num_genes = int(round(len(cts_genes)/10))
sorted_cts = cts_genes.sort_values(by='stab_percentile')  # ascending

# The two variable names used to be swapped relative to their contents (the files
# written were, and still are, the same): the head of the ascending sort holds the lowest
# stability percentiles and the tail the highest.
# Assumes a higher stab_percentile means a more stable RNA, as the file names imply.
unstab_genes10 = sorted_cts.head(n=num_genes).index
stab_genes10 = sorted_cts.tail(n=num_genes).index

# Write the output genelists: one gene ID per line, no header and no index column
genelists = {
    'CTS_10unstable_genes.csv': unstab_genes10,
    'CTS_10stable_genes.csv': stab_genes10,
    'bg_genes.csv': bg_genes,
}
for filename, gene_ids in genelists.items():
    pd.DataFrame(gene_ids).to_csv(os.path.join(geneset_outdir, filename), header=False, index=False)

##### Now show the p-values for the stability distribution differences and enrichment with TFs or RBPs

In [None]:
# Print the p = 0.05 cutoff on the -log10 scale
pval_cutoff = 0.05
print('pval co', -math.log(pval_cutoff, 10))

In [None]:
# Pass the results and p-values returned by the TF swarm plot (cts_df cell types) to enrich_table
enrich_table(res_tf, p_tf)

In [None]:
# Same for the RBP analysis. Note that res_rbp comes from the enrich_heatmap call
# (hue=['RBP', 'mRBP']) while p_rbp comes from the sc_swarmplot call (hue='RBP').
enrich_table(res_rbp, p_rbp)

In [None]:
# Pass the results and p-values returned by the TF swarm plot of the pooled dillon_df2 cell types to enrich_table
enrich_table(res_tf_dillon, p_tf_dillon)