#### CTS stability
- Plot stability of RNAs enriched in specific cell types.
- Use the L3 brain single-cell sequencing data from the Cocanougher 2022 paper
- Show the fraction of TFs, RBPs, etc. in this dataset. Unfortunately, we cannot do a GO analysis on these genes because the background set is unknown

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict
import warnings

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Set up the output directory, load the stability table and flag gene classes
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

# Stability data, loaded through the project helper from the two summary files
rate_df = load_dataset('../Figures/summary_files/INSPEcT_rates.csv', '../Figures/summary_files/brain4sU_passed.csv')

# Gene ID lists: the Flybase TF gene set, the GO slim based RNA binding and
# mRNA binding sets, and the glial protrusion-localised genes of interest
TF_file = '../Figures/genesets/all_TFs.csv'
RBPs_go_file = '../Figures/GO/ens/RNA binding.txt'
mRBPs_go_file = '../Figures/GO/ens/mRNA binding.txt'
gial_file = '../../resources/glial_studies/glia-protrusion-localised-id-interest.txt'

# These three files are read without a header row; the IDs are in the first column
tfs, rbps, mrbps = (
    set(pd.read_csv(id_file, header=None)[0].values)
    for id_file in (TF_file, RBPs_go_file, mRBPs_go_file)
)
# The glial table is tab-separated; the IDs are taken from its 'dmel_gene_id' column
glial = set(pd.read_csv(gial_file, sep='\t')['dmel_gene_id'].values)
# RNA binding genes that are not in the mRNA binding set
rbps_only = rbps - mrbps

# One boolean membership column per gene class, tested against the rate_df index
class_members = (
    ('TF', tfs),
    ('RBP_all', rbps),
    ('mRBP', mrbps),
    ('RBP', rbps_only),
    ('glial', glial),
)
for class_name, member_ids in class_members:
    rate_df[class_name] = rate_df.index.isin(member_ids)
gene_groups = ['TF', 'RBP', 'mRBP', 'glial']


In [None]:
# Report the number of IDs in each gene set.
# Author's note: mrbps is a complete subset of rbps; rbps_only is the extra
# category of rbps which are not mrbps.
set_sizes = (
    ('num tfs', tfs),
    ('num rbps', rbps),
    ('num mrbps', mrbps),
    ('num rbps_only', rbps_only),
    ('num glial', glial),
)
for set_label, gene_set in set_sizes:
    print(set_label, len(gene_set))

In [None]:
# Genes enriched in specific cell types (single-cell sequencing of L3 brains).
# The first CSV column becomes the index and is labelled 'gene'.
cts_path = os.path.join(outdir, 'cts_celltypes_log1.00.csv')
cts_df = pd.read_csv(cts_path, index_col=0).rename_axis('gene')

# Same table keyed on cell type, so all genes of one cell type can be looked up
cts_by_gene = cts_df.reset_index()
cts_df2 = cts_by_gene.set_index('celltype')

In [None]:
# Count annotated genes, cell type markers and cell-type-specific (CTS) TFs
db = gffutils.FeatureDB(gffutils_db)
genes = db.features_of_type('gene')
# Walk the feature iterator once to count the 'gene' features
ngenes = 0
for _gene in genes:
    ngenes += 1

cts = cts_df.index.unique()
cts_tfs = cts.intersection(tfs)

#Would be interesting to represent this visually somehow?
n_cts, n_cts_tfs, n_tfs = len(cts), len(cts_tfs), len(tfs)
overlap_summary = [
    ('num CTS RNAs', n_cts),
    ('num CTS TFs', n_cts_tfs),
    ('num TFs', n_tfs),
    ('frac CTS_TF/TF', n_cts_tfs/n_tfs),
    ('frac TF/CTS', n_cts_tfs/n_cts),
    ('frac CTS/genome', n_cts/ngenes),
    ('frac TF/genome', n_tfs/ngenes),
]
for stat_label, stat_value in overlap_summary:
    print(stat_label, stat_value)

In [None]:
# Working copy of the rates with one boolean column per cell type
rate_df2 = rate_df[['deg_rate', 'stab_percentile'] + gene_groups].copy()

cts_names = cts_df2.index.unique()
for ct in cts_names:
    # Selecting rows with a list always yields a Series of genes,
    # also when the cell type has a single gene
    ct_genes = cts_df2.loc[[ct], 'gene']
    rate_df2[ct] = rate_df2.index.isin(ct_genes)

# True for genes flagged in at least one cell type
in_any_celltype = rate_df2[cts_names].any(axis=1)
print('genes found specifically in at least one cell type', in_any_celltype.value_counts())


In [None]:
# Long table: one row per (gene, cell type) pair that has stability data.
# Rows without a cell type after the left merge are dropped.
stab_cols = ['deg_rate', 'stab_percentile'] + gene_groups
big_df = rate_df2[stab_cols].merge(cts_df, left_on='gene', right_index=True, how='left')
big_df = big_df.dropna(subset=['celltype'])

# Per cell type: number of genes in each gene class, total rows, and the class fractions
# https://stackoverflow.com/questions/41709239/dividing-dataframe-column-by-matching-index-in-another-dataframe
by_celltype = big_df.groupby('celltype')
cat_counts = by_celltype[gene_groups].sum()
cat_total = by_celltype[gene_groups].size()
cat_counts['total'] = cat_total
for group_name in gene_groups:
    cat_counts[f'{group_name}_frac'] = cat_counts[group_name]/cat_counts['total']

In [None]:
from plotting_fxns import sc_swarmplot, enrich_heatmap

# Swarmplot of stability percentile per cell type with the TFs overlaid,
# plus the TF 'counts' / 'enrichment' heatmap panel drawn on the same figure
fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
hstart = 0.22
h = 0.95 - hstart
ax = fig.add_axes((0.32, hstart, 0.29, h))

# Cell types sorted by median stability percentile
# (computed here but not passed to the plotting helpers in this cell)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

# Arguments common to both plotting helpers
tf_plot_args = dict(data=big_df, all_genes=rate_df2, x='stab_percentile', y='celltype')

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax = sc_swarmplot(hue='TF', hue_name='TF mRNAs', x_lab='stability percentile',
                      y_lab='cell type (num genes)', ax=ax, **tf_plot_args)

res_tf = enrich_heatmap(hue=['TF'], y_lab1='fraction of genes', y_lab2='-log$_{10}$ p-value',
                        hstart=hstart, lstart=0.65, fig=fig, xticklabs1=['counts'],
                        xticklabs2=['enrichment'], hm_xlab=False, **tf_plot_args)

plt.savefig(f"{os.path.join(outdir, 'cts_tf_swarm3')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Swarmplot of stability percentile per cell type with the glial protrusion
# genes overlaid -- for Ilan
fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
hstart = 0.22
h = 0.95 - hstart
ax = fig.add_axes((0.32, hstart, 0.29, h))

# Cell types sorted by median stability percentile
# (computed here but not passed to the plotting helper in this cell)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

glial_swarm_args = dict(
    data=big_df,
    all_genes=rate_df2,
    x='stab_percentile',
    y='celltype',
    hue='glial',
    hue_name='glial protrusion mRNAs',
    x_lab='stability percentile',
    y_lab='cell type (num genes)',
)

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax = sc_swarmplot(ax=ax, **glial_swarm_args)

# This figure is saved at a fixed 600 dpi rather than out_dpi
plt.savefig(f"{os.path.join(outdir, 'cts_glial_swarm2')}.{out_fmt}", dpi=600)

In [None]:
# Swarmplot of stability percentile per cell type with the RBPs overlaid,
# plus a two-column (RBP, mRBP) heatmap panel drawn on the same figure
fig = plt.figure(figsize=(dfig*1.9, dfig*1.5))
hstart = 0.22
h = 0.95 - hstart
ax = fig.add_axes((0.28, hstart, 0.29, h))

# Cell types sorted by median stability percentile
# (computed here but not passed to the plotting helpers in this cell)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

# Arguments common to both plotting helpers
rbp_plot_args = dict(data=big_df, all_genes=rate_df2, x='stab_percentile', y='celltype')

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax = sc_swarmplot(hue='RBP', hue_name='RBPs', x_lab='stability percentile',
                      y_lab='cell type (num genes)', ax=ax, **rbp_plot_args)

res_rbp = enrich_heatmap(hue=['RBP', 'mRBP'], y_lab1='fraction of genes',
                         y_lab2='-log$_{10}$ p-value', hstart=hstart, lstart=0.61, fig=fig,
                         xticklabs1=['RBP', 'mRBP'], xticklabs2=['RBP', 'mRBP'],
                         hm_width=0.09, **rbp_plot_args)

plt.savefig(f"{os.path.join(outdir, 'cts_rbp_swarm')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Stand-alone heatmap figure for the RBP and mRBP columns
fig = plt.figure(figsize=(dfig*1.2, dfig*1.5))
# NOTE(review): hstart and h are assigned here, but the call below passes
# hstart=0.22 as a literal -- confirm which value is intended
hstart = 0.15
h = 0.95 - hstart

# Layout options for the heatmap and its colour bar
heatmap_layout = dict(lstart=0.5, hstart=0.22, hm_width=0.1, cb_width=0.03, cbar_lab_sp=6)

# Plot heatmap of the counts and enrichment values
arrays = enrich_heatmap(
    data=big_df,
    all_genes=rate_df2,
    x='stab_percentile',
    y='celltype',
    hue=['RBP', 'mRBP'],
    y_lab1='fraction of genes',
    y_lab2='-log$_{10}$ p-value',
    fig=fig,
    xticklabs1=['RBP', 'mRBP'],
    xticklabs2=['RBP', 'mRBP'],
    ylabels=True,
    **heatmap_layout,
)

plt.savefig(f"{os.path.join(outdir, 'cts_rbps')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Swarmplot of stability percentile per cell type with the glial protrusion
# genes overlaid, plus the glial 'counts' / 'enrichment' heatmap panel -- for Ilan
fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
hstart = 0.22
h = 0.95 - hstart
ax = fig.add_axes((0.32, hstart, 0.29, h))

# Cell types sorted by median stability percentile
# (computed here but not passed to the plotting helpers in this cell)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

# Arguments common to both plotting helpers
glial_plot_args = dict(data=big_df, all_genes=rate_df2, x='stab_percentile', y='celltype')

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax = sc_swarmplot(hue='glial', hue_name='glial protrusion mRNAs', x_lab='stability percentile',
                      y_lab='cell type (num genes)', ax=ax, **glial_plot_args)

enrich_heatmap(hue=['glial'], y_lab1='fraction of genes', y_lab2='-log$_{10}$ p-value',
               lstart=0.65, hstart=0.22, fig=fig, xticklabs1=['counts'],
               xticklabs2=['enrichment'], hm_xlab=False, **glial_plot_args)

plt.savefig(f"{os.path.join(outdir, 'cts_glial_swarm')}.{out_fmt}", dpi=out_dpi)

In [None]:
#We cannot do a proper GO analysis on this data since it is unclear what is the background set.
#However, we could match these to the GO_slim_categories to see what they are enriched for as a rough idea.
# Rows of big_df below the 25th / above the 75th stability percentile
low_stab = big_df[big_df['stab_percentile'] < 25].copy()
hi_stab = big_df[big_df['stab_percentile'] > 75].copy()

# Map each GO slim category (file name without '.txt') to its set of gene IDs.
# Each file is read without a header; the IDs are in the first column.
slim_dict = {}
indir = '../Figures/GO/ens/'
txt_suffix = '.txt'
for file in os.listdir(indir):
    # str.rstrip('.txt') would strip every trailing '.', 't' or 'x' character
    # (e.g. 'transport.txt' -> 'transpor'), so remove the suffix explicitly
    category = file[:-len(txt_suffix)] if file.endswith(txt_suffix) else file
    slim_dict[category] = set(pd.read_csv(os.path.join(indir, file), header=None)[0].tolist())

# Unique index values of the low- and high-stability rows
hi_genes = set(hi_stab.index.tolist())
low_genes = set(low_stab.index.tolist())
n_low = len(low_genes)
n_hi = len(hi_genes)

# Fraction of the low / high stability genes that fall in each category
d = {
    'low': {category: len(low_genes.intersection(ids))/n_low for category, ids in slim_dict.items()},
    'hi': {category: len(hi_genes.intersection(ids))/n_hi for category, ids in slim_dict.items()},
}
pd.DataFrame.from_dict(d, orient='columns')

In [None]:
#Plot the stability percentile of TFs for specific cell types
from plotting_fxns import get_boxtop, add_stars, get_box_coords

# A gene is CTS when it is flagged for at least one cell type
celltypes = cts_df['celltype'].unique()
rate_df2['CTS'] = rate_df2[celltypes].any(axis=1)

fig = plt.figure(figsize=(dfig*0.75, dfig*1.5), constrained_layout=True)
ax = PrettyBox(data=rate_df2, x='TF', y='stab_percentile', hue='CTS', fliersize=0, width=0.4)
ax.set_xlabel('')
ax.set_xticklabels(['non-TF', 'TF'])
#move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)
handles, labels = ax.get_legend_handles_labels()
#prevent legend box from turning black.
legend_frame = ax.legend().get_frame()
legend_frame.set_fc((1.0,1.0,1.0,0.8))
ax.legend(handles, ['ubiquitous', 'cell type specific'], bbox_to_anchor=(0.5, 1.15), loc='lower center', title='RNA class')

# Degradation rates split by CTS and TF status
is_cts = rate_df2['CTS']
is_tf = rate_df2['TF']
deg_rates = rate_df2['deg_rate']

# First comparison: non-CTS, TF vs. non-TF
d1a = deg_rates[~is_cts & ~is_tf]
d1b = deg_rates[~is_cts & is_tf]
# Second comparison: CTS, TF vs. non-TF
d2a = deg_rates[is_cts & ~is_tf]
d2b = deg_rates[is_cts & is_tf]

p1 = stats.mannwhitneyu(d1a, d1b).pvalue
p2 = stats.mannwhitneyu(d2a, d2b).pvalue


def _tf_boxtop(cts_flag, tf_flag):
    # get_boxtop value for one (CTS, TF) group of the stability percentiles
    return get_boxtop(rate_df2, col1='CTS', val1=cts_flag, col2='TF', val2=tf_flag, val_col='stab_percentile')


# Larger get_boxtop value of the two groups in each comparison; used to place the stars
h1 = max(_tf_boxtop(False, False), _tf_boxtop(False, True))
h2 = max(_tf_boxtop(True, False), _tf_boxtop(True, True))

add_stars(0, 2, starty=h1+2, height=2.5, p=p1, ax=ax)
add_stars(1, 3, starty=h2+10, height=2.5, p=p2, ax=ax)
ax.set_ylim(-2, 100)

plt.savefig(f"{os.path.join(outdir, 'stab_cts_TFs')}.{out_fmt}", dpi=out_dpi)

In [None]:
# If we want to plot RBP and mRBP separately with Seaborn swarmplot, need to duplicate the mRBP rows
# copy the mRBP rows out
mRBP_df = rate_df2.query('mRBP')[['deg_rate', 'stab_percentile', 'mRBP']].copy()
rate_df3 = rate_df2[['deg_rate', 'stab_percentile', 'RBP']].copy()
# Stacking leaves NaN in the flag column each frame lacks; fill those with False
rate_df3 = pd.concat([mRBP_df, rate_df3]).fillna(False)
# Make the flag columns explicitly boolean instead of relying on fillna to
# downcast the object columns produced by the concat
rate_df3 = rate_df3.astype({'mRBP': bool, 'RBP': bool})

# NOTE(review): 'RBP' is built from rbps_only (RBPs that are not mRBPs) in the
# data-loading cell, so each mRBP gene is present twice here: once with
# mRBP=True and once with both flags False, i.e. also in 'other'.
# Confirm that this double counting is intended.

# https://stackoverflow.com/questions/67095325/how-to-transform-multiple-boolean-columns-to-one-column-with-column-headers-and
# Name of the flag that is set on each row, or 'other' when neither is set.
# axis is passed by keyword: pandas 2.x no longer accepts it positionally in any().
flag_cols = ['mRBP', 'RBP']
has_flag = rate_df3[flag_cols].any(axis=1)
rate_df3['category'] = rate_df3[flag_cols].idxmax(axis=1).where(has_flag).fillna('other')
rate_df3['CTS'] = rate_df3.index.isin(rate_df2.query('CTS').index)
# Check that each row belongs to only one category
assert len(rate_df3.query('RBP')) + len(rate_df3.query('mRBP')) + len(rate_df3.query('~RBP & ~mRBP')) == len(rate_df3)

In [None]:
#Plot the stability percentile of RBPs and mRBPs for specific cell types
from plotting_fxns import get_boxtop, add_stars, get_box_coords

fig = plt.figure(figsize=(dfig*1.5, dfig*1.5), constrained_layout=True)
ax = PrettyBox(data=rate_df3, x='category', y='stab_percentile', order=['other', 'RBP', 'mRBP'],
               hue='CTS', fliersize=0, width=0.4)
ax.set_xlabel('')
#move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)
handles, labels = ax.get_legend_handles_labels()

#prevent legend box from turning black.
legend_frame = ax.legend().get_frame()
legend_frame.set_fc((1.0,1.0,1.0,0.8))
# The legend goes to the right of the axes: placed above, its top gets
# clipped off, even with constrained_layout
ax.legend(handles, ['ubiquitous', 'cell type specific'], bbox_to_anchor=(1, 0.5), loc='center left', title='RNA class')


def _rbp_deg_rates(expr):
    # Degradation rates of the rate_df3 rows matching a query expression
    return rate_df3.query(expr)['deg_rate']


def _rbp_pair_top(cts_flag, flag_col):
    # Larger get_boxtop value of the flag_col False/True groups at one CTS status
    return max(
        get_boxtop(rate_df3, col1='CTS', val1=cts_flag, col2=flag_col, val2=False, val_col='stab_percentile'),
        get_boxtop(rate_df3, col1='CTS', val1=cts_flag, col2=flag_col, val2=True, val_col='stab_percentile'),
    )


# Note that for the comparison between other and mRBPs -- the other group is also depleted of RBPs in the graph
# but for the stats, we will compare all non-mRBPs to mRBPs

# First comparison: non-CTS, RBP vs. non-RBP
d1a, d1b = _rbp_deg_rates('~CTS & ~RBP'), _rbp_deg_rates('~CTS & RBP')
# Second comparison: non-CTS, mRBP vs. non-mRBP
d2a, d2b = _rbp_deg_rates('~CTS & ~mRBP'), _rbp_deg_rates('~CTS & mRBP')
# Third comparison: CTS, RBP vs. non-RBP
d3a, d3b = _rbp_deg_rates('CTS & ~RBP'), _rbp_deg_rates('CTS & RBP')
# Fourth comparison: CTS, mRBP vs. non-mRBP
d4a, d4b = _rbp_deg_rates('CTS & ~mRBP'), _rbp_deg_rates('CTS & mRBP')

p1 = stats.mannwhitneyu(d1a, d1b).pvalue
p2 = stats.mannwhitneyu(d2a, d2b).pvalue
p3 = stats.mannwhitneyu(d3a, d3b).pvalue
p4 = stats.mannwhitneyu(d4a, d4b).pvalue

h1 = _rbp_pair_top(False, 'RBP')
h2 = _rbp_pair_top(False, 'mRBP')
h3 = _rbp_pair_top(True, 'RBP')
h4 = _rbp_pair_top(True, 'mRBP')

# (box positions, start height, p-value) for each comparison, in drawing order
star_specs = (
    ((0, 2), h1+2, p1),
    ((0, 4), h2+10, p2),
    ((1, 3), h3+18, p3),
    ((1, 5), h4+26, p4),
)
for (box_a, box_b), star_y, star_p in star_specs:
    add_stars(box_a, box_b, starty=star_y, height=2.5, p=star_p, ax=ax)

ax.set_ylim(-2, 100)

plt.savefig(f"{os.path.join(outdir, 'stab_cts_RBPs2')}.{out_fmt}", dpi=out_dpi)

In [None]:
# Number of rate_df3 rows that are both CTS and mRBP
cts_mrbp_rows = rate_df3.query('CTS & mRBP')
len(cts_mrbp_rows)


In [None]:
#Plot the stability percentile of RBPs (RBPs that are not mRBPs) for specific cell types
from plotting_fxns import get_boxtop, add_stars, get_box_coords

fig = plt.figure(figsize=(dfig*0.75, dfig*1.5), constrained_layout=True)
ax = PrettyBox(data=rate_df2, x='RBP', y='stab_percentile',  hue='CTS', fliersize = 0, width = 0.4)
ax.set_xlabel('')
ax.set_xticklabels(['non-RBP', 'RBP'])
#move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)
handles, labels = ax.get_legend_handles_labels()
#prevent legend box from turning black.
ax.legend().get_frame().set_fc((1.0,1.0,1.0,0.8))
ax.legend(handles, ['ubiquitous', 'cell type specific'], bbox_to_anchor=(0.5, 1.15), loc='lower center', title='RNA class')

# First comparison: non-CTS, RBP vs. non-RBP
# Second comparison: CTS, RBP vs. non-RBP
d1a = rate_df2.query('~CTS & ~RBP')['deg_rate']
d1b = rate_df2.query('~CTS & RBP')['deg_rate']
d2a = rate_df2.query('CTS & ~RBP')['deg_rate']
d2b = rate_df2.query('CTS & RBP')['deg_rate']

_, p1 = stats.mannwhitneyu(d1a, d1b)
_, p2 = stats.mannwhitneyu(d2a, d2b)

# Larger get_boxtop value of the two groups in each comparison; used to place the stars
h1 = max(get_boxtop(rate_df2, col1='CTS', val1=False, col2='RBP', val2=False, val_col='stab_percentile'),
         get_boxtop(rate_df2, col1='CTS', val1=False, col2='RBP', val2=True, val_col='stab_percentile'))

h2 = max(get_boxtop(rate_df2, col1='CTS', val1=True, col2='RBP', val2=False, val_col='stab_percentile'),
         get_boxtop(rate_df2, col1='CTS', val1=True, col2='RBP', val2=True, val_col='stab_percentile'))

add_stars(0, 2, starty=h1+2, height=2.5, p=p1, ax=ax)
# The CTS comparison must be annotated with its own p-value (p2); it previously reused p1
add_stars(1, 3, starty=h2+10, height=2.5, p=p2, ax=ax)

plt.savefig('%s.%s' % (os.path.join(outdir, 'stab_cts_RBPs'), out_fmt), dpi = out_dpi)

In [None]:
# Median stability percentile for all genes and for the CTS / TF subsets
stab_pct = rate_df2['stab_percentile']
is_cts = rate_df2['CTS']
is_tf = rate_df2['TF']

median_report = [
    #Overall percentile median = 50, as expected
    ('stability of all genes %s', stab_pct),
    ('stability of CTS genes %s', stab_pct[is_cts]),
    ('stability of non-CTS genes %s', stab_pct[~is_cts]),
    ('stability of TF genes %s', stab_pct[is_tf]),
    ('stability of non-CTS TF genes %s', stab_pct[is_tf & ~is_cts]),
    ('stability of CTS & TF genes %s', stab_pct[is_tf & is_cts]),
]
for template, subset in median_report:
    print(template % subset.median())

In [None]:
# Find CTS genes which are in the top and bottom 10% of most stable RNAs for the CTS genes
# Previously I tried the top and bottom 10% of stability overall, but this found only 26 genes in the bottom 10% of genes
# Use as bg set, all the genes with stability measurements
bg_genes = rate_df2.index

geneset_outdir = os.path.join(outdir, 'genesets')
os.makedirs(geneset_outdir, exist_ok=True)

# CTS genes in ascending order of stability percentile
sorted_cts = rate_df2.query('CTS').sort_values(by='stab_percentile')
num_genes = int(round(len(sorted_cts)/10))
# head() holds the lowest percentiles and tail() the highest. The variable
# names were previously swapped relative to their contents; the file each
# set is written to is unchanged (lowest -> unstable, highest -> stable).
unstab_genes10 = sorted_cts.head(n=num_genes).index
stab_genes10 = sorted_cts.tail(n=num_genes).index

# Write the output genelists (one ID per line, no header or index column)
pd.DataFrame(unstab_genes10).to_csv(os.path.join(geneset_outdir, 'CTS_10unstable_genes.csv'), header=False, index=False)
pd.DataFrame(stab_genes10).to_csv(os.path.join(geneset_outdir, 'CTS_10stable_genes.csv'), header=False, index=False)
pd.DataFrame(bg_genes).to_csv(os.path.join(geneset_outdir, 'bg_genes.csv'), header=False, index=False)

In [None]:
# Combine the 'enrich' arrays returned by the TF and RBP heatmap calls into
# one table indexed by each result's 'order'
tf_enrich_cols = ['-log10_enrich_tf']
rbp_enrich_cols = ['-log10_enrich_rbp', '-log10_enrich_mrbp']
enrich_tf = pd.DataFrame(res_tf['enrich'], index=res_tf['order'], columns=tf_enrich_cols)
enrich_rbp = pd.DataFrame(res_rbp['enrich'], index=res_rbp['order'], columns=rbp_enrich_cols)
enrich_df = pd.concat([enrich_tf, enrich_rbp], axis=1)

# Print -log10(0.05) for comparison with the table values
neg_log10_alpha = -math.log(0.05, 10)
print(f'-log10 0.05 {neg_log10_alpha}')
enrich_df.sort_values(by='-log10_enrich_tf', ascending=False)