#### CTS stability 2
- In this version I will pull out the genes that have only a symbol and no FBgn ID, and see if they can be converted to IDs with FlyBase
- Plot stability of RNAs enriched in specific cell types.
- Use the data from L3 brain single-cell sequencing from the Cocanougher 2022 paper
- Show the fraction of TF, RBP, etc. in this dataset. Unfortunately we cannot do GO analysis on these genes since we do not know the background set

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import gffutils
import scipy.stats as stats
from collections import defaultdict

sys.path.append('../scripts')
from plot_helpers import *
from plotting_fxns import PrettyBox
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Load the stability table and flag each gene's membership in the gene sets of interest.

# Every figure/table written by this notebook goes under this directory
outdir = '../Figures/CTS'
os.makedirs(outdir, exist_ok=True)

# Stability data (load_dataset is a project helper from ../scripts/utilities)
rate_df = load_dataset('../Figures/summary_files/INSPEcT_rates.csv', '../Figures/summary_files/brain4sU_passed.csv')

# Gene-set input files:
#   TF_file        - Flybase geneset for transcription factors
#   RBPs/mRBPs     - GO slim based RNA binding / mRNA binding IDs
#   gial_file      - glial protrusion-localised genes of interest
TF_file = '../Figures/genesets/all_TFs.csv'
RBPs_go_file = '../Figures/GO/ens/RNA binding.txt'
mRBPs_go_file = '../Figures/GO/ens/mRNA binding.txt'
gial_file = '../../resources/glial_studies/glia-protrusion-localised-id-interest.txt'

# The first three files are read without a header; only their first column is used
tfs, rbps, mrbps = (
    set(pd.read_csv(path, header=None)[0].values)
    for path in (TF_file, RBPs_go_file, mRBPs_go_file)
)
# The glial table is tab-separated with a header; the IDs live in 'dmel_gene_id'
glial = set(pd.read_csv(gial_file, sep='\t')['dmel_gene_id'].values)
# RBPs that are not also annotated as mRNA binding
rbps_only = rbps - mrbps

# One boolean column per gene set: is the gene (row index) a member?
group_members = {
    'TF': tfs,
    'RBP_all': rbps,
    'mRBP': mrbps,
    'RBP': rbps_only,
    'glial': glial,
}
for group, members in group_members.items():
    rate_df[group] = rate_df.index.isin(members)

# Groups summarised downstream ('RBP_all' is intentionally not listed)
gene_groups = ['TF', 'RBP', 'mRBP', 'glial']


In [None]:
# Report how many gene IDs each gene set contains.
# Per the original note, mrbps is a complete subset of rbps; rbps_only is the
# category of rbps which are not mrbps.
for label, members in (
    ('num tfs', tfs),
    ('num rbps', rbps),
    ('num mrbps', mrbps),
    ('num rbps_only', rbps_only),
    ('num glial', glial),
):
    print(label, len(members))

In [None]:
#Load the genes enriched from single-cell sequencing in L3 brains
sc_file = '../../resources/other_studies/Corrales_Supplementary/Supplementary_spreadsheet_2_Ncells_and_gene_markers_per_cluster.xlsx'
xl = pd.ExcelFile(sc_file)
# Every sheet except the 'Ncells-cluster' one is read as a per-cluster marker table
sheets = xl.sheet_names
sheets.remove('Ncells-cluster')

#replace some names because of typos in the spreadsheet
celltype_replace = {'34 Immat': '34 Immat N'}

# Marker cutoffs: keep rows with p_val_adj < pval_co and avg_log2FC > fc_co.
# NOTE(review): the original note describes this as a permissive cutoff that
# accepts all published markers, but fc_co = 1 still drops rows with
# avg_log2FC <= 1 (and pval_co = 1 drops p_val_adj == 1) - confirm this is intended.
pval_co = 1
fc_co = 1

# cluster name -> DataFrame with the columns 'gene' (symbol) and 'FBgn_ID'
marker_dict = {}
for sheet in sheets:
    # Parse from the already-open workbook instead of re-opening the file per sheet
    df = xl.parse(sheet)
    celltype = sheet.strip()
    celltype = celltype_replace.get(celltype, celltype)
    # Rows without an FBgn_ID are kept here (no dropna), so symbol-only genes
    # remain available for later ID conversion.
    is_marker = (df['p_val_adj'] < pval_co) & (df['avg_log2FC'] > fc_co)
    genes = df.loc[is_marker, ['gene', 'FBgn_ID']]

    marker_dict[celltype] = genes

In [None]:
# Stack the marker tables of all clusters, then keep the rows whose FBgn_ID is missing
allgenes = pd.concat(list(marker_dict.values()))
no_id = allgenes[allgenes['FBgn_ID'].isnull()]

In [None]:
# Distinct gene symbols among the marker rows that have no FBgn_ID
no_id['gene'].unique()

In [None]:
# Inspect the marker table stored for a single cluster
marker_dict['0 Immat N']

In [None]:
# Build a frame from one cluster's marker table restricted to the requested column.
# NOTE(review): the marker tables are created with the columns 'gene' and
# 'FBgn_ID', so 'gene_nam' looks like a typo - confirm the intended column name.
pd.DataFrame(marker_dict['0 Immat N'], columns=['gene_nam'])

In [None]:
# List the cluster names (stripped sheet names, after typo replacement) used as keys
marker_dict.keys()

In [None]:
# Get all cell type markers and cell-type-specific TFs
# NOTE(review): gffutils_db is not defined in this notebook; presumably it is
# provided by the plot_helpers star import - confirm.
db = gffutils.FeatureDB(gffutils_db)
genes = db.features_of_type('gene')
# Count the gene features by exhausting the iterator
ngenes = sum(1 for _ in genes)

# marker_dict values are DataFrames ('gene', 'FBgn_ID'). Unioning the frames
# directly would iterate their column labels, so union the FBgn_ID values
# instead; rows without an ID are skipped.
cts = set().union(*(markers['FBgn_ID'].dropna() for markers in marker_dict.values()))
cts_tfs = cts.intersection(tfs)

#Would be interesting to represent this visually somehow?
print('num CTS RNAs', len(cts))
print('num CTS TFs', len(cts_tfs))
print('num TFs', len(tfs))
print('frac CTS_TF/TF', len(cts_tfs)/len(tfs))
print('frac TF/CTS', len(cts_tfs)/len(cts))
print('frac CTS/genome', len(cts)/ngenes)
print('frac TF/genome', len(tfs)/ngenes)

# Write CTS genes to a file (sorted so the output is reproducible between runs)
pd.Series(sorted(cts)).to_csv(os.path.join(outdir, 'CTS_genes.csv'), index=False, header=False)

#### Summarize stability percentile of cell-type-specific RNAs
##### 1) Use all the clusters from Cocanougher (rate_df, clean_names)
##### 2) Combine cell types of the same name to form general cell type (rate_df2, clean_names2)

In [None]:
# Combine cell types of the same name to form a general cell type, e.g. Immat N
def is_int(string):
    """Return ``int(string)`` if the value converts to an integer, else ``None``.

    Despite the name, the result is the converted integer (which may be ``0``),
    not a boolean, so callers should compare against ``None`` explicitly.

    Only the errors ``int()`` raises for unconvertible input are treated as
    "not an integer"; any other exception propagates instead of being hidden.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        return None

def clean_name(string):
    """Return everything after the first space of *string*.

    For a cluster name such as '0 Immat N' this drops the leading token and
    yields 'Immat N'. A string without any space yields ''.
    """
    _, _, remainder = string.partition(' ')
    return remainder

ct_names = list(marker_dict.keys())
# Keep the names that do not parse as a bare integer (is_int returns None for them)
clean_names = [i for i in ct_names if is_int(i) is None]

#Summarize by overall cell types
rate_df2 = rate_df[['stab_percentile'] + gene_groups].copy()

# general cell type name (cluster name minus its leading token) -> cluster names
celltypes = defaultdict(set)
for i in clean_names:
    name = clean_name(i)
    celltypes[name].add(i)

#combine categories further as needed
#e.g. Hemocytes will be labelled as Hemos and combined with category Hemos
ct_combine = {'Hemocytes':'Hemos'}
# Accumulate into a defaultdict so the merge works whichever of the two names is
# seen first (a plain dict either raised KeyError or overwrote the merged set).
celltypes2 = defaultdict(set)
for i in celltypes:
    celltypes2[ct_combine.get(i, i)].update(celltypes[i])
celltypes2 = dict(celltypes2)

#Assign genes to sub types to general cell type
# marker_dict values are DataFrames ('gene', 'FBgn_ID'); collect the FBgn IDs.
# Unioning with the frame itself would add its column labels rather than genes.
# NOTE(review): assumes rate_df is indexed by FBgn IDs - confirm.
marker_dict2 = defaultdict(set)
for i in celltypes2:
    for j in celltypes2[i]:
        marker_dict2[i].update(marker_dict[j]['FBgn_ID'].dropna())

# One boolean column per general cell type: is the gene one of its markers?
for ct in marker_dict2:
    rate_df2[ct] = rate_df2.index.isin(marker_dict2[ct])

ct_names2 = list(marker_dict2.keys())
clean_names2 = [i for i in ct_names2 if is_int(i) is None]

print('genes found specifically in at least one cell type', rate_df2[clean_names2].any(axis=1).value_counts())

# In order to allow for genes to overlap cell types, we need to duplicate the gene index
# gene -> list of the general cell types it marks
d = defaultdict(list)
for cat in marker_dict2:
    for gene in marker_dict2[cat]:
        d[gene].append(cat)

# Need to do this instead of melt inorder to allow genes to be a member of more than one category
# https://stackoverflow.com/questions/42869544/dictionary-of-lists-to-dataframe
# Result: one row per (gene, cell type) pair, indexed by gene
cts_df = pd.DataFrame(pd.concat({k: pd.Series(v) for k, v in d.items()}).droplevel(1), columns=['celltype'])
# Left-merge then drop unmatched rows, i.e. keep only genes that mark a cell type.
# NOTE(review): left_on='gene' presumes rate_df2 exposes 'gene' as its index name
# (or as a column) - confirm.
big_df = pd.merge(rate_df2[['stab_percentile'] + gene_groups], cts_df, left_on='gene', right_index=True, how='left').dropna(subset=['celltype'])

#https://stackoverflow.com/questions/41709239/dividing-dataframe-column-by-matching-index-in-another-dataframe
# Per cell type: number of member genes in each gene group, total genes, and fractions
cat_counts = big_df.groupby('celltype')[gene_groups].sum()
cat_total = big_df.groupby('celltype')[gene_groups].size()
cat_counts['total'] = cat_total
for i in gene_groups:
    cat_counts['%s_frac' % i] = cat_counts[i]/cat_counts['total']

In [None]:
#Version of plot with RBPs and mRBPs included
#https://stackoverflow.com/questions/47585775/seaborn-heatmap-with-single-column
#https://stackoverflow.com/questions/33158075/custom-annotation-seaborn-heatmap

fig = plt.figure(figsize=(dfig*4, dfig*4), constrained_layout=True)

# 20-column grid: box plot in the left half, heatmap in the right half
gs = fig.add_gridspec(ncols = 20, nrows = 1)
gs.update(wspace=20) # set the spacing between axes.
#0:10, and 10: did work, but 1:11 now doesn't work?
ax = fig.add_subplot(gs[:, 0:10])
heatmap_ax = fig.add_subplot(gs[:, 10:])
# Cell types ordered by their median stability percentile (ascending)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index
ax = PrettyBox(data=big_df, x='stab_percentile', y='celltype', order=order, orient='h', color=color_dict['grey'], ax = ax)
ax.set_ylabel('cell type (num genes)')
# add the gene numbers to the cell type genes
# (a dedicated local name is used so the genome-wide `ngenes` count is not overwritten)
for tick in ax.get_yticklabels():
    n_markers = cat_counts.loc[tick.get_text(), 'total']
    tick.set_text('%s (%s)' % (tick.get_text(), n_markers))
ax.set_yticklabels(ax.get_yticklabels())
_ = ax.set_xlabel('stability percentile')
# Heatmap coloured by the fraction of each gene group, annotated with the counts
heatmap_ax = sns.heatmap(cat_counts.loc[order][['TF_frac', 'RBP_frac', 'mRBP_frac']],
                         annot=cat_counts.loc[order][['TF', 'RBP', 'mRBP']],
                         ax = heatmap_ax, cmap='viridis',cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.02, 'aspect':15})
heatmap_ax.set_yticks([])
heatmap_ax.set_ylabel('')
heatmap_ax.set_xticklabels(['TFs', 'RBPs', 'mRBPs'])
plt.savefig('%s.%s' % (os.path.join(outdir, 'cts_box'), out_fmt), dpi = out_dpi)

In [None]:
#Version of plot with TFs only. Try plotting without the gridspec and constrained_layout because it is causing me problems
#https://stackoverflow.com/questions/47585775/seaborn-heatmap-with-single-column
#https://stackoverflow.com/questions/33158075/custom-annotation-seaborn-heatmap
fig = plt.figure(figsize=(dfig*1.5, dfig*1.35))
# Manually placed axes (left, bottom, width, height in figure fractions)
ax = fig.add_axes((0.35, 0.15, 0.35, 0.8))
heatmap_ax = fig.add_axes((0.7, 0.15, 0.1, 0.8))

ax = PrettyBox(data=big_df, x='stab_percentile', y='celltype', order=order, orient='h', color=color_dict['grey'], ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('cell type (num genes)')

# add the gene numbers to the cell type genes
# (a dedicated local name is used so the genome-wide `ngenes` count is not overwritten)
for tick in ax.get_yticklabels():
    n_markers = cat_counts.loc[tick.get_text(), 'total']
    tick.set_text('%s (%s)' % (tick.get_text(), n_markers))
ax.set_yticklabels(ax.get_yticklabels())

# Single-column heatmap: colour = TF fraction, annotation = TF count
heatmap_ax = sns.heatmap(cat_counts.loc[order][['TF_frac']],
                         annot=cat_counts.loc[order][['TF']],
                         ax = heatmap_ax, cmap='viridis', square=True, cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.1, 'aspect':15})
heatmap_ax.set_yticks([])
heatmap_ax.set_xticklabels(['TFs'])
heatmap_ax.set_ylabel('')

plt.savefig('%s.%s' % (os.path.join(outdir, 'cts_box2'), out_fmt), dpi = out_dpi)

In [None]:
# Version of plot with the TFs overlaid in swarmplot form
fig = plt.figure(figsize=(dfig*1.5, dfig*1.35))
# Manually placed axes (left, bottom, width, height in figure fractions)
ax = fig.add_axes((0.35, 0.15, 0.35, 0.8))
heatmap_ax = fig.add_axes((0.7, 0.15, 0.1, 0.8))

# Non-TF genes in grey, TF genes in blue
ax = sns.swarmplot(data=big_df, x='stab_percentile', y='celltype', order=order, orient='h', hue='TF', hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1, ax=ax)
# The text label replaces the automatic hue legend
ax.text(0.5, 1, 'TF RNAs', transform=ax.transAxes, color=color_dict['blue'], ha='center')
ax.get_legend().remove()

_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('cell type (num genes)')

# add the gene numbers to the cell type genes
# (a dedicated local name is used so the genome-wide `ngenes` count is not overwritten)
for tick in ax.get_yticklabels():
    n_markers = cat_counts.loc[tick.get_text(), 'total']
    tick.set_text('%s (%s)' % (tick.get_text(), n_markers))
ax.set_yticklabels(ax.get_yticklabels())

# Single-column heatmap: colour = TF fraction, annotation = TF count
heatmap_ax = sns.heatmap(cat_counts.loc[order][['TF_frac']],
                         annot=cat_counts.loc[order][['TF']],
                         ax = heatmap_ax, cmap='viridis', square=True, cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.1, 'aspect':15})
heatmap_ax.set_yticks([])
heatmap_ax.set_xticklabels(['TFs'])
heatmap_ax.set_ylabel('')

# Save once, after the figure is complete (an earlier save of the half-built
# figure to the same path was redundant and has been dropped)
plt.savefig('%s.%s' % (os.path.join(outdir, 'cts_box3'), out_fmt), dpi = out_dpi)

In [None]:
# Version of plot with the glial enriched genes overlaid in swarmplot form
fig = plt.figure(figsize=(dfig*1.5, dfig*1.35))
# Manually placed axes (left, bottom, width, height in figure fractions)
ax = fig.add_axes((0.35, 0.15, 0.35, 0.8))
heatmap_ax = fig.add_axes((0.7, 0.15, 0.1, 0.8))

# Genes outside the glial set in grey, glial-set genes in blue
ax = sns.swarmplot(data=big_df, x='stab_percentile', y='celltype', order=order, orient='h', hue='glial', hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1, ax=ax)
# The text label replaces the automatic hue legend
ax.text(0.5, 1, 'glial protrusion RNAs', transform=ax.transAxes, color=color_dict['blue'], ha='center')
ax.get_legend().remove()

_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('cell type (num genes)')

# add the gene numbers to the cell type genes
# (a dedicated local name is used so the genome-wide `ngenes` count is not overwritten)
for tick in ax.get_yticklabels():
    n_markers = cat_counts.loc[tick.get_text(), 'total']
    tick.set_text('%s (%s)' % (tick.get_text(), n_markers))
ax.set_yticklabels(ax.get_yticklabels())

# The per-cell-type glial counts and fractions live in cat_counts ('glial' is one
# of gene_groups); no separate glial_df is built anywhere in this notebook.
# NOTE(review): confirm the heatmap was meant to show glial_frac / glial count.
heatmap_ax = sns.heatmap(cat_counts.loc[order][['glial_frac']],
                         annot=cat_counts.loc[order][['glial']],
                         ax = heatmap_ax, cmap='viridis', square=True, cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.1, 'aspect':15})
heatmap_ax.set_yticks([])
heatmap_ax.set_xticklabels(['  count'])
heatmap_ax.set_ylabel('')

plt.savefig('%s.%s' % (os.path.join(outdir, 'cts_box_glia'), out_fmt), dpi = out_dpi)

In [None]:
#We cannot do a proper GO analysis on this data since it is unclear what is the background set.
#However, we could match these to the GO_slim_categories to see what they are enriched for as a rough idea.
# Cell-type marker rows below the 25th / above the 75th stability percentile
low_stab = big_df[big_df['stab_percentile'] < 25].copy()
hi_stab = big_df[big_df['stab_percentile'] > 75].copy()

# GO slim category name (file name without '.txt') -> set of gene IDs in the file
slim_dict = {}
indir = '../Figures/GO/ens/'
suffix = '.txt'
for fname in os.listdir(indir):
    # str.rstrip('.txt') strips any trailing '.', 't' or 'x' characters and so
    # mangles names such as 'transport.txt'; remove the exact suffix instead.
    category = fname[:-len(suffix)] if fname.endswith(suffix) else fname
    slim_dict[category] = set(pd.read_csv(os.path.join(indir, fname), header=None)[0].tolist())


def _gene_ids(df):
    """Return the set of gene IDs of *df*, from its 'gene' column or index level."""
    # NOTE(review): big_df may expose the gene IDs as a 'gene' column or as its
    # index, depending on how the merge that built it treated the index - confirm.
    if 'gene' in df.columns:
        return set(df['gene'].tolist())
    return set(df.index.get_level_values('gene').tolist())


# For each category: fraction of the low / high stability genes that belong to it
d = {'low':{}, 'hi':{}}
hi_genes = _gene_ids(hi_stab)
low_genes = _gene_ids(low_stab)
n_low = len(low_genes)
n_hi = len(hi_genes)
for i in slim_dict:
    d['low'][i] = len(low_genes.intersection(slim_dict[i]))/n_low
    d['hi'][i] = len(hi_genes.intersection(slim_dict[i]))/n_hi
pd.DataFrame.from_dict(d, orient='columns')

In [None]:
# Ridgeline plot: one density curve of stability percentile per cell type,
# stacked vertically in the same order as the box plots.
cats = order
num_cats = len(cats)
pal = sns.cubehelix_palette(num_cats, rot=-.25, light=.7)
fig = plt.figure(figsize=(dfig, dfig*2))
gs = fig.add_gridspec(ncols=1, nrows = num_cats)
for i, label in enumerate(cats):
    subset = big_df[big_df['celltype'] == label]
    row_color = pal[i]
    # The new subplot becomes the current axes, which kdeplot draws on
    fig.add_subplot(gs[i])
    # Filled density first, then a white outline of the same curve on top
    ax = sns.kdeplot(data=subset, x='stab_percentile', bw_adjust=.5, clip_on=False,
                     fill=True, alpha=1, linewidth=1.5, color=row_color)
    ax = sns.kdeplot(data=subset, x='stab_percentile', clip_on=False, color="w", lw=2, bw_adjust=.5)
    # Strip the frame and make the background transparent so the rows can overlap
    for side in ('left', 'bottom'):
        ax.spines[side].set_visible(False)
    ax.set_fc((0.0, 0.0, 0.0, 0.0))
    ax.set_yticks([])
    ax.set_ylabel('')
    # Only the bottom row keeps its x ticks and label
    if i == num_cats - 1:
        ax.set_xlabel('stability percentile')
    else:
        ax.set_xticks([])
        ax.set_xlabel('')
    ax.text(-0.25, .3, label, fontsize=5, fontweight="bold",
            ha="left", va="center", transform=ax.transAxes, color=row_color)

# Negative spacing makes neighbouring rows overlap
fig.subplots_adjust(hspace=-.3)
#this adds space on the left side to make room for the labels
fig.subplots_adjust(left=0.2)
plt.savefig('%s.%s' % (os.path.join(outdir, 'test_ridge'), out_fmt), dpi = out_dpi)

In [None]:
#Plot the stability percentile of TFs for specific cell types
# A gene is CTS when it is a marker of at least one general cell type
rate_df2['CTS'] = rate_df2[clean_names2].any(axis=1)
fig = plt.figure(figsize=(dfig*0.7, dfig*1.35), constrained_layout=True)
ax = PrettyBox(data=rate_df2, x='TF', y='stab_percentile',  hue='CTS', fliersize = 0, width = 0.4)
ax.set_xlabel('')
ax.set_xticklabels(['non-TF', 'TF'])
#move the ylabel in to get more room
ax.set_ylabel('RNA stability percentile', labelpad=0)
handles, labels = ax.get_legend_handles_labels()
# Build the final legend first, then style its frame. Styling a throwaway
# legend beforehand has no effect because ax.legend(...) creates a new legend.
legend = ax.legend(handles, ['distributed', 'cell type specific'], bbox_to_anchor=(0.5, 1.0), loc='lower center', title='RNA class')
#prevent legend box from turning black.
legend.get_frame().set_fc((1.0,1.0,1.0,0.8))

plt.savefig('%s.%s' % (os.path.join(outdir, 'stab_cts_TFs'), out_fmt), dpi = out_dpi)

In [None]:
# Median stability percentile for all genes and for the TF / CTS subsets.
# The overall median is expected to be 50.
stab = rate_df2['stab_percentile']
is_tf = rate_df2['TF']
is_cts = rate_df2['CTS']

median_reports = [
    ('stability of all genes %s', stab),
    ('stability of CTS genes %s', stab[is_cts]),
    ('stability of non-CTS genes %s', stab[~is_cts]),
    ('stability of TF genes %s', stab[is_tf]),
    ('stability of non-CTS TF genes %s', stab[is_tf & ~is_cts]),
    ('stability of CTS & TF genes %s', stab[is_tf & is_cts]),
]
for template, values in median_reports:
    print(template % values.median())

In [None]:
# Export the CTS genes at the two extremes (10% each) of the CTS stability ranking.
# Previously I tried the top and bottom 10% of stability overall, but this found only 26 genes in the bottom 10% of genes
# Background set: all the genes with stability measurements
bg_genes = rate_df2.index

geneset_outdir = os.path.join(outdir, 'genesets')
os.makedirs(geneset_outdir, exist_ok=True)

# Rank the CTS genes by stability percentile, ascending
cts_rates = rate_df2.query('CTS')
num_genes = int(round(len(cts_rates)/10))
sorted_cts = cts_rates.sort_values(by='stab_percentile')
# NOTE(review): the variable names look inverted relative to their contents -
# stab_genes10 holds the LOWEST percentiles and is written to the 'unstable'
# file, unstab_genes10 the HIGHEST and is written to the 'stable' file. The
# file names match the contents if a low percentile means unstable - confirm.
stab_genes10 = sorted_cts.head(n=num_genes).index
unstab_genes10 = sorted_cts.tail(n=num_genes).index

# Write the output genelists (one ID per line, no header, no index)
genelist_outputs = (
    (stab_genes10, 'CTS_10unstable_genes.csv'),
    (unstab_genes10, 'CTS_10stable_genes.csv'),
    (bg_genes, 'bg_genes.csv'),
)
for gene_index, fname in genelist_outputs:
    pd.DataFrame(gene_index).to_csv(os.path.join(geneset_outdir, fname), header=None, index=None)