##### Examples: developmental TFs
- Look up specific TFs involved in development, for example the temporal TFs from embryonic neuroblasts (NBs).
- Also look up the pair-rule and gap genes.
- These genes were manually collected from reviews and/or the Interactive Fly and then looked up in FlyBase 632.

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for the tables and figures written by this notebook (created if missing)
outdir = '../Figures/Examples'
os.makedirs(outdir, exist_ok=True)

# Load the stability data from the INSPEcT rates table and the passed-genes table
rates_csv = '../Figures/summary_files/INSPEcT_rates.csv'
passed_csv = '../Figures/summary_files/brain4sU_passed.csv'
rate_df = load_dataset(rates_csv, passed_csv)

# Half-life rounded to the nearest whole number and stored as int (used in the exported tables)
rounded_halflife = rate_df['halflife'].round()
rate_df['round_halflife'] = rounded_halflife.astype(int)

# Here you can see genes that were eliminated due to expression filtering or INSPEcT filtering
# NOTE(review): results_dir is not assigned in the cells shown here; presumably it is
# provided by `from plot_helpers import *` -- confirm.
infile = os.path.join(results_dir, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
df = pd.read_csv(infile)

In [None]:
# Read the curated gene lists (single-column CSVs without a header) from outdir.
# Each list ends up as an array of the values in the first column.
# NOTE(review): the file names carry a "628" suffix while the intro mentions FlyBase 632 -- confirm which applies.
pattern_genelist, nb_genelist, segpol_genelist = (
    pd.read_csv(os.path.join(outdir, fname), header=None)[0].values
    for fname in (
        'patterning_genes_628.csv',
        'neuraldev_genes_628.csv',
        'segpol_genes_628.csv',
    )
)

In [None]:
def sort_and_write(df, genelist, outname):
    """Write the name and rounded half-life of the listed genes to a CSV.

    Rows of ``df`` whose index value is in ``genelist`` are selected, sorted
    case-insensitively by ``gene_name``, and the ``gene_name`` and
    ``round_halflife`` columns are written (with the index) to
    ``<outdir>/<outname>.csv``. ``outdir`` is read from the notebook globals.
    The input frame is not modified and nothing is returned.
    """
    in_list = df.index.isin(genelist)
    selected = df.loc[in_list].sort_values(
        by='gene_name',
        key=lambda names: names.str.lower(),
    )
    out_path = os.path.join(outdir, '%s.csv' % outname)
    selected[['gene_name', 'round_halflife']].to_csv(out_path)

# Export one table per curated list: neuroblast factors, then patterning factors
sort_and_write(df=rate_df, genelist=nb_genelist, outname='nb_stab')
sort_and_write(df=rate_df, genelist=pattern_genelist, outname='pattern_stab')

In [None]:
# Flag membership of each gene (by index) in the curated lists, directly on rate_df
for flag, members in (
    ('nb_factor', nb_genelist),
    ('pattern_factor', pattern_genelist),
    ('segpol_factor', segpol_genelist),
):
    rate_df[flag] = rate_df.index.isin(members)

# Use the Flybase geneset for transcription factors (single-column CSV without a header)
TF_file = '../Figures/genesets/all_TFs.csv'
tf_table = pd.read_csv(TF_file, header=None)
tfs = set(tf_table[0].values)
rate_df['TF'] = rate_df.index.isin(tfs)

In [None]:
# Get RNAs which are regulated temporally in neuroblasts or in pattern formation.
# One sub-frame per boolean flag column, each tagged with the flag name in 'factor';
# a gene appears once per group it belongs to, so index values can repeat in df2.
factor_columns = ['nb_factor', 'pattern_factor', 'segpol_factor']
dfs = [rate_df.query(col).assign(factor=col) for col in factor_columns]
df2 = pd.concat(dfs)

In [None]:
# Drop segment polarity genes before plotting only the neural fate / patterning groups.
# The mask is per gene: every row whose gene is flagged segpol_factor is removed,
# including its rows in the nb_factor / pattern_factor groups.
# NOTE(review): the original comment said "only seg_pol factor" -- confirm this is the intended filter.
is_segpol = df2['segpol_factor']
df3 = df2.loc[~is_segpol].copy()

In [None]:
# Swarm plot of patterning and neural fate genes, with segment polarity excluded
fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
ax = fig.add_subplot(111)

# Pin the category and hue order explicitly. The tick labels and legend labels below are
# hard-coded, so they must not depend on the order in which values happen to appear in df3.
factor_order = ['nb_factor', 'pattern_factor']
tf_order = [False, True]

# It doesn't really make sense to plot the background distribution because of the percentile transformation
ax = sns.violinplot(data=df3, x='stab_percentile', y='factor', order=factor_order, ax=ax,
                    palette=[color_dict['purple'], color_dict['blue']], inner=None)
ax = sns.swarmplot(data=df3, x='stab_percentile', y='factor', order=factor_order,
                   hue='TF', hue_order=tf_order, palette=['black', 'white'], ax=ax, s=2)
ax.set_xlabel('stability percentile')
ax.set_ylabel('RNA regulated in:')
# Labels follow factor_order
ax.set_yticklabels(['neural fate', 'patterning'])

# Rebuild the legend with readable labels (following tf_order) and a grey frame
current_handles, current_labels = ax.get_legend_handles_labels()
ax.legend(current_handles, ['non-TF', 'TF'], bbox_to_anchor=(0.5, 1.0), loc=8, ncol=2, facecolor=color_dict['grey'])
plt.savefig('%s.%s' % (os.path.join(outdir, 'development_swarm1'), out_fmt), dpi=out_dpi)

In [None]:
# Swarm plot of neural fate, patterning and segment polarity genes (all groups in df2)
# https://stackoverflow.com/questions/49612037/combine-overlay-two-factorplots-in-matplotlib
fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
ax = fig.add_subplot(111)

# Pin the category and hue order explicitly. The tick labels and legend labels below are
# hard-coded, so they must not depend on the order in which values happen to appear in df2.
factor_order = ['nb_factor', 'pattern_factor', 'segpol_factor']
tf_order = [False, True]

# It doesn't really make sense to plot the background distribution because of the percentile transformation
# NOTE(review): the violin palette lists two colours for three groups, so seaborn presumably
# reuses the first colour for the third group -- confirm this is intended.
ax = sns.violinplot(data=df2, x='stab_percentile', y='factor', order=factor_order, ax=ax,
                    palette=[color_dict['purple'], color_dict['blue']], inner=None)
ax = sns.swarmplot(data=df2, x='stab_percentile', y='factor', order=factor_order,
                   hue='TF', hue_order=tf_order, palette=['black', 'white'], ax=ax, s=2)
ax.set_xlabel('stability percentile')
ax.set_ylabel('RNA regulated in:')
# Labels follow factor_order
ax.set_yticklabels(['neural fate', 'patterning', 'segment polarity'])

# The legend is built in a single ax.legend() call with facecolor set: calling ax.legend()
# a second time just to recolour the frame also resets the size of the legend dots.
current_handles, current_labels = ax.get_legend_handles_labels()
ax.legend(current_handles, ['non-TF', 'TF'], bbox_to_anchor=(0.5, 1.0), loc=8, ncol=2, facecolor=color_dict['grey'])
plt.savefig('%s.%s' % (os.path.join(outdir, 'development_swarm2'), out_fmt), dpi=out_dpi)

In [None]:
# What is the overlap of the neural_fate group and the patterning group?
# Intersect the index values of the rows flagged nb_factor with those flagged pattern_factor.
nb_index = df2.query('nb_factor').index
pattern_index = df2.query('pattern_factor').index
reused_genes = nb_index.intersection(pattern_index)

In [None]:
# Display every df2 row belonging to a gene found in both groups
df2.loc[reused_genes, :]

Plot the stability of the NB and pattern factors as swarm plots, separated by single-cell cell type.
This doesn't really make sense, though, because many of the factors are missing from the cell-type assignments.

In [None]:
# Cell-type assignments, indexed by the first CSV column
cts_types = '../Figures/CTS/cts_celltypes.csv'
cts_df = pd.read_csv(cts_types, index_col=0)

# Left-join the cell types onto the rate columns via 'gene', then drop the genes
# that haven't been mapped to a cell type (missing 'celltype' after the join)
rate_subset = rate_df[['deg_rate', 'stab_percentile']]
merged = rate_subset.merge(cts_df, left_on='gene', right_index=True, how='left')
big_df = merged.dropna(subset=['celltype'])

# Flag membership in the curated neuroblast / patterning lists
big_df['nb_factor'] = big_df.index.isin(nb_genelist)
big_df['pattern_factor'] = big_df.index.isin(pattern_genelist)

# Make swarmplots of the NB and pattern factors, one figure per factor,
# with the flagged genes overlaid on each cell type
import warnings
from plotting_fxns import sc_swarmplot
from plotting_fxns import enrich_heatmap  # imported but not called in this cell

# Cell types ordered by their median stability percentile (ascending)
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

# Vertical placement of the axes within each figure
hstart = 0.22
h = 0.95 - hstart

# (hue column, legend title, output file stem) for each figure
swarm_panels = (
    ('nb_factor', 'NB factor', 'nbfactor_swarm'),
    ('pattern_factor', 'pattern factor', 'patternfactor_swarm'),
)

for hue_col, hue_label, fig_name in swarm_panels:
    fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
    ax = fig.add_axes((0.35, hstart, 0.29, h))

    # Ignore the seaborn swarm overplotting warning here:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        ax = sc_swarmplot(data=big_df, all_genes=rate_df, x='stab_percentile', y='celltype',
                          hue=hue_col, hue_name=hue_label, order=order,
                          x_lab='stability percentile', y_lab='cell type (num genes)',
                          add_n_numbers=False, ax=ax)

    plt.savefig('%s.%s' % (os.path.join(outdir, fig_name), out_fmt), dpi=out_dpi)