##### Examples: developmental transcription factors (TFs)
- Look up specific TFs involved in development, for example the temporal TFs of embryonic neuroblasts (NBs)
- Also look up the pair-rule and gap genes

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Gene symbols below were looked up manually in Flybase v6.32 (symbol -> FBgn ID in `genes`).
# Temporal factors of the NB7-1 lineage, fig. 1 of the Doe review 2017
embryo_nb = ['hb', 'svp', 'Kr', 'pdm2', 'nub', 'cas', 'grh']
# These are from the Syed & Doe eLife paper, 2017
larval_nb = ['Imp', 'Syp', 'br', 'chinmo', 'Eip93F', 'svp', 'lin-28']
# Additional ones from the Chris Doe review 2017
extra_embryo_nb = ['sqz', 'nab']
opc_nbs = ['hth', 'klu', 'ey', 'slp1', 'slp2', 'D', 'tll']
topc_nbs = ['Dll', 'ey', 'slp1', 'slp2', 'D']
mb_nbs = ['Imp', 'chinmo', 'Syp', 'br', 'mir-let7']
ad_nbs = ['Imp', 'chinmo', 'Kr', 'Syp']
thoracic_nbs = ['Imp', 'chinmo', 'cas', 'svp', 'Syp', 'br']
typeII_nbs = ['cas', 'D', 'Imp', 'chinmo', 'lin-28', 'EcR', 'Syp', 'br', 'Eip93F']
# Pair-rule and gap genes reported by Interactive Fly
pairrule = ['eve', 'ftz', 'h', 'opa', 'odd', 'prd', 'runt', 'slp1', 'slp2', 'Ten-m']
# The classic set of pair-rule genes doesn't include Ten-m.
# list.remove() mutates in place and returns None, so the classic set is built as a
# new list; `pairrule` itself keeps Ten-m (and so does `all_pattern_factors`).
pairrule_classic = [symbol for symbol in pairrule if symbol != 'Ten-m']
gap = ['btd', 'cnc', 'cad', 'kn', 'croc', 'ems', 'gt', 'hb', 'hkb', 'Kr', 'kni', 'oc', 'slp1', 'slp2', 'tll']

# Gene symbol -> Flybase gene ID; each symbol is listed exactly once
genes = {'hb':'FBgn0001180', 'Kr':'FBgn0001325', 'pdm2':'FBgn0004394', 'cas':'FBgn0004878', 'eve':'FBgn0000606', 'ftz':'FBgn0001077',
         'h':'FBgn0001168', 'opa':'FBgn0003002', 'odd':'FBgn0002985', 'prd':'FBgn0003145', 'runt':'FBgn0003300', 'slp2':'FBgn0004567',
         'slp1':'FBgn0003430', 'Ten-m':'FBgn0004449', 'btd':'FBgn0000233', 'cnc':'FBgn0262975', 'cad':'FBgn0000251', 'kn':'FBgn0001319',
         'croc':'FBgn0014143', 'ems':'FBgn0000576', 'gt':'FBgn0001150', 'hkb':'FBgn0261434',
         'kni':'FBgn0001320', 'oc':'FBgn0004102', 'tll':'FBgn0003720', 'Imp':'FBgn0285926', 'Syp':'FBgn0038826', 'nub':'FBgn0085424',
         'grh':'FBgn0259211', 'br':'FBgn0283451', 'chinmo':'FBgn0086758', 'Eip93F':'FBgn0264490', 'svp':'FBgn0003651', 'lin-28':'FBgn0035626',
         'sqz':'FBgn0010768', 'nab':'FBgn0259986', 'hth':'FBgn0001235', 'klu':'FBgn0013469', 'ey':'FBgn0005558', 'D':'FBgn0000411',
         'Dll':'FBgn0000157','mir-let7':'FBgn0262406', 'EcR':'FBgn0000546'}

# Unique symbols per category (several factors appear in more than one list)
all_nb_factors = set(embryo_nb + extra_embryo_nb + larval_nb + opc_nbs + topc_nbs + mb_nbs + ad_nbs + thoracic_nbs + typeII_nbs)
all_pattern_factors = set(pairrule + gap)

# Every symbol used above must have a manually curated Flybase ID
missing_ids = (all_nb_factors | all_pattern_factors) - set(genes)
assert not missing_ids, 'no Flybase ID entered for: %s' % sorted(missing_ids)

In [None]:
# Directory that receives the tables and figures written by this notebook
outdir = '../Figures/Examples'
os.makedirs(outdir, exist_ok=True)

# Stability data: load_dataset is handed the INSPEcT rates table and the brain4sU "passed" table
rates_csv = '../Figures/summary_files/INSPEcT_rates.csv'
passed_csv = '../Figures/summary_files/brain4sU_passed.csv'
rate_df = load_dataset(rates_csv, passed_csv)
# Half-life rounded to a whole number, used for the exported tables
rate_df['round_halflife'] = rate_df['halflife'].round().astype(int)

# Per-gene abundance summary; here you can see genes that were eliminated due to
# expression filtering or INSPEcT filtering.
# NOTE(review): results_dir is not defined in this notebook -- presumably it comes
# from `from plot_helpers import *`; confirm.
infile = os.path.join(results_dir, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
df = pd.read_csv(infile)


In [None]:
def sort_and_write(df, genelist, outname):
    """Write the name and rounded half-life of selected genes to a CSV file.

    Rows of `df` whose index value is in `genelist` are kept, ordered
    case-insensitively by `gene_name`, and the `gene_name` and
    `round_halflife` columns are saved as `<outname>.csv` inside the
    notebook-level `outdir`. `df` itself is left unmodified.
    """
    selected = df.loc[df.index.isin(genelist)]
    ordered = selected.sort_values(by='gene_name', key=lambda names: names.str.lower())
    out_path = os.path.join(outdir, '%s.csv' % outname)
    ordered[['gene_name', 'round_halflife']].to_csv(out_path)

In [None]:
# Translate the gene symbols into Flybase IDs (sorted so the lists are reproducible between runs)
nb_genelist = [genes[symbol] for symbol in sorted(all_nb_factors)]
pattern_genelist = [genes[symbol] for symbol in sorted(all_pattern_factors)]

# Export name + rounded half-life for each factor set
sort_and_write(rate_df, nb_genelist, 'nb_stab')
# This call previously used `pairrule_genelist`, which is never defined (NameError on a fresh kernel)
sort_and_write(rate_df, pattern_genelist, 'pattern_stab')

In [None]:
# Cell-type table, indexed by its first CSV column
cts_types = '../Figures/CTS/cts_celltypes.csv'
cts_df = pd.read_csv(cts_types, index_col=0)
# Left-join the rates onto the cell-type table; because of dropna, genes that
# haven't mapped to a celltype are removed.
big_df = (
    rate_df[['deg_rate', 'stab_percentile']]
    .merge(cts_df, left_on='gene', right_index=True, how='left')
    .dropna(subset=['celltype'])
)
# Flag the NB and pattern factors by Flybase ID
for flag, id_list in (('nb_factor', nb_genelist), ('pattern_factor', pattern_genelist)):
    big_df[flag] = big_df.index.isin(id_list)

In [None]:
# Flag the NB and pattern factors directly on rate_df (index values are matched against the Flybase ID lists)
for flag, id_list in (('nb_factor', nb_genelist), ('pattern_factor', pattern_genelist)):
    rate_df[flag] = rate_df.index.isin(id_list)

# Use the Flybase geneset for transcription factors: the first column of a header-less CSV
TF_file = '../Figures/genesets/all_TFs.csv'
tf_table = pd.read_csv(TF_file, header=None)
tfs = set(tf_table[0].values)
rate_df['TF'] = rate_df.index.isin(tfs)

In [None]:
# Sanity check: every factor ID (entered manually from Flybase 6.32) must be present
# in the `gene` column of the unfiltered dataset `df`
to_check = nb_genelist + pattern_genelist
found_ids = df.loc[df['gene'].isin(to_check), 'gene']
assert len(set(to_check)) == found_ids.nunique()

In [None]:
# Long-format table of RNAs regulated temporally in neuroblasts or in pattern formation:
# one copy of the flagged rows per category, labelled in a `factor` column
# (a gene flagged in both categories appears once per category)
dfs = [rate_df.query(flag).assign(factor=flag) for flag in ['nb_factor', 'pattern_factor']]
df2 = pd.concat(dfs)
# Every remaining gene is labelled 'none'
df3 = rate_df.loc[~rate_df.index.isin(df2.index)].assign(factor='none')
# Factors first, then the background genes
df4 = pd.concat([df2, df3])

In [None]:
# Swarm plot overlaid on a violin plot, following
# https://stackoverflow.com/questions/49612037/combine-overlay-two-factorplots-in-matplotlib
fig = plt.figure(figsize=(dfig*1.5, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
# Only the factor genes (df2) are drawn: it doesn't really make sense to plot the
# background distribution because of the percentile transformation
violin_colors = [color_dict['purple'], color_dict['blue']]
ax = sns.violinplot(data=df2, x='stab_percentile', y='factor', ax=ax, palette=violin_colors, inner=None)
# Individual genes on top, coloured by whether they are TFs
ax = sns.swarmplot(data=df2, x='stab_percentile', y='factor', palette=['black', 'white'], hue='TF', ax=ax, s=3)
ax.set(xlabel='stability percentile', ylabel='RNA regulated in:')
ax.set_yticklabels(['neural fate', 'patterning'])
# The legend is built in a single call: calling ax.legend() a second time just to
# grey out the frame also resets the size of the legend dots.
handles, _labels = ax.get_legend_handles_labels()
ax.legend(handles, ['non-TF', 'TF'], bbox_to_anchor=(0.5, 1.0), loc='lower center', ncol=2,
          facecolor=color_dict['grey'])
out_base = os.path.join(outdir, 'development_swarm')
plt.savefig('%s.%s' % (out_base, out_fmt), dpi=out_dpi)

Plot the stability of the NB and pattern factors as swarm plots, split by the cell types from the single-cell data.
This is of limited use, though, because many of the factors are not assigned to any cell type and are therefore missing from these plots.

In [None]:
# Make one swarmplot per factor set (NB factors, pattern factors), split by cell type
import warnings
from plotting_fxns import sc_swarmplot
from plotting_fxns import enrich_heatmap

# Cell types ordered by their median stability percentile
order = big_df.groupby('celltype')['stab_percentile'].median().sort_values().index

# Vertical placement of the axes inside the figure
hstart = 0.22
h = 0.95 - hstart

# (flag column in big_df, legend title, output file stem)
swarm_specs = [
    ('nb_factor', 'NB factor', 'nbfactor_swarm'),
    ('pattern_factor', 'pattern factor', 'patternfactor_swarm'),
]

for hue_col, hue_title, file_stem in swarm_specs:
    fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
    ax = fig.add_axes((0.35, hstart, 0.29, h))

    # Ignore the seaborn swarm overplotting warning here
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        ax = sc_swarmplot(data=big_df, all_genes=rate_df, x='stab_percentile', y='celltype',
                          hue=hue_col, hue_name=hue_title, order=order,
                          x_lab='stability percentile', y_lab='cell type (num genes)',
                          add_n_numbers=False, ax=ax)

    plt.savefig('%s.%s' % (os.path.join(outdir, file_stem), out_fmt), dpi=out_dpi)