#### Reg me3 vs TF
- Compare transcription factors (TFs) and cell-type-specific (CTS) genes to the other categories, in terms of both enrichment and stability
- Plot stability of:
    - TFs +/- me3
    - TFs +/- CTS
    - CTS +/- me3

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
from scipy.stats import fisher_exact

sys.path.append('../scripts')
from plot_helpers import *
from stats_helpers import calc_fisher_exact
from annotation_utilities import *
from utilities import load_dataset
from plotting_fxns import PrettyBox
import scipy.stats as stats

%load_ext autoreload
%autoreload 2

In [None]:
# Directory that the figures saved by this notebook are written to; created if it does not exist yet
outdir = '../Figures/Reg'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Load the per-gene table (indexed by gene) and flag the genes whose category is 'updowngene'
# as me3 targets. Built in a single expression so re-running the cell always starts from the file.
me_df = pd.read_csv('../Figures/Devreg/gene_cat_me3.csv', index_col='gene').assign(
    me3_target=lambda tbl: tbl['category'] == 'updowngene'
)

In [None]:
# Plot the RNA stability percentile of non-TF/TF genes (left panel) and non-CTS/CTS genes
# (right panel), each split by me3 target status, and annotate Mann-Whitney U significance.
# Constrained_layout and tight_layout() not giving good results here
# NOTE(review): the figure is still created with constrained_layout=True while the margins are
# set with plt.subplots_adjust() at the end of this cell; matplotlib treats the two as
# incompatible (one of them is dropped with a warning, which one depends on the version) — confirm
# the saved figure looks as intended on the matplotlib version in use.
from plotting_fxns import get_boxtop, add_stars, get_box_coords
fig = plt.figure(figsize=(dfig, dfig*1.2), constrained_layout=True)
gs = fig.add_gridspec(ncols=2,nrows=10, wspace=0.2)
# leave some room at the top for the legends (axes start at gridspec row 2)
ax1 = fig.add_subplot(gs[2:, 0])
ax2 = fig.add_subplot(gs[2:, 1])

# Left panel: effect of me3 for TF vs non-TF
ax1 = PrettyBox(data=me_df, x='TF', y='stab_percentile',  hue='me3_target', fliersize = 0, width = 0.4, ax=ax1)
ax1.set_xlabel('')
ax1.set_xticklabels(['non-TF', 'TF'])
# move the ylabel in to get more room
ax1.set_ylabel('RNA stability percentile', labelpad=0)
# the shared legend is drawn from ax2 below, so drop the per-axis one here
ax1.get_legend().remove()

# Right panel: effect of me3 for CTS vs. non-CTS
ax2 = PrettyBox(data=me_df, x='CTS', y='stab_percentile',  hue='me3_target', fliersize = 0, width = 0.4, ax=ax2)
ax2.set_xlabel('')
ax2.set_xticklabels(['non-CTS', 'CTS'])
# the y axis is shared visually with the left panel, so hide its label and tick labels
ax2.set_ylabel('')
handles, labels = ax2.get_legend_handles_labels()
ax2.set_yticklabels([])
# single legend for both panels, anchored in figure coordinates above the axes
ax2.legend(handles, ['other', 'me3 target'], bbox_to_anchor=(0.5, 1.015), loc='upper center', title='gene class', bbox_transform=fig.transFigure, ncol=2)

# Comparison 1 (p1): non-me3 targets, non-TF vs. TF
# Comparison 2 (p2): me3 targets,     non-TF vs. TF
# Comparison 3 (p3): non-me3 targets, non-CTS vs. CTS
# Comparison 4 (p4): me3 targets,     non-CTS vs. CTS
# NOTE(review): the tests are run on 'deg_rate' while the boxes show 'stab_percentile';
# presumably the percentile is a monotonic transform of deg_rate, so the rank test agrees — confirm.
d1a = me_df.query('~me3_target & ~TF')['deg_rate']
d1b = me_df.query('~me3_target & TF')['deg_rate']
d2a = me_df.query('me3_target & ~TF')['deg_rate']
d2b = me_df.query('me3_target & TF')['deg_rate']
d3a = me_df.query('~me3_target & ~CTS')['deg_rate']
d3b = me_df.query('~me3_target & CTS')['deg_rate']
d4a = me_df.query('me3_target & ~CTS')['deg_rate']
d4b = me_df.query('me3_target & CTS')['deg_rate']

# Sidedness is stated explicitly so the p-values do not depend on the scipy version's default
_, p1 = stats.mannwhitneyu(d1a, d1b, alternative='two-sided')
_, p2 = stats.mannwhitneyu(d2a, d2b, alternative='two-sided')
_, p3 = stats.mannwhitneyu(d3a, d3b, alternative='two-sided')
_, p4 = stats.mannwhitneyu(d4a, d4b, alternative='two-sided')

ax1.set_ylim(-2, 100)
ax2.set_ylim(-2, 100)

# Highest box top within each compared pair; used as the base height of its significance bracket
h1 = max(get_boxtop(me_df, col1='me3_target', val1=False, col2='TF', val2=False, val_col='stab_percentile'),
         get_boxtop(me_df, col1='me3_target', val1=False, col2='TF', val2=True, val_col='stab_percentile'))

h2 = max(get_boxtop(me_df, col1='me3_target', val1=True, col2='TF', val2=False, val_col='stab_percentile'),
         get_boxtop(me_df, col1='me3_target', val1=True, col2='TF', val2=True, val_col='stab_percentile'))

h3 = max(get_boxtop(me_df, col1='me3_target', val1=False, col2='CTS', val2=False, val_col='stab_percentile'),
         get_boxtop(me_df, col1='me3_target', val1=False, col2='CTS', val2=True, val_col='stab_percentile'))

h4 = max(get_boxtop(me_df, col1='me3_target', val1=True, col2='CTS', val2=False, val_col='stab_percentile'),
         get_boxtop(me_df, col1='me3_target', val1=True, col2='CTS', val2=True, val_col='stab_percentile'))

# TF panel brackets use the TF box tops (h1, h2)
add_stars(0, 2, starty=h1+2, height=2.5, p=p1, ax=ax1)
add_stars(1, 3, starty=h2+10, height=2.5, p=p2, ax=ax1)
# CTS panel brackets use the CTS box tops (h3, h4); these previously reused h1 from the TF panel
add_stars(0, 2, starty=h3+2, height=2.5, p=p3, ax=ax2)
add_stars(1, 3, starty=h4+10, height=2.5, p=p4, ax=ax2)

# Subplots adjust puts the left side/right side of axis where specified
plt.subplots_adjust(left=0.22, right=1)
plt.savefig('%s.%s' % (os.path.join(outdir, 'me3_vs_TF_CTS'), out_fmt), dpi = out_dpi)

In [None]:
# Figure widths derived from dfig; top_width is used as the figsize width of the enrichment heatmaps.
# NOTE(review): the 1/25.4 and 2/25.4 terms look like 1 mm and 2 mm converted to inches
# (presumably gaps between panels in the assembled figure) — confirm.
bottom_width = 4*dfig + 1/25.4
top_width = (bottom_width - 2/25.4)/3

# Get and plot enrichment for overlap between the different gene groups
def add_n_to_label(labels, gene_n):
    '''
    Build two-line tick label strings that carry the group size.

    labels: iterable of objects exposing get_text() (e.g. matplotlib tick labels).
    gene_n: dict mapping a label's text to the number to show for it.

    Returns a list of strings, one per label: 'text\\n(n)' when the text is a key
    of gene_n, otherwise 'text\\n' (the trailing newline keeps every label two lines tall).
    '''
    def _with_count(name):
        count_line = f'({gene_n[name]})' if name in gene_n else ''
        return f'{name}\n{count_line}'

    return [_with_count(tick.get_text()) for tick in labels]

def plot_enrichmap(df_vals, df_anns, label='p-value', vmin=0, locator=None):
    '''
    Draw an annotated enrichment heatmap on a new figure.

    df_vals: DataFrame of the values that set the cell colours; cells that are NaN are masked out.
    df_anns: DataFrame of integers written inside the cells (the number of genes in each category).
    label: text of the colorbar label.
    vmin: lower limit of the colour scale.
    locator: passed to the colorbar as its 'ticks' argument.

    Returns the Axes object returned by sns.heatmap.
    '''
    fig = plt.figure(figsize=(top_width, dfig))
    heat_axes = fig.add_subplot(111)

    # 1 where a value is missing (cell hidden), 0 elsewhere
    nan_mask = pd.isna(df_vals.values).astype(float)
    colorbar_opts = {'label':label, 'ticks':locator}
    return sns.heatmap(df_vals.astype(float), annot=df_anns, fmt='d', square=True, mask=nan_mask,
                       cbar_kws=colorbar_opts, vmin=vmin, cmap='magma', ax=heat_axes)


# Get the enrichment values for overlap with gene groups
TF = me_df.query('TF').index
me3 = me_df.query('me3_target').index
CTS = me_df.query('CTS').index
CTS_TF = me_df.query('CTS & TF').index
TF_n, me3_n, CTS_n = len(TF), len(me3), len(CTS)
gene_groups = {'TF':TF, 'me3':me3, 'CTS':CTS, 'CTS TF':CTS_TF}
# group sizes appended to the x tick labels (gene_n) and y tick labels (gene_n2)
gene_n = {'me3':me3_n, 'TF':TF_n, 'CTS':CTS_n}
gene_n2 = {'CTS':CTS_n}

# One empty rows-by-columns table per statistic, filled in pair by pair
mat = pd.DataFrame(columns=['TF', 'me3'], index=['CTS', 'TF', 'CTS TF'])
enrich_dict = {stat: mat.copy() for stat in ('odds_r', 'pvals', 'ngenes')}
for col_group in mat.columns:
    for row_group in mat.index:
        print(col_group, row_group)
        odds_r, p, lower, upper, table = calc_fisher_exact(gene_groups[row_group], gene_groups[col_group], len(me_df))
        print(odds_r, p, lower, upper)
        enrich_dict['odds_r'].loc[row_group, col_group] = odds_r
        enrich_dict['pvals'].loc[row_group, col_group] = p
        # top-left cell of the table returned by calc_fisher_exact is shown as the gene count
        enrich_dict['ngenes'].loc[row_group, col_group] = table[0][0]

# NOTE(review): the first colorbar is labelled as -log10 p-value, but the values plotted are
# whatever calc_fisher_exact returns as p, with no transform applied here — confirm that the
# helper already returns -log10(p), otherwise the label and the colours disagree.
loc = plticker.MultipleLocator(base=5.0)
heatmap_specs = [
    ('pvals', 'me3_overlap_pval', {'label': 'enrichment (-log'r'$_{10}$' ' p-value)'}),
    ('odds_r', 'me3_overlap_oddsr', {'label': 'odds ratio', 'vmin': 0, 'locator': loc}),
]
for stat, file_stem, map_kwargs in heatmap_specs:
    ax = plot_enrichmap(enrich_dict[stat], enrich_dict['ngenes'], **map_kwargs)
    new_xlabs = add_n_to_label(ax.get_xticklabels(), gene_n)
    new_ylabs = add_n_to_label(ax.get_yticklabels(), gene_n2)
    ax.set_yticklabels(new_ylabs, ma='center')
    ax.set_xticklabels(new_xlabs, ma='center')
    ax.set_title('(num genes)')
    plt.subplots_adjust(bottom=0.22, left=-0.32)
    plt.savefig('%s.%s' % (os.path.join(outdir, file_stem), out_fmt), dpi = out_dpi)

In [None]:
# Report the percentage of me3 target genes that are TFs, and that are CTS TFs
n_me3_targets = len(me_df.query('me3_target'))
frac_me3_is_TF = len(me_df.query('me3_target & TF')) / n_me3_targets
frac_me3_is_CTS_TF = len(me_df.query('me3_target & TF & CTS')) / n_me3_targets
print('% me3 is TF', frac_me3_is_TF*100)
print('% me3 is CTS TF', frac_me3_is_CTS_TF*100)

In [None]:
# Display the table of odds ratios returned by calc_fisher_exact
# (rows: CTS / TF / CTS TF, columns: TF / me3)
enrich_dict['odds_r']

Although extended H3K27me3 targets are enriched among both TFs and cell-type-specific (CTS) TFs, they are not more
strongly enriched among the CTS TFs, as shown by the overlapping confidence intervals of the odds ratios.