### Devreg xtin stab
- Look at stability of the different classes of genes with respect to H3K27me3

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy

sys.path.append('../scripts')
from stats_helpers import calc_fisher_exact
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox
from utilities import load_dataset

# Open the gffutils annotation database. `gffutils_db` is not defined in this
# notebook; presumably it is exported by one of the star imports above
# (plot_helpers / annotation_utilities) -- TODO confirm.
# NOTE(review): `db` is not referenced again in the visible cells.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for the figures and tables written by this notebook.
# exist_ok=True makes re-running the cell harmless when the directory exists.
outdir = '../Figures/Devreg'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Load the pickled methylation-site annotations. Later cells index the result
# as xtin_dict[region]['h3k27me3'] with region in 'up' / 'down' / 'gene'.
# NOTE: pickle.load can execute arbitrary code, so only point this at a file
# produced by this project.
geneset_dir = '../Figures/genesets/'
xtin_path = os.path.join(geneset_dir, 'h3k27me3_mapped.p')
with open(xtin_path, 'rb') as handle:
    xtin_dict = pickle.load(handle)

In [None]:
# Load the stability data (rates table plus the "passed" list; see
# utilities.load_dataset for how the two files are combined).
rate_df = load_dataset(
    '../Figures/summary_files/INSPEcT_rates.csv',
    '../Figures/summary_files/brain4sU_passed.csv',
)

# Gene-ID lists: the Flybase transcription-factor set and the GO-slim
# "RNA binding" / "mRNA binding" sets. Every file is read as a header-less
# table whose first column holds the IDs.
TF_file = '../Figures/genesets/all_TFs.csv'
RBPs_go_file = '../Figures/GO/ens/RNA binding.txt'
mRBPs_go_file = '../Figures/GO/ens/mRNA binding.txt'
tfs, rbps, mrbps = (
    set(pd.read_csv(path, header=None)[0].values)
    for path in (TF_file, RBPs_go_file, mRBPs_go_file)
)
# RNA-binding genes that are not in the mRNA-binding set.
rbps_only = rbps - mrbps

# One boolean membership column per gene group, matched on the gene index.
for column, members in (('TF', tfs), ('RBP_all', rbps), ('mRBP', mrbps), ('RBP', rbps_only)):
    rate_df[column] = rate_df.index.isin(members)

In [None]:
#Annotate the genes with h3k27me3 upstream, downstream, or within gene
def powerset(iterable):
    """Return an iterator over every subset of *iterable*, as tuples.

    Subsets come out grouped by size, smallest first:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    items = tuple(iterable)
    subsets_by_size = (combinations(items, size) for size in range(len(items) + 1))
    return chain.from_iterable(subsets_by_size)

# Flag genes with an H3K27me3 entry upstream ('up'), downstream ('down') or
# within the gene ('gene').
regions = ['up', 'down', 'gene']
for region in regions:
    rate_df[region] = rate_df.index.isin(xtin_dict[region]['h3k27me3'])

# All combinations of regions, from the largest combination down to the empty
# one (stable sort by size, then reversed).
combos = sorted(powerset(regions), key=len)[::-1]

#https://stackoverflow.com/questions/67095325/how-to-transform-multiple-boolean-columns-to-one-column-with-column-headers-and
# One boolean column per combination, True when the gene is flagged in ALL of
# that combination's regions. The empty combination is named '' and is True
# for every row, so it acts as the fallback.
membership = [
    rate_df[list(combo)].all(axis=1).rename(''.join(combo))
    for combo in combos
]
df2 = pd.concat(membership, axis=1)

# idxmax picks the first True column in each row, i.e. the largest
# combination the gene satisfies.
df3 = df2.idxmax(axis=1).rename('category')
df4 = pd.concat(
    [rate_df[['deg_rate', 'stab_percentile', 'TF', 'RBP_all', 'RBP', 'mRBP']], df3],
    axis=1,
)
# Genes that matched only the empty combination come out as ''; call them 'none'.
df4 = df4.replace('', 'none')

In [None]:
# Add a boolean 'CTS' column marking the genes listed in CTS_genes.csv
# (first column of a header-less file).
CTS = pd.read_csv('../Figures/CTS/CTS_genes.csv', header=None)[0]
df4['CTS'] = df4.index.isin(CTS)

In [None]:
def find_enrichment(row, df='', groupname='', total_counts=10000):
    """Test one category row for enrichment of a gene group.

    Written for ``DataFrame.apply(..., axis=1)`` over a per-category table,
    where ``row.name`` is the category label.

    row: row whose ``name`` is compared against ``df['category']``.
    df: gene-level table with a 'category' column.
    groupname: expression handed to ``df.query`` to select the group's genes
        (e.g. the name of a boolean column).
    total_counts: forwarded unchanged to ``calc_fisher_exact``.

    Returns ``(log_p, odds_r)``: the second and the first value returned by
    ``calc_fisher_exact``, in that order.
    """
    category_genes = df.query('category == @row.name').index
    group_genes = df.query(groupname).index
    odds_ratio, log_pvalue, _lower, _upper, _table = calc_fisher_exact(
        category_genes, group_genes, total_counts
    )
    return log_pvalue, odds_ratio

# Per-category summary for each gene group: member counts, category sizes,
# member fractions and the two statistics returned by find_enrichment.
cats = ['TF', 'RBP', 'mRBP']
grouped = df4.groupby('category')[cats]
cat_counts = grouped.sum()
cat_total = grouped.size()
cat_counts['total'] = cat_total
total_counts = cat_counts['total'].sum()
for group in cats:
    cat_counts['%s_frac' % group] = cat_counts[group] / cat_counts['total']
    enrich_cols = ['%s_enrich_p' % group, '%s_enrich_oddsr' % group]
    # result_type='expand' turns the returned (log_p, odds_r) tuple into two columns.
    cat_counts[enrich_cols] = cat_counts.apply(
        find_enrichment, axis=1, result_type='expand',
        df=df4, groupname=group, total_counts=total_counts,
    )

In [None]:
# Box-plot version (the heatmap shows TFs only). The axes are positioned by
# hand rather than with gridspec / constrained_layout, which caused layout
# problems here.
#https://stackoverflow.com/questions/47585775/seaborn-heatmap-with-single-column
#https://stackoverflow.com/questions/33158075/custom-annotation-seaborn-heatmap
#https://stackoverflow.com/questions/69727661/seaborn-heatmap-white-out-the-bottom-row-and-right-column
order = ['none','up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

fig = plt.figure(figsize=(dfig*2, dfig*1.35))

# Dummy "fraction" for the 'all' column of the heatmap. It is meant to lie
# above the upper limit of the colour norm so that the column is drawn in the
# colormap's 'over' colour (white); the annotation then shows the gene total.
cat_counts['total_frac'] = 1

# Axes rectangles are (left, bottom, width, height) in figure coordinates:
# the box plot on the left, the heatmap to its right.
l1, w1 = (0.28, 0.37)
l2 = l1+w1+0.07
w2 = 1-l2-0.1
ax = fig.add_axes((l1, 0.15, w1, 0.8))
heatmap_ax = fig.add_axes((l2, 0.15, w2, 0.8))

ax = PrettyBox(data=df4, x='stab_percentile', y='category', order=order, orient='h', color=color_dict['grey'], ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

# NOTE(review): the norm spans every category in cat_counts, including any
# that are not listed in `order`.
norm = plt.Normalize(cat_counts['TF_frac'].min(), cat_counts['TF_frac'].max())

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

#set really large values as white, hack to make the all column white
cmap2 = copy(plt.get_cmap('viridis'))
cmap2.set_over('white')

#square=True argument shrinks the heatmap vertically, causes misalignment with boxes
heatmap_ax = sns.heatmap(cat_counts.loc[order][['TF_frac', 'total_frac']].values,
                         annot=cat_counts.loc[order][['TF', 'total']], fmt='d',
                         ax = heatmap_ax, cmap=cmap2, norm=norm, cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.1, 'aspect':15})

# The y ticks are dropped: the rows follow `order`, the same order as the box plot.
heatmap_ax.set_yticks([])
heatmap_ax.set_xticklabels(['TFs', 'all'])
heatmap_ax.set_xlabel('gene group')
heatmap_ax.set_ylabel('')

plt.savefig('%s.%s' % (os.path.join(outdir, 'me3_box'), out_fmt), dpi = out_dpi)

In [None]:
# Version of the plot with swarmplot
# The 'none' category has to be removed here because it would contribute far
# too many dots to show.
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

fig = plt.figure(figsize=(dfig*2, dfig*1.35))

# Dummy "fraction" for the 'all' column of the heatmap. It is meant to lie
# above the upper limit of the colour norm so that the column is drawn in the
# colormap's 'over' colour (white); the annotation then shows the gene total.
cat_counts['total_frac'] = 1

# Axes rectangles are (left, bottom, width, height) in figure coordinates:
# the swarm plot on the left, the heatmap to its right.
l1, w1 = (0.28, 0.37)
l2 = l1+w1+0.07
w2 = 1-l2-0.1
ax = fig.add_axes((l1, 0.15, w1, 0.8))
heatmap_ax = fig.add_axes((l2, 0.15, w2, 0.8))

# Non-TF genes are drawn in grey, TF genes in blue (hue_order / palette pairing).
ax = sns.swarmplot(data=df4, x='stab_percentile', y='category', order=order, hue='TF', orient='h', hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1.1, ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')
ax.get_legend().remove()

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

# NOTE(review): the norm spans every category in cat_counts, including 'none'
# and any other category that is not listed in `order`.
norm = plt.Normalize(cat_counts['TF_frac'].min(), cat_counts['TF_frac'].max())

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

#set really large values as white, hack to make the all column white
cmap2 = copy(plt.get_cmap('viridis'))
cmap2.set_over('white')

#square=True argument shrinks the heatmap vertically, causes misalignment with boxes
heatmap_ax = sns.heatmap(cat_counts.loc[order][['TF_frac', 'total_frac']].values,
                         annot=cat_counts.loc[order][['TF', 'total']], fmt='d',
                         ax = heatmap_ax, cmap=cmap2, norm=norm, cbar_kws={'label':'fraction of genes', 'fraction':0.15,
                                                                   'shrink':0.4, 'pad':0.1, 'aspect':15})
# In place of the removed legend: a blue caption above the swarm plot.
ax.text(0.5, 1, 'TF RNAs', transform=ax.transAxes, color=color_dict['blue'], ha='center')

heatmap_ax.set_yticks([])
heatmap_ax.set_xticklabels(['TFs', 'all'])
heatmap_ax.set_xlabel('gene group')
heatmap_ax.set_ylabel('')

plt.savefig('%s.%s' % (os.path.join(outdir, 'me3_box2'), out_fmt), dpi = out_dpi)

In [None]:
# Version of the plot with two heat maps, both the fraction and the enrichment p-value
# Version of the plot where we try to improve the spacing
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

# Constrained_layout makes very weird decisions that don't work for me here,
# so the layout is a 1x10 gridspec: the swarm plot takes slots 1-5 and the
# remaining slots are subdivided for the heat maps and their colorbars.
fig = plt.figure(figsize=(dfig*3, dfig*1.5))
gs = fig.add_gridspec(1,10)
ax = fig.add_subplot(gs[1:6])

# ax2 / ax3 hold the two heat maps; ax4 / ax5 only act as anchors for the
# colorbars and are hidden below.
gs2 = gs[6:].subgridspec(1,15, wspace=0, hspace=0)
ax2 = fig.add_subplot(gs2[0:3])
ax3 = fig.add_subplot(gs2[3:6])
ax4 = fig.add_subplot(gs2[6])
ax5 = fig.add_subplot(gs2[11])

# Non-TF genes are drawn in grey, TF genes in blue (hue_order / palette pairing).
ax = sns.swarmplot(data=df4, x='stab_percentile', y='category', order=order, hue='TF', orient='h',
                   hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1.1, ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')
ax.get_legend().remove()

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

# Single-column images, one row per category in `order`.
im1 = ax2.imshow(cat_counts.loc[order][['TF_frac']].values, cmap='viridis')
im2 = ax3.imshow(cat_counts.loc[order][['TF_enrich_p']].values, cmap='magma')

cbar1 = ax4.figure.colorbar(im1, ax=ax4, fraction=0.5, location='right')
cbar2 = ax5.figure.colorbar(im2, ax=ax5, fraction=0.5, location='right')
for a in [ax2, ax3]:
    a.set(xticks=[],yticks=[])
    a.spines['bottom'].set_visible(False)
    a.spines['left'].set_visible(False)
ax4.set_visible(False)
ax5.set_visible(False)
cbar1.ax.set_ylabel('fraction TFs', labelpad=2)
cbar2.set_label('enrichment (-log'r'$_{10}$' ' pvalue)', labelpad=2)

# Scratch output; other cells in this notebook write to the same file name.
plt.savefig(os.path.join(outdir, 'testplot.png'))

In [None]:
# Version of the plot with two heat maps, both the fraction and the enrichment p-value
# Version of the plot where we try to improve the spacing
# NOTE(review): the code in this cell is identical to the preceding
# two-heat-map cell and writes to the same 'testplot.png'; one of the two
# cells is redundant.
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

#Now try adding a white column with the total N numbers to the right
# Constrained_layout makes very weird decisions that don't work for me here
# Layout: a 1x10 gridspec; the swarm plot takes slots 1-5.
fig = plt.figure(figsize=(dfig*3, dfig*1.5))
gs = fig.add_gridspec(1,10)
ax = fig.add_subplot(gs[1:6])

# The remaining slots are subdivided: ax2 / ax3 hold the two heat maps, while
# ax4 / ax5 only serve as anchors for the colorbars and are hidden below.
gs2 = gs[6:].subgridspec(1,15, wspace=0, hspace=0)
ax2 = fig.add_subplot(gs2[0:3])
ax3 = fig.add_subplot(gs2[3:6])
ax4 = fig.add_subplot(gs2[6])
ax5 = fig.add_subplot(gs2[11])

# Non-TF genes are drawn in grey, TF genes in blue (hue_order / palette pairing).
ax = sns.swarmplot(data=df4, x='stab_percentile', y='category', order=order, hue='TF', orient='h',
                   hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1.1, ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')
ax.get_legend().remove()

# Readable labels for the raw category names.
# NOTE(review): the 'up' key appears twice in this literal (same value both times).
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'up':'upstream (A)', 'updown':'(AC)', 
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

#prettify the category labels
# Each existing tick Text object is relabelled in place and then handed back
# to set_yticklabels.
new_labs = []
for i in ax.get_yticklabels():
    i.set_text(pretty_names[i.get_text()])
    new_labs.append(i)
ax.set_yticklabels(new_labs)

# Single-column images, one row per category in `order`.
im1 = ax2.imshow(cat_counts.loc[order][['TF_frac']].values, cmap='viridis')
im2 = ax3.imshow(cat_counts.loc[order][['TF_enrich_p']].values, cmap='magma')

cbar1 = ax4.figure.colorbar(im1, ax=ax4, fraction=0.5, location='right')
cbar2 = ax5.figure.colorbar(im2, ax=ax5, fraction=0.5, location='right')
# Strip ticks and the bottom/left spines from the two heat maps.
for a in [ax2, ax3]:
    a.set(xticks=[],yticks=[])
    a.spines['bottom'].set_visible(False)
    a.spines['left'].set_visible(False)
ax4.set_visible(False)
ax5.set_visible(False)
cbar1.ax.set_ylabel('fraction TFs', labelpad=2)
# cbar2.ax.set_ylabel('enrichment (-log10 pvalue)', labelpad=0.2)
cbar2.set_label('enrichment (-log'r'$_{10}$' ' pvalue)', labelpad=2)

plt.savefig(os.path.join(outdir, 'testplot.png'))

In [None]:
# Show the distinct H3K27me3 category labels present in df4.
df4['category'].unique()

In [None]:
# Scratch: build a copy of `order` without the excluded categories. The
# comprehension leaves `order` itself untouched.
order = ['gene', 'updowngene', 'none', 'updown']
order2 = [i for i in order if i not in ['none', 'updown']]
# Earlier attempts, kept for reference:
# list(map(lambda x: order.remove(x), order))
# sns.swarmplot(data=df4, order=['gene', 'updowngene'], y='category', x='stab_percentile')

In [None]:
# Display the filtered category list built in the previous cell.
order2

In [None]:
import warnings
from plotting_fxns import sc_swarmplot
from plotting_fxns import enrich_heatmap

order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']


# Version of plot with the TFs overlaid in swarmplot form
fig = plt.figure(figsize=(dfig*1.6, dfig*1.5))
# add_axes takes (left, bottom, width, height): hstart is the bottom edge and
# h is chosen so that the top of the axes sits at 0.95.
hstart = 0.22
h = 0.95 - hstart
ax = fig.add_axes((0.35, hstart, 0.29, h))

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    # The y axis shows the H3K27me3 categories, so it is labelled like the
    # other figures in this notebook.
    ax = sc_swarmplot(data=df4, all_genes=df4, x='stab_percentile', y='category', hue='TF', hue_name='TF mRNAs', order=order, x_lab='stability percentile',
                      y_lab='H3K27me3 regions', y_excluded=['none'], add_n_numbers=False, ax=ax)

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

# Same hstart as the swarm axes above, passed by name rather than repeated as
# a literal so the two cannot drift apart.
enrich_heatmap(data=df4, all_genes=df4, x='stab_percentile', y='category', hue='TF', order=order, y_lab1='fraction of genes',
                   y_lab2='-log'r'$_{10}$'' p-value', hstart=hstart, fig=fig, ax=None)

# NOTE(review): a later cell also saves to 'me3_box3', so this figure is
# overwritten when the notebook is run top to bottom.
plt.savefig('%s.%s' % (os.path.join(outdir, 'me3_box3'), out_fmt), dpi = out_dpi)

In [None]:
# This one is the best!!!
# Version of the plot with two heat maps, both the fraction and the enrichment p-value
# Version of the plot where we try to improve the spacing and add the numbers in for the number TFs
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

# Constrained_layout makes very weird decisions that don't work for me here,
# so the layout is a 1x10 gridspec: the swarm plot takes slots 2-5 and the
# remaining slots are subdivided for the heat maps and their colorbars.
fig = plt.figure(figsize=(dfig*2, dfig*1.5))
gs = fig.add_gridspec(1,10)
ax = fig.add_subplot(gs[2:6])

# ax2 / ax3 hold the two heat maps; ax4 / ax5 only act as anchors for the
# colorbars and are hidden below.
gs2 = gs[6:].subgridspec(1,15, wspace=0, hspace=0)
ax2 = fig.add_subplot(gs2[0:3])
ax3 = fig.add_subplot(gs2[3:6])
ax4 = fig.add_subplot(gs2[6])
ax5 = fig.add_subplot(gs2[13])

# Non-TF genes are drawn in grey, TF genes in blue (hue_order / palette pairing).
ax = sns.swarmplot(data=df4, x='stab_percentile', y='category', order=order, hue='TF', orient='h',
                   hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1.1, ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')
ax.get_legend().remove()

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

# Colour limits are computed once from the plotted rows and shared by each
# heat map and its colorbar, so the two always agree.
norm1 = plt.Normalize(cat_counts.loc[order, 'TF_frac'].min(), cat_counts.loc[order,'TF_frac'].max())
norm2 = plt.Normalize(cat_counts.loc[order, 'TF_enrich_p'].min(), cat_counts.loc[order,'TF_enrich_p'].max())

# TF fraction, annotated with the number of TFs in each category.
heatmap_ax1 = sns.heatmap(cat_counts.loc[order][['TF_frac']].values,
                         annot=cat_counts.loc[order][['TF']].values, fmt='d',
                         ax = ax2, cmap='viridis', vmin=norm1.vmin, vmax=norm1.vmax, cbar=False)

# Enrichment statistic (first value returned by find_enrichment).
heatmap_ax2 = sns.heatmap(cat_counts.loc[order][['TF_enrich_p']].values,
                         ax = ax3, cmap='magma', vmin=norm2.vmin, vmax=norm2.vmax, cbar=False)

# The heat maps are drawn with cbar=False; the colorbars are built from
# stand-alone mappables and attached to the hidden anchor axes instead.
sm1 = plt.cm.ScalarMappable(cmap='viridis', norm=norm1)
cb1 = ax.figure.colorbar(sm1, ax=ax4, fraction=0.5)

sm2 = plt.cm.ScalarMappable(cmap='magma', norm=norm2)
cb2 = ax.figure.colorbar(sm2, ax=ax5, fraction=0.5)

for a in [ax2, ax3]:
    a.set(xticks=[],yticks=[])
    a.spines['bottom'].set_visible(False)
    a.spines['left'].set_visible(False)
ax4.set_visible(False)
ax5.set_visible(False)
cb1.set_label('fraction TFs', labelpad=2)
cb2.set_label('enrichment (-log'r'$_{10}$' ' pvalue)', labelpad=2)

# Put a single centred tick back under the first heat map to label it.
ax2.set_xticks([0.5])
heatmap_ax1.set_xticklabels(['TFs'])

plt.subplots_adjust(bottom=0.15)
# NOTE(review): the preceding sc_swarmplot cell saves to the same 'me3_box3'
# name; this cell's figure is the one that remains after a full run.
plt.savefig('%s.%s' % (os.path.join(outdir, 'me3_box3'), out_fmt), dpi = out_dpi)

In [None]:
# Version of the plot with two heat maps: the number of TFs per category and
# the enrichment p-value.
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

# Constrained_layout makes very weird decisions that don't work for me here,
# so the layout is a 1x10 gridspec with the swarm plot in the left half.
fig = plt.figure(figsize=(dfig*2, dfig*1.35))
gs = fig.add_gridspec(1,10)
ax = fig.add_subplot(gs[0:5])

# Non-TF genes are drawn in grey, TF genes in blue (hue_order / palette pairing).
ax = sns.swarmplot(data=df4, x='stab_percentile', y='category', order=order, hue='TF', orient='h', hue_order=[False, True], palette=[color_dict['grey'], color_dict['blue']], s=1.1, ax = ax)
_ = ax.set_xlabel('stability percentile')
_ = ax.set_ylabel('H3K27me3 regions')
ax.get_legend().remove()

# Readable axis labels for the raw category names.
pretty_names = {'up':'upstream (A)', 'down':'downstream (C)', 'gene':'gene body (B)', 'updown':'(AC)',
               'upgene':'(AB)', 'downgene':'(BC)', 'updowngene':'(ABC)', 'none':'none'}

#prettify the category labels
ax.set_yticklabels([pretty_names[label.get_text()] for label in ax.get_yticklabels()])

# Right half: ax2 / ax3 hold the two heat maps; ax4 / ax5 only act as anchors
# for the colorbars and are hidden below.
gs2 = gs[5:].subgridspec(1,12, wspace=0, hspace=0)
ax2 = fig.add_subplot(gs2[0:3])
ax3 = fig.add_subplot(gs2[3:6])
ax4 = fig.add_subplot(gs2[6])
ax5 = fig.add_subplot(gs2[11])

# im1 shows the raw TF counts per category (the 'TF' column), not a fraction.
im1 = ax2.imshow(cat_counts.loc[order][['TF']].values, cmap='viridis')
im2 = ax3.imshow(cat_counts.loc[order][['TF_enrich_p']].values, cmap='magma')

cbar1 = ax4.figure.colorbar(im1, ax=ax4, fraction=0.5, location='right')
cbar2 = ax5.figure.colorbar(im2, ax=ax5, fraction=0.5, location='right')
for a in [ax2, ax3]:
    a.set(xticks=[],yticks=[])
    a.spines['bottom'].set_visible(False)
    a.spines['left'].set_visible(False)
ax4.set_visible(False)
ax5.set_visible(False)
# Each colorbar is labelled with the quantity its heat map actually shows.
cbar1.ax.set_ylabel('number of TFs', labelpad=0.2)
cbar2.ax.set_ylabel('enrichment (-log'r'$_{10}$' ' pvalue)', labelpad=0.2)

# Scratch output; other cells in this notebook write to the same file name.
plt.savefig(os.path.join(outdir, 'testplot.png'))

In [None]:
#Report fraction of genes which are TF in both categories -- all and covered by H3K27me3
is_me3 = df4['category'] == 'updowngene'
is_none = df4['category'] == 'none'
is_tf = df4['TF']

# Fractions fall back to NaN instead of raising ZeroDivisionError when a
# group is empty.
tf_me3 = int((is_me3 & is_tf).sum())
all_me3 = int(is_me3.sum())
me3_frac = tf_me3/all_me3 if all_me3 else float('nan')
print('%s / %s updowngenes are TFs = %s' % (tf_me3, all_me3, me3_frac))

tf = int(is_tf.sum())
all_genes = len(df4)
all_frac = tf/all_genes if all_genes else float('nan')
print('%s / %s of all genes are TFs = %s' % (tf, all_genes, all_frac))

#Report the stability of TFs and non-TFs in these categories
print('stability updowngene TFs', df4.loc[is_me3 & is_tf, 'stab_percentile'].median())
print('stability updowngene non-TFs', df4.loc[is_me3 & ~is_tf, 'stab_percentile'].median())

print('stability none TFs', df4.loc[is_none & is_tf, 'stab_percentile'].median())
print('stability none non-TFs', df4.loc[is_none & ~is_tf, 'stab_percentile'].median())

# If the me3 binding peaks are large, then does this suggest that shorter genes could be covered completely by the peaks, 
# leading to their overinclusion in this category?

In [None]:
# Save the per-gene table (rates, gene-group flags and H3K27me3 category) to
# the output directory.
df4.to_csv(os.path.join(outdir, 'gene_cat_me3.csv'))

In [None]:
# Number of genes in each H3K27me3 category, including 'none'.
df4['category'].value_counts()

In [None]:
# Layout scratch pad: tries out the nested-gridspec arrangement (main plot,
# two single-column images, two colorbars on hidden anchor axes) on random data.
# https://matplotlib.org/stable/gallery/userdemo/demo_gridspec06.html#sphx-glr-gallery-userdemo-demo-gridspec06-py
# https://matplotlib.org/stable/tutorials/intermediate/arranging_axes.html
rand = np.random.randint
df = pd.DataFrame({
    'A': rand(100, size=10),
    'B': rand(10, size=10),
    'C': rand(1000, size=10),
})

# Outer grid: left half for the bar plot, right half subdivided into 12 slots.
fig = plt.figure(figsize=(2, 2))
gs = fig.add_gridspec(1, 10)
ax = fig.add_subplot(gs[0:5])
gs2 = gs[5:].subgridspec(1, 12, wspace=0, hspace=0)
ax2, ax3, ax4, ax5 = (
    fig.add_subplot(gs2[slot]) for slot in (slice(0, 3), slice(3, 6), 6, 11)
)

# Main bar plot plus one single-column image per remaining data column.
ax.bar(df.index, df['A'])
im1 = ax2.imshow(df[['B']].values, cmap='viridis')
im2 = ax3.imshow(df[['C']].values, cmap='magma')

# Colorbars are attached to ax4 / ax5, which are then hidden.
cbar1 = ax4.figure.colorbar(im1, ax=ax4, fraction=0.5, location='right')
cbar2 = ax5.figure.colorbar(im2, ax=ax5, fraction=0.5, location='right')

for panel in (ax2, ax3):
    panel.set(xticks=[], yticks=[])
    for side in ('bottom', 'left'):
        panel.spines[side].set_visible(False)

for anchor, cbar in ((ax4, cbar1), (ax5, cbar2)):
    anchor.set_visible(False)
for cbar in (cbar1, cbar2):
    cbar.ax.set_ylabel('enrichment', labelpad=0.2)