### Devreg xtin stab
- Compare mRNA stability between classes of genes defined by where H3K27me3 regions map relative to the gene (upstream, gene body, downstream)

In [None]:
# Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy
import warnings

sys.path.append('../scripts')
from stats_helpers import calc_fisher_exact
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox, sc_swarmplot, enrich_heatmap, enrich_table
from utilities import load_dataset

# Open the gffutils annotation database.
# NOTE(review): `gffutils_db` is not defined in this cell; presumably it is
# provided by one of the star imports above -- confirm.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Directory that receives the figure and table written by this notebook;
# created if missing (exist_ok=True makes re-running the cell safe).
outdir = '../Figures/Devreg'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Load the pickled methylation-site mapping. It is indexed further down as
# xtin_dict[<region>]['h3k27me3'] with region in 'up', 'down', 'gene'.
geneset_dir = '../Figures/genesets/'
me3_pickle_path = os.path.join(geneset_dir, 'h3k27me3_mapped.p')
with open(me3_pickle_path, 'rb') as handle:
    xtin_dict = pickle.load(handle)

In [None]:
def _read_id_set(path):
    """Return the first column of the headerless CSV at *path* as a set."""
    return set(pd.read_csv(path, header=None)[0].values)


# Load stability data
rate_df = load_dataset(
    '../Figures/summary_files/INSPEcT_rates.csv',
    '../Figures/summary_files/brain4sU_passed.csv',
)

# Use the Flybase geneset for transcription factor
TF_file = '../Figures/genesets/all_TFs.csv'
tfs = _read_id_set(TF_file)

# Import the GO slim based RBP and mRBP IDs
RBPs_go_file = '../Figures/GO/ens/RNA binding.txt'
mRBPs_go_file = '../Figures/GO/ens/mRNA binding.txt'
rbps = _read_id_set(RBPs_go_file)
mrbps = _read_id_set(mRBPs_go_file)
# RNA-binding IDs that are not also in the mRNA-binding list
rbps_only = rbps - mrbps

# One boolean membership column per gene set, keyed on the rate_df index
gene_set_flags = (
    ('TF', tfs),
    ('RBP_all', rbps),
    ('mRBP', mrbps),
    ('RBP', rbps_only),
)
for flag_name, members in gene_set_flags:
    rate_df[flag_name] = rate_df.index.isin(members)

In [None]:
# Annotate the genes with h3k27me3 upstream, downstream, or within gene
def powerset(iterable):
    """Return an iterator over every subset of *iterable* as tuples.

    Subsets are yielded by increasing size, starting with the empty tuple:
    powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)
    """
    # Materialise first so one-shot iterators can be traversed once per size
    items = tuple(iterable)
    subsets_by_size = (
        combinations(items, size) for size in range(len(items) + 1)
    )
    return chain.from_iterable(subsets_by_size)

# Flag each gene by whether it appears in the H3K27me3 set for each region
l = ['up', 'down', 'gene']
for region in l:
    rate_df[region] = rate_df.index.isin(xtin_dict[region]['h3k27me3'])

# Get all combinations and sort from max elements to least
combos = sorted(powerset(l), key=len)[::-1]

# https://stackoverflow.com/questions/67095325/how-to-transform-multiple-boolean-columns-to-one-column-with-column-headers-and
# One boolean column per combination, named by joining its region labels
# (the empty combination is named ''); true where a gene has every region in it.
s = [
    rate_df[list(combo)].all(axis=1).rename(''.join(combo))
    for combo in combos
]
df2 = pd.concat(s, axis=1)

# idxmax keeps the first column holding the row maximum; with the columns
# ordered largest combination first, that is the most complete match.
df3 = df2.idxmax(axis=1).rename('category')

rate_columns = ['deg_rate', 'stab_percentile', 'TF', 'RBP_all', 'RBP', 'mRBP']
df4 = pd.concat([rate_df[rate_columns], df3], axis=1)
# Genes whose category is the empty name are relabelled 'none'
df4.replace('', 'none', inplace=True)

In [None]:
# Count the genes in the rate dataset whose category is 'updowngene'.
# (Previously an f-string with no placeholders that was then %-formatted;
# the printed text is unchanged.)
n_updowngene = len(df4.query('category == "updowngene"'))
print(f'number of genes in rate dataset that are updowngene {n_updowngene}')

In [None]:
# Category order handed to the plotting helper
order = ['up', 'gene', 'down', 'upgene', 'downgene', 'updowngene']

# Version of plot with the TFs overlaid in swarmplot form
fig = plt.figure(figsize=(dfig*2.2, dfig*2.5))
hstart = 0.14
h = 0.95 - hstart
ax = fig.add_axes((0.33, hstart, 0.29, h))

# Ignore the seaborn swarm overplotting warning here:
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    ax, res, pvals = sc_swarmplot(
        data=df4,
        all_genes=df4,
        x='stab_percentile',
        y='category',
        hue='TF',
        hue_name='TF mRNAs',
        order=order,
        x_lab='stability percentile',
        y_lab='H3K27me3 regions (region label, num genes)',
        y_excluded=['none', 'updown'],
        palette=[color_dict['grey'], 'navy'],
        hm_lstart=0.67,
        ax=ax,
        fig=fig,
        cbar_lab_sp=4,
    )

# Display names for the categories. The 'up' key used to be listed twice in
# this literal; the duplicate (identical value) has been dropped.
pretty_names = {
    'up': 'upstream (A)',
    'down': 'downstream (C)',
    'gene': 'gene body (B)',
    'updown': '(AC)',
    'upgene': '(AB)',
    'downgene': '(BC)',
    'updowngene': '(ABC)',
    'none': 'none',
}

# prettify the category labels
# Tick text is parsed as '<category> (<n genes>)' -- presumably the format
# written by sc_swarmplot; confirm against that helper.
new_labs = []
for tick in ax.get_yticklabels():
    parts = tick.get_text().split(' ')
    label = parts[0]
    ngenes = parts[1].strip('()')
    # Reopen the closing parenthesis of the pretty name to append the count
    new_label = f'{pretty_names[label].rstrip(")")}, {ngenes})'
    new_labs.append(new_label)
ax.set_yticklabels(new_labs)

fig_base = os.path.join(outdir, 'me3_swarm')
plt.savefig(f'{fig_base}.{out_fmt}', dpi=out_dpi)

In [None]:
# Report fraction of genes which are TF in both categories -- all and covered by H3K27me3
# Boolean masks shared by every report below
in_me3 = df4['category'] == 'updowngene'
in_none = df4['category'] == 'none'
is_tf = df4['TF']

tf_me3 = int((in_me3 & is_tf).sum())
all_me3 = int(in_me3.sum())
# Report nan rather than raising ZeroDivisionError when a group is empty
me3_frac = tf_me3 / all_me3 if all_me3 else float('nan')
print('%s / %s updowngenes are TFs = %s' % (tf_me3, all_me3, me3_frac))

tf = int(is_tf.sum())
all_genes = len(df4)
all_frac = tf / all_genes if all_genes else float('nan')
print('%s / %s all genes are TFs = %s' % (tf, all_genes, all_frac))

# Report the stability of TFs and non-TFs in these categories
print('stability updowngene TFs', df4.loc[in_me3 & is_tf, 'stab_percentile'].median())
print('stability updowngene non-TFs', df4.loc[in_me3 & ~is_tf, 'stab_percentile'].median())

print('stability none TFs', df4.loc[in_none & is_tf, 'stab_percentile'].median())
# Label fixed: previously printed 'stability non non-TFs'
print('stability none non-TFs', df4.loc[in_none & ~is_tf, 'stab_percentile'].median())


In [None]:
# Add a CTS flag (gene present in the index of the Corrales cell-type table)
# and write the me3 categories plus that flag to an outfile.
cts_path = '../Figures/CTS/corrales_celltypes.csv'
cts_df = pd.read_csv(cts_path, index_col=0)

df4['CTS'] = df4.index.isin(cts_df.index)

gene_cat_path = os.path.join(outdir, 'gene_cat_me3.csv')
df4.to_csv(gene_cat_path)

In [None]:
# Number of genes per methylation category (one row per value of the
# 'category' column; shown as the cell's output)
df4['category'].value_counts()

In [None]:
# print the p-values for the RNA stability distribution differences and the enrichment with TFs
# `res` and `pvals` are the values returned by the sc_swarmplot call above.
enrich_table(res, pvals)