##### Motifs genesets
- Prepare tables to feed into Transite and to look at conservation of UTR sites

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy

sys.path.append('../scripts')
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox
from utilities import load_dataset

# Open the gffutils annotation database used for the gene lookups below.
# NOTE(review): `gffutils_db` is not defined in this notebook; presumably it is
# provided by one of the star imports above -- confirm which module exports it.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Directory that receives every gene list written by this notebook;
# create it (and any missing parents) if it is not there yet.
outdir = '../Figures/Motifs'
os.makedirs(name=outdir, exist_ok=True)

In [None]:
# Read the per-gene category table (indexed by gene) and add a boolean
# column flagging the genes whose category is 'updowngene'.
me_df = pd.read_csv(
    '../Figures/Devreg/gene_cat_me3.csv',
    index_col='gene',
)
me_df['me3_target'] = me_df['category'].eq('updowngene')

In [None]:
# Collect the IDs of all genes annotated as protein-coding in the database.
coding_genes = set()
allgenes = db.features_of_type('gene')
i = 0
for g in allgenes:
    try:
        biotype = g.attributes['gene_biotype']
    except KeyError:
        # Genes without a 'gene_biotype' attribute are ignored.
        continue
    if biotype == ['protein_coding']:
        coding_genes.add(g.id)
        i += 1
print('num coding', i)

In [None]:
# Keep only the rows of me_df whose gene ID is in the coding-gene set.
coding_df = me_df[me_df.index.isin(coding_genes)].copy()
num_coding_genes = len(coding_df)

# Foreground set: the top `percent`% of coding genes ranked by deg_rate,
# highest first (i.e. the most unstable genes).
percent = 5
ngenes = num_coding_genes*(percent/100)
by_instability = coding_df.sort_values(by='deg_rate', ascending=False)
most_unstable = by_instability.head(round(ngenes))
unstable_path = os.path.join(outdir, f'unstable_{percent}perc.csv')
most_unstable.reset_index()['gene'].to_csv(unstable_path, index=False)

# Background set: every coding gene, written in ascending deg_rate order
# (most -> least stable), so the same file can also serve as the ranked list
# for a GSEA-like analysis, where the background is expected in ascending
# signal-to-noise order with the genes of interest at the end of the list.
bg_path = os.path.join(outdir, 'bg_genes.csv')
by_stability = coding_df.sort_values(by='deg_rate')
by_stability.reset_index()['gene'].to_csv(bg_path, index=False)

In [None]:
# Save the IDs of the coding genes for which both the TF and CTS columns are true.
cts_tf_genes = coding_df.query('TF & CTS').reset_index()['gene']
cts_tf_path = os.path.join(outdir, 'CTS_TF_genes.csv')
cts_tf_genes.to_csv(cts_tf_path, index=False)