##### Motifs genesets
- Prepare tables to feed into Transite and look at the conservation of UTR sites

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
from collections import defaultdict
import pickle
from itertools import chain, combinations
import gffutils
import re
from copy import copy

sys.path.append('../scripts')
from plot_helpers import *
from annotation_utilities import *
from plotting_fxns import PrettyBox
from utilities import load_dataset

# Open the gffutils annotation database used for the gene biotype lookup below.
# NOTE(review): `gffutils_db` is not defined in this notebook; presumably it is
# provided by one of the star imports above -- confirm.
db = gffutils.FeatureDB(gffutils_db)

# Load the IPython autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

In [None]:
# Directory that receives the gene-list CSVs written by the cells below;
# it is created if it does not exist yet.
outdir = '../Figures/Motifs'
os.makedirs(outdir, exist_ok = True)

In [None]:
# Read the per-gene table (indexed by the 'gene' column); later cells rely on
# its 'category', 'deg_rate', 'TF' and 'CTS' columns.
me_df = pd.read_csv('../Figures/Devreg/gene_cat_me3.csv', index_col='gene')
# A gene counts as an me3 target when its category is exactly 'updowngene'.
me_df['me3_target'] = me_df['category'].eq('updowngene')

In [None]:
# Collect the IDs of all 'gene' features annotated as protein coding.
coding_genes = set()
allgenes = db.features_of_type('gene')
i = 0
for g in allgenes:
    try:
        biotype = g.attributes['gene_biotype']
    except KeyError:
        # Gene has no gene_biotype attribute: skip it.
        continue
    # Attribute values are compared as a list, so match against a one-element list.
    if biotype == ['protein_coding']:
        coding_genes.add(g.id)
        i += 1
print('num coding', i)

In [None]:
# Keep only the rows whose gene ID is in the protein-coding set.
is_coding = me_df.index.isin(coding_genes)
coding_df = me_df.loc[is_coding].copy()
num_coding_genes = len(coding_df)

# Foreground set: the top `percent`% of coding genes by deg_rate (highest first).
percent = 5
ngenes = num_coding_genes*(percent/100)
by_instability = coding_df.sort_values(by='deg_rate', ascending=False)
most_unstable = by_instability.head(round(ngenes))
unstable_csv = os.path.join(outdir, f'unstable_{percent}perc.csv')
most_unstable.reset_index()['gene'].to_csv(unstable_csv, index=False)

# Background set: should be sorted in ascending signal-to-noise ratio (genes
# upregulated in treatment are at the end of the list). Sorting by ascending
# deg_rate also orders it from most -> least stable, so the same file can be
# used for a GSEA-like analysis.
background_csv = os.path.join(outdir, 'bg_genes.csv')
by_stability = coding_df.sort_values(by='deg_rate')
by_stability.reset_index()['gene'].to_csv(background_csv, index=False)

In [None]:
# Export the gene IDs of coding genes flagged as both TF and CTS.
cts_tf_rows = coding_df.query('TF & CTS')
cts_tf_csv = os.path.join(outdir, 'CTS_TF_genes.csv')
cts_tf_rows.reset_index()['gene'].to_csv(cts_tf_csv, index=False)

In [None]:
# Index of the coding genes flagged as both TF and CTS, kept for the 3'UTR
# length checks in later cells. Working hypothesis: their 3'UTRs may be
# unusually long.
cts_tfs = coding_df.query('TF & CTS').index

In [None]:
from pyfaidx import Fasta

# FASTA files for the three transcript regions; later cells look records up by gene ID.
# NOTE(review): these are absolute, machine-specific paths.
three_fafile = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/region_fastas/longest_threeprime.fa'
five_fafile = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/region_fastas/longest_fiveprime.fa'
cds_fafile = '/Users/mk/Desktop/Davislab_old/3.4_NMJ_4Tu_4sU/3.4e_pipeline_dev/nmj_figures/resources/region_fastas/longest_cds.fa'

# Open one indexed FASTA handle per region, in the same order as before.
three_txts = Fasta(three_fafile)
five_txts = Fasta(five_fafile)
cds_txts = Fasta(cds_fafile)

In [None]:
# Add per-gene region lengths and their log10 values to coding_df.
# Each region's length is looked up in that region's own FASTA; genes with no
# record there get NaN. (The CDS length previously tested membership in the
# 5'UTR FASTA, which mislabelled genes lacking a 5'UTR record and could fail on
# genes lacking a CDS record.)
region_fastas = {'3pUTR': three_txts, '5pUTR': five_txts, 'CDS': cds_txts}

# Lengths first, then logs, so the resulting column order is unchanged.
for region, fasta in region_fastas.items():
    coding_df[f'{region}_len'] = coding_df.index.map(
        # Bind `fasta` as a default so each lambda keeps its own handle.
        lambda gene, fasta=fasta: len(fasta[gene]) if gene in fasta else np.nan
    )
for region in region_fastas:
    coding_df[f'{region}_log'] = coding_df[f'{region}_len'].apply(np.log10)

In [None]:
# Median 3'UTR length: all coding genes vs. CTS TFs vs. all TFs.
# CTS TFs are selected with 'TF & CTS' (as in the earlier cells) instead of the
# 'CTS_TF' column, which this notebook only creates in a later cell; this keeps
# the cell runnable in top-to-bottom order.
print('len 3pUTR overall', coding_df['3pUTR_len'].median())
print('len 3pUTR CTS_TF', coding_df.query('TF & CTS')['3pUTR_len'].median())
print('len 3pUTR TF', coding_df.query('TF')['3pUTR_len'].median())

In [None]:
# Median 5'UTR length: all coding genes vs. CTS TFs vs. all TFs.
# CTS TFs are selected with 'TF & CTS' (as in the earlier cells) instead of the
# 'CTS_TF' column, which this notebook only creates in a later cell; this keeps
# the cell runnable in top-to-bottom order.
print('len 5pUTR overall', coding_df['5pUTR_len'].median())
print('len 5pUTR CTS_TF', coding_df.query('TF & CTS')['5pUTR_len'].median())
print('len 5pUTR TF', coding_df.query('TF')['5pUTR_len'].median())

In [None]:
# Median CDS length: all coding genes vs. CTS TFs vs. all TFs.
# The labels now say CDS (they previously said 5pUTR while printing CDS_len).
# CTS TFs are selected with 'TF & CTS' (as in the earlier cells) instead of the
# 'CTS_TF' column, which this notebook only creates in a later cell.
print('len CDS overall', coding_df['CDS_len'].median())
print('len CDS CTS_TF', coding_df.query('TF & CTS')['CDS_len'].median())
print('len CDS TF', coding_df.query('TF')['CDS_len'].median())

In [None]:
# Second output directory, for per-gene feature tables; created if it does not exist yet.
outdir2 = '../Figures/Reg/Features'
os.makedirs(outdir2, exist_ok=True)

In [None]:
# Write the lengths to a file:
# to_csv() without a path only returns the CSV text and writes nothing, so give
# it a destination inside outdir2 (the features directory created above).
# NOTE(review): the file name is a placeholder choice -- rename if downstream
# code expects a specific name.
lengths_csv = os.path.join(outdir2, 'region_lengths.csv')
coding_df[['5pUTR_len', '3pUTR_len', 'CDS_len']].to_csv(lengths_csv)

In [None]:
# Preview the first rows of the coding-gene table, including the new length columns.
coding_df.head()

In [None]:
# Flag genes that are both CTS and TF, then compare log10 3'UTR length between
# flagged and unflagged genes.
coding_df['CTS_TF'] = coding_df['CTS'] & coding_df['TF']
ax = sns.boxplot(
    x='CTS_TF',
    y='3pUTR_log',
    data=coding_df,
)

In [None]:
# Flag genes that are both CTS and TF, then compare log10 5'UTR length between
# flagged and unflagged genes.
coding_df['CTS_TF'] = coding_df['CTS'] & coding_df['TF']
ax = sns.boxplot(
    x='CTS_TF',
    y='5pUTR_log',
    data=coding_df,
)

In [None]:
# Flag genes that are both CTS and TF, then compare log10 CDS length between
# flagged and unflagged genes.
coding_df['CTS_TF'] = coding_df['CTS'] & coding_df['TF']
ax = sns.boxplot(
    x='CTS_TF',
    y='CDS_log',
    data=coding_df,
)

In [None]:
# Inspect the 3'UTR FASTA handle.
# NOTE(review): this cell referenced `txts`, which is not defined anywhere in
# this notebook; `three_txts` is assumed to be the intended handle -- confirm.
three_txts

In [None]:
# 3'UTR length of the first CTS TF gene.
# NOTE(review): this cell referenced `txts`, which is not defined anywhere in
# this notebook; `three_txts` is assumed to be the intended handle -- confirm.
len(three_txts[cts_tfs[0]])

In [None]:
# Display the index of CTS TF gene IDs.
cts_tfs

In [None]:
# Sanity check of the background gene list written earlier.
# There's actually a fair number of genes that have deg_rate = 0, and they are
# therefore randomly ordered in the list. Not sure if this is good.
bg_genes_csv = os.path.join(outdir, 'bg_genes.csv')
test = pd.read_csv(bg_genes_csv)
test.tail()

In [None]:
# Show the coding genes with the lowest deg_rate values (ascending sort).
coding_df.sort_values(by='deg_rate').head()

In [None]:
# Re-read the background gene list from disk.
# NOTE(review): this looks like the same file as os.path.join(outdir, 'bg_genes.csv')
# loaded into `test` in an earlier cell -- confirm whether both copies are needed.
df = pd.read_csv('../Figures/Motifs/bg_genes.csv')

In [None]:
# Spot-check a single gene's row by ID.
# Note: this rebinds `i`, which an earlier cell used as the coding-gene counter.
i = 'FBgn0020618'
coding_df.loc[i]

In [None]:
# Show the first 100 rows of the background gene list.
df.head(n=100)

In [None]:
# Spot-check another gene's row by ID.
coding_df.loc['FBgn0051606']

In [None]:
# NOTE(review): `df2` is not defined anywhere in the visible notebook; this cell
# is presumably a leftover from an earlier session and fails on a fresh
# top-to-bottom run -- confirm and remove or define `df2`.
df2.head()