#### Reg predict RBPs
- look at the predictive power of RBP binding on RNA stability
- load significant RBPs from Transite motif analysis output

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import statsmodels.api as sm
import itertools
from decimal import Decimal

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats.mstats import winsorize

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for this notebook (presumably for saved figures; it is not
# written to in the cells visible here).
outdir = '../Figures/Reg/'
# Create the directory if needed; exist_ok=True makes re-running the cell safe.
os.makedirs(outdir, exist_ok=True)

In [None]:
# Read the per-gene table (indexed by gene) and derive the stability columns:
# a boolean me3-target flag, the log10 degradation rate, and a copy of the
# log rate winsorized at 1% in each tail.
# Alternative loader kept for reference:
# rate_df = load_dataset('../Figures/summary_files/INSPEcT_rates.csv', '../Figures/summary_files/brain4sU_passed.csv')
me_df = pd.read_csv('../Figures/Devreg/gene_cat_me3.csv', index_col='gene')
me_df['me3_target'] = me_df['category'].eq('updowngene')
me_df['log_deg'] = np.log10(me_df['deg_rate'])
me_df['log_deg_wins_1'] = winsorize(me_df['log_deg'], limits=(0.01, 0.01))

In [None]:
import glob

# Attach the Transite motif-hit table of each UTR region to me_df.
# motif_cols collects the (region-prefixed) names of every column added.
motif_dir = '../Figures/Motifs/'
motif_cols = []
# Region prefix -> directory holding that region's Transite spectrum output.
d = {
    'three_prime': os.path.join(motif_dir, 'transite_spectrum_3pUTR'),
    'five_prime': os.path.join(motif_dir, 'transite_spectrum_5pUTR'),
}
for region, region_dir in d.items():
    # Sort so the same file is chosen on every run if several match the pattern.
    hits_files = sorted(glob.glob(f'{region_dir}/*_hits.csv'))
    if not hits_files:
        # Fail with the offending path instead of a bare IndexError.
        raise FileNotFoundError(f'No *_hits.csv file found in {region_dir}')
    hits_file = hits_files[0]
    df = pd.read_csv(hits_file, index_col=0)
    # Prefix with the region so 3' and 5' UTR columns cannot collide.
    df.columns = [f'{region}_{col}' for col in df.columns]
    motif_cols.extend(df.columns.tolist())
    # Default (inner) join on the index: rows missing from the hits table are
    # dropped from me_df.
    # NOTE(review): re-running this cell merges the same columns again and
    # pandas will suffix them with _x/_y; reload me_df first.
    me_df = pd.merge(me_df, df, left_index=True, right_index=True)

In [None]:
# Display the names of the motif-hit columns that were merged into me_df.
motif_cols

In [None]:
# Display every column now present in me_df (original plus merged motif columns).
me_df.columns

In [None]:
# Preview the first rows of the merged table.
me_df.head()

In [None]:
# Preview this_df.
# NOTE(review): in the cells visible here this_df is first assigned in the
# following cell, so a top-to-bottom run would raise NameError at this point.
this_df.head()

In [None]:
# Sum the TF column within each hit-count value of the first motif column,
# expose the hit count as a regular 'num hits' column, and draw it as bars.
this_df = (
    me_df.groupby(motif_cols[0])['TF']
    .sum()
    .to_frame()
    .rename_axis('num hits')
    .reset_index()
)
sns.barplot(data=this_df, x='num hits', y='TF')

In [None]:
# Histogram of the per-hit-count TF sums for the first motif column.
sns.histplot(me_df.groupby(motif_cols[0])['TF'].sum())

In [None]:
# Show this_df with the outer column level removed. The result is only
# displayed, not stored, so this_df itself is unchanged.
# NOTE(review): this needs this_df to have MultiIndex columns (e.g. straight
# out of .agg with a list of functions); on a frame with flat columns pandas
# is expected to raise - confirm which this_df is intended here.
this_df.droplevel(level=0, axis=1)

In [None]:
# For one motif, tabulate per hit-count value: rows with a non-null TF value
# ('count'), the sum of TF ('sum'), and their difference ('not').
# Define motif here so the cell also runs on a fresh top-to-bottom execution;
# motif_cols[0] is the motif the neighbouring exploratory cells inspect.
motif = motif_cols[0]
this_df = me_df.groupby(motif).agg({'TF':['count', 'sum']})
# droplevel returns a new frame; keep it so the columns are plain
# 'count'/'sum' rather than ('TF', 'count')/('TF', 'sum').
this_df = this_df.droplevel(level=0, axis=1)
# Needed by the later true_frac/false_frac cell.
this_df['not'] = this_df['count'] - this_df['sum']
this_df


In [None]:
# Count how often each TF value occurs within every hit-count value of `motif`,
# kept as a one-column frame indexed by (hit count, TF value).
s = me_df.groupby(motif)['TF'].value_counts().to_frame()



In [None]:
# Display the value-count table. (The original cell held an unfinished
# attribute access, which is a SyntaxError.)
s

In [None]:
# Inspect the index of the value-count table.
s.index

In [None]:
# Turn the per-hit-count totals into fractions of their own column total.
# Requires this_df to carry flat 'sum' and 'not' columns.
this_df['true_frac'] = this_df['sum'].div(this_df['sum'].sum())
this_df['false_frac'] = this_df['not'].div(this_df['not'].sum())

In [None]:
# Display this_df2.
# NOTE(review): in the cells visible here this_df2 is only assigned inside the
# later plotting loops, so this cell depends on having run one of them first.
this_df2

In [None]:
# Split the TF flag by the CTS flag: rows that are both TF and CTS, and rows
# that are TF but not CTS.
# NOTE(review): assumes 'TF' and 'CTS' are boolean columns (they are not created
# in the cells visible here) - `~` on a non-boolean column would not negate it
# logically.
me_df['CTS_TF'] = me_df['TF'] & me_df['CTS']
me_df['other_TF'] = me_df['TF'] & ~me_df['CTS']

In [None]:
# For the first motif, tabulate per hit-count value the number of rows and the
# number of CTS_TF / other_TF rows, then derive the rows that are in neither
# TF category.
motif = motif_cols[0]
df2 = me_df.groupby(motif).agg({'CTS_TF':['count', 'sum'], 'other_TF':['count', 'sum']})
# Column-wise arithmetic: rows in the bin minus both TF categories.
df2[('other', 'sum')] = df2[('other_TF', 'count')] - (
    df2[('other_TF', 'sum')] + df2[('CTS_TF', 'sum')]
)


In [None]:
# Empty frame sharing df2's index (the motif hit counts); fraction columns are
# added to it in the next cell.
df3 = pd.DataFrame(index=df2.index)

In [None]:
# Express each category's per-hit-count total as a fraction of that category's
# overall total, so the three distributions are directly comparable.
df3['CTS_TF_frac'] = df2[('CTS_TF', 'sum')].div(df2[('CTS_TF', 'sum')].sum())
df3['other_TF_frac'] = df2[('other_TF', 'sum')].div(df2[('other_TF', 'sum')].sum())
df3['other_frac'] = df2[('other', 'sum')].div(df2[('other', 'sum')].sum())

In [None]:
# Display the per-category fraction table.
df3

In [None]:
# For every motif, plot how three gene groups are distributed over the number
# of motif hits: other TFs (dashed), rows in neither TF category (solid,
# "_bg") and CTS TFs (dotted). All three lines of a motif share one colour.
fig = plt.figure()
ax = fig.add_subplot()
for motif in motif_cols:
    df2 = me_df.groupby(motif).agg({'CTS_TF':['count', 'sum'], 'other_TF':['count', 'sum']})
    # Rows in each hit-count bin that are in neither TF category
    # (column-wise arithmetic instead of a row-wise apply).
    df2[('other', 'sum')] = df2[('other_TF', 'count')] - (
        df2[('other_TF', 'sum')] + df2[('CTS_TF', 'sum')]
    )
    df3 = pd.DataFrame(index=df2.index)
    # Normalise each group by its own total so the curves are fractions.
    df3['other_TF_frac'] = df2[('other_TF', 'sum')]/df2[('other_TF', 'sum')].sum()
    df3['other_frac'] = df2[('other', 'sum')]/df2[('other', 'sum')].sum()
    df3['CTS_TF_frac'] = df2[('CTS_TF', 'sum')]/df2[('CTS_TF', 'sum')].sum()
    df3.index.name = 'num hits'
    df3.reset_index(inplace=True)
    # Let matplotlib assign the next cycle colour to the first line and reuse it
    # for the other two. This avoids the private ax._get_lines.prop_cycler,
    # which no longer exists in matplotlib >= 3.8.
    (tf_line,) = ax.plot(df3['num hits'], df3['other_TF_frac'], label=f'{motif}_TF', linestyle='--')
    color = tf_line.get_color()
    ax.plot(df3['num hits'], df3['other_frac'], label=f'{motif}_bg', color=color)
    ax.plot(df3['num hits'], df3['CTS_TF_frac'], label=f'{motif}_CTS_TF', linestyle=':', color=color)
ax.legend()
ax.set_xlabel('number of motifs')
# NOTE(review): the plotted values are per-group fractions, not counts;
# consider relabelling the y axis.
ax.set_ylabel('number of TFs')



In [None]:
# Rows per hit-count bin that are in neither TF category, stored as ('all', 'sum').
# NOTE(review): this is the same formula used for ('other', 'sum') - confirm
# that 'all' is the intended label.
df2[('all', 'sum')] = df2[('other_TF', 'count')] - (
    df2[('other_TF', 'sum')] + df2[('CTS_TF', 'sum')]
)

In [None]:
# For every motif, plot over the number of motif hits: the fraction of TF rows
# (solid), of non-TF rows (dashed, "_bg") and of CTS_TF rows (dotted).
# All three lines of a motif share one colour.
fig = plt.figure()
ax = fig.add_subplot(111)
for motif in motif_cols:
    # Per hit-count bin: non-null TF rows ('count'), TF sum ('sum'), remainder ('not').
    this_df = me_df.groupby(motif).agg({'TF':['count', 'sum']}).droplevel(level=0, axis=1)
    this_df['not'] = this_df['count'] - this_df['sum']
    this_df2 = me_df.groupby(motif).agg({'CTS_TF':'sum'})
    # Normalise each group by its own total so the curves are fractions.
    this_df2['true_frac'] = this_df2['CTS_TF']/this_df2['CTS_TF'].sum()
    this_df['true_frac'] = this_df['sum']/this_df['sum'].sum()
    this_df['false_frac'] = this_df['not']/this_df['not'].sum()
    this_df.index.name = 'num hits'
    this_df.reset_index(inplace=True)
    this_df2.index.name = 'num hits'
    this_df2.reset_index(inplace=True)
    # Let matplotlib assign the next cycle colour to the first line and reuse it
    # for the other two. This avoids the private ax._get_lines.prop_cycler,
    # which no longer exists in matplotlib >= 3.8.
    (tf_line,) = ax.plot(this_df['num hits'], this_df['true_frac'], label=motif)
    color = tf_line.get_color()
    ax.plot(this_df['num hits'], this_df['false_frac'], label=f'{motif}_bg', linestyle='--', color=color)
    ax.plot(this_df2['num hits'], this_df2['true_frac'], label=f'{motif}_CTS_TF', linestyle=':', color=color)
ax.legend()
ax.set_xlabel('number of motifs')
# NOTE(review): the plotted values are per-group fractions, not counts;
# consider relabelling the y axis.
ax.set_ylabel('number of TFs')
    # ax = sns.histplot(x='num hits', y='TF', data=this_df, ax=ax)

    #ax = sns.histplot(x='num hits', y='TF', data=this_df, ax=ax)
    # sns.histplot(me_df.groupby(motif)['TF'].sum())

In [None]:
# Could the A enrichment be because they are long?

In [None]:
# Hit counts of one 3' UTR motif column, restricted to rows where CTS_TF is true.
# NOTE(review): the column name is hard-coded and must be one of the merged
# motif columns, otherwise this raises KeyError.
me_df.query('CTS_TF')['three_prime_M146_0.6']

In [None]:
# For every motif, plot over the number of motif hits: the fraction of TF rows
# (solid), of non-TF rows (dashed, "_bg") and of CTS_TF rows (dotted).
# All three lines of a motif share one colour.
# NOTE(review): this cell repeats an earlier plotting cell of this notebook
# verbatim; consider keeping only one of them.
fig = plt.figure()
ax = fig.add_subplot(111)
for motif in motif_cols:
    # Per hit-count bin: non-null TF rows ('count'), TF sum ('sum'), remainder ('not').
    this_df = me_df.groupby(motif).agg({'TF':['count', 'sum']}).droplevel(level=0, axis=1)
    this_df['not'] = this_df['count'] - this_df['sum']
    this_df2 = me_df.groupby(motif).agg({'CTS_TF':'sum'})
    # Normalise each group by its own total so the curves are fractions.
    this_df2['true_frac'] = this_df2['CTS_TF']/this_df2['CTS_TF'].sum()
    this_df['true_frac'] = this_df['sum']/this_df['sum'].sum()
    this_df['false_frac'] = this_df['not']/this_df['not'].sum()
    this_df.index.name = 'num hits'
    this_df.reset_index(inplace=True)
    this_df2.index.name = 'num hits'
    this_df2.reset_index(inplace=True)
    # Take the colour from the first plotted line rather than from the private
    # ax._get_lines.prop_cycler, which no longer exists in matplotlib >= 3.8.
    (tf_line,) = ax.plot(this_df['num hits'], this_df['true_frac'], label=motif)
    color = tf_line.get_color()
    ax.plot(this_df['num hits'], this_df['false_frac'], label=f'{motif}_bg', linestyle='--', color=color)
    ax.plot(this_df2['num hits'], this_df2['true_frac'], label=f'{motif}_CTS_TF', linestyle=':', color=color)
ax.legend()
ax.set_xlabel('number of motifs')
# NOTE(review): the plotted values are per-group fractions, not counts;
# consider relabelling the y axis.
ax.set_ylabel('number of TFs')
    # ax = sns.histplot(x='num hits', y='TF', data=this_df, ax=ax)

    #ax = sns.histplot(x='num hits', y='TF', data=this_df, ax=ax)
    # sns.histplot(me_df.groupby(motif)['TF'].sum())

In [None]:
# One figure per motif: distribution of the winsorized log degradation rate
# for each motif hit count.
for motif in motif_cols:
    fig, ax = plt.subplots()
    sns.violinplot(data=me_df, x=motif, y='log_deg_wins_1', ax=ax)
    ax.set(title=motif, ylabel='log deg', xlabel='number of motifs')

In [None]:
# Features: one column per motif (hit counts). Target: winsorized log10
# degradation rate. Both as plain NumPy arrays for scikit-learn.
X = me_df[motif_cols].to_numpy()
y = me_df['log_deg_wins_1'].to_numpy()

In [None]:
# Fit a random forest regressor to see how much of the (winsorized log)
# degradation rate the RBP motif hit counts can predict.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed both the split and the forest so the reported test score is
# reproducible from run to run.
RANDOM_STATE = 0

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
clf = RandomForestRegressor(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the model on the held-out test set.
# clf is a RandomForestRegressor fitted on a continuous target, so the
# classification tools used previously do not apply: regressors expose no
# predict_proba, and roc_curve requires discrete class labels. Report
# regression metrics instead.
from sklearn.metrics import roc_curve, auc  # kept in case cells outside this view use them
from sklearn.metrics import mean_squared_error, r2_score

y_pred = clf.predict(X_test)
print('test R^2', r2_score(y_test, y_pred))
print('test MSE', mean_squared_error(y_test, y_pred))

In [None]:
# Score the fitted model on the test split; for scikit-learn regressors
# .score() is documented to return the coefficient of determination (R^2).
clf.score(X_test, y_test)

In [None]:
# Display the motif column names used as model features.
motif_cols

In [None]:
# Preview the first rows of me_df, including the derived CTS_TF/other_TF columns.
me_df.head()