#### Reg Predict
- Show the contribution of various properties (H3K27me3 target, TF, CTS) to the RNA decay rate in a linear model

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import statsmodels.api as sm
import itertools
from decimal import Decimal

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats.mstats import winsorize

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

In [None]:
# Output directory for the figures saved by this notebook; created if it does not exist yet.
outdir = '../Figures/Reg/'
os.makedirs(outdir, exist_ok=True)

In [None]:
# Run OLS +/- interactions for each case
def combine_pred_names(predictors):
    """Return labels for the intercept, each predictor and every interaction term.

    Labels are ordered by increasing interaction order: the empty combination
    is labelled 'intercept', then the single predictors, then pairs, and so on.
    The members of an interaction are joined with ':' (e.g. 'TF:CTS').
    """
    return [
        ':'.join(combo) if combo else 'intercept'
        for size in range(len(predictors) + 1)
        for combo in itertools.combinations(predictors, size)
    ]

def run_OLS(df, target_column, predictors, interactions=False):
    """Fit an ordinary least squares model of ``target_column`` on ``predictors``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the target and predictor columns.
    target_column : str or list of str
        Column(s) used as the response; passed straight to ``df[...]``.
    predictors : sequence of str
        Names of the predictor columns.
    interactions : bool, default False
        If True, the design matrix contains the intercept, every predictor and
        every product of distinct predictors up to order ``len(predictors)``,
        labelled by ``combine_pred_names``. If False, it contains only the
        intercept and the main effects.

    Returns
    -------
    The fitted statsmodels results object returned by ``sm.OLS(...).fit()``.
    """
    predictors = list(predictors)
    # Cast to float, not int: boolean flags still become 0/1, but continuous
    # or standardized predictors are no longer truncated toward zero.
    X = df[predictors].to_numpy(dtype=float)
    y = df[target_column].values
    if interactions:
        names = combine_pred_names(predictors)
        # PolynomialFeatures already adds the constant (bias) column, and with
        # interaction_only=True its columns follow the same order as `names`.
        poly = PolynomialFeatures(interaction_only=True, degree=len(predictors))
        design = pd.DataFrame(poly.fit_transform(X), columns=names)
    else:
        design = pd.DataFrame(sm.add_constant(X), columns=['intercept'] + predictors)
    return sm.OLS(y, design).fit()

In [None]:
# Load the per-gene table (indexed by gene) and derive the regression inputs:
# an me3-target flag and the log10 decay rate, winsorized at 1% in each tail.
# rate_df = load_dataset('../Figures/summary_files/INSPEcT_rates.csv', '../Figures/summary_files/brain4sU_passed.csv')
me_df = pd.read_csv('../Figures/Devreg/gene_cat_me3.csv', index_col='gene')
me_df['me3_target'] = me_df['category'].eq('updowngene')
me_df['log_deg'] = np.log10(me_df['deg_rate'])
me_df['log_deg_wins_1'] = winsorize(me_df['log_deg'], limits=(0.01, 0.01))

target_column = ['log_deg_wins_1']
predictors_long = ['TF', 'CTS', 'me3_target']
# Full model with all interaction terms, then the two single-predictor fits.
res_full = run_OLS(me_df, target_column, predictors_long, interactions=True)
res_TF_only, res_CTS_only = (run_OLS(me_df, target_column, [name]) for name in ('TF', 'CTS'))

In [None]:
# Display the fit summary of the CTS-only model.
res_CTS_only.summary()

In [None]:
# Collect the p-values and coefficients of the full interaction model into one table,
# with one row per model term.
pvals = res_full.pvalues
coeff = res_full.params

results_df = pd.DataFrame({"pvals":pvals, "coeff":coeff})

In [None]:
# https://github.com/matplotlib/matplotlib/issues/12828
# https://stackoverflow.com/questions/9932072/matplotlib-table-formatting
# Format the results table: coefficients rounded to 3 decimals, p-values in
# scientific notation, and underscores removed from the row labels.
# Having difficulty getting fontsize=6 to fit in the single figure size
row_labels = {
    'me3_target': 'me3 target',
    'TF:me3_target': 'TF:me3 target',
    'CTS:me3_target': 'CTS:me3 target',
    'TF:CTS:me3_target': 'TF:CTS:me3 target',
}
results_df2 = results_df[['coeff', 'pvals']].copy()
results_df2['pvals'] = results_df2['pvals'].apply(lambda x: '%.2E' % Decimal(x))
results_df2['coeff'] = results_df2['coeff'].round(3)
results_df2 = results_df2.rename(columns={'pvals': 'p-value'}).rename(index=row_labels)

lgrey = mpl.colors.to_rgba(color_dict['grey'], 0.3)

fig = plt.figure(figsize=(dfig, dfig))
gs = fig.add_gridspec(ncols=1, nrows=8)
# Leave the top grid row free for the title text drawn in figure coordinates below.
ax = fig.add_subplot(gs[1:])
n_rows, n_cols = results_df2.shape
table = ax.table(
    cellText=results_df2.values,
    colLabels=results_df2.keys(),
    rowLabels=results_df2.index,
    rowColours=[lgrey] * n_rows,
    # One colour per column header: size this by the number of columns, not rows.
    colColours=[lgrey] * n_cols,
    colWidths=[0.4] * n_cols,
    loc='center',
)
ax.set(xticks=[], yticks=[])
ax.text(0.5, 0.97, 'Regression model for RNA decay rate', ha='center', va='top', fontsize=6, transform=fig.transFigure)

ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
# Fix the font size explicitly instead of letting matplotlib auto-scale it.
table.auto_set_font_size(False)
table.set_fontsize(5)
# Widen the left margin; the row labels are drawn to the left of the table.
plt.subplots_adjust(left=0.4)
plt.savefig('%s.%s' % (os.path.join(outdir, 'model_table'), out_fmt), dpi=out_dpi)

In [None]:
# Now try predicting with more basic features, like lengths, GC content, and CAI
# Gene attribute table; the first CSV column becomes the index.
att_df = pd.read_csv('../Figures/gene_attributes/gene_attributes.csv', index_col=0)

In [None]:
# Attach the gene attributes to the winsorized log decay rate by matching on the index
# (default inner merge, so only genes present in both tables are kept).
# Just use CAI_hiexp since very similar to CAI_all
df2 = (
    me_df[['log_deg_wins_1']]
    .merge(att_df, left_index=True, right_index=True)
    .drop(columns='CAI_all')
)

In [None]:
# Preview the merged decay-rate / gene-attribute table.
df2.head()

In [None]:
# Main-effects-only fit of the winsorized log decay rate on the gene attribute columns.
target_column = ['log_deg_wins_1']
# predictors = df2.columns.tolist()
# predictors = ['CAI_hiexp', 'log_deg_wins_1']
# The target is filtered out of the candidate list so it is never used as its own predictor.
predictors = [
    col
    for col in ['tutr_len', 'futr_len', 'cds_len', 'futr_gc', 'tutr_gc', 'log_deg_wins_1']
    if col != target_column[0]
]
res_full2 = run_OLS(df2, target_column, predictors)

In [None]:
# 2D histogram of log_deg_wins_1 against tutr_gc.
# NOTE(review): in the visible notebook df3 is first created in a later cell (the
# StandardScaler cell), so this cell raises NameError on a fresh Restart & Run All
# unless that cell has been run first.
fig, ax = plt.subplots()
# ax.scatter(df3['tutr_len'], df3['log_deg_wins_1'])
# ax = sns.histplot(x='tutr_len', y='log_deg_wins_1', data=df3, bins=50, pthresh=.1, cmap="mako", ax=ax)
# ax.set_xlim(0,2.5)
ax = sns.histplot(data=df3, x='tutr_gc', y='log_deg_wins_1', bins=50, pthresh=.1, cmap="mako", ax=ax)


In [None]:
# Preview me_df, including the derived me3_target and log decay-rate columns.
me_df.head()

In [None]:
# Copy the CTS and TF columns from me_df onto df3 (rows are matched by index label).
# NOTE(review): in the visible notebook df3 is only created in a later cell (the
# StandardScaler cell), so this cell fails on a fresh top-to-bottom run.
df3[['CTS', 'TF']] = me_df[['CTS', 'TF']]

In [None]:
# Split the TFs into two groups: those also flagged CTS (CTS_TF) and the rest (other_TF).
# assumes the CTS and TF columns are boolean, as required by & and ~ — TODO confirm
df3['CTS_TF'] = df3['CTS'] & df3['TF']
df3['other_TF'] = df3['TF'] & ~df3['CTS']

In [None]:
# Preview df3 with the added CTS / TF group columns.
df3.head()

In [None]:
# Median tutr_len among the rows flagged other_TF (TF but not CTS).
df3.query('other_TF')['tutr_len'].median()

In [None]:
# Median tutr_len among the rows flagged CTS_TF (both TF and CTS), for comparison with other_TF.
df3.query('CTS_TF')['tutr_len'].median()

It seems like these features are not good global predictors of RNA stability.
What about for TFs specifically?
Or could they be used to predict which TFs are cell-type-specific?
A big limitation of cell-type-specific TFs is that you can only find them if they are in your tissue of interest, so this approach may be too biased.

In [None]:
# Reshape to long format: keep the two group flags as identifier columns and
# unpivot log_deg_wins_1 into variable/value columns. The result is only displayed, not stored.
pd.melt(df3, id_vars=['CTS_TF', 'other_TF'], value_vars='log_deg_wins_1')

In [None]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); consider removing
# it from the finished notebook.
?pd.melt

In [None]:
# Standardize the predictors with StandardScaler, then refit the main-effects model.
from sklearn.preprocessing import StandardScaler

predictors = ['tutr_len','futr_len', 'cds_len', 'futr_gc', 'tutr_gc']
scaler = StandardScaler()
features_scaled = scaler.fit_transform(df2[predictors].values)
targets = df2[target_column].values
# Rebuild a frame on df2's index: scaled predictor columns first, target column last.
df3 = pd.DataFrame(
    np.hstack([features_scaled, targets]),
    columns=predictors + target_column,
    index=df2.index,
)
# Earlier attempt, kept for reference:
# df_scaled = pd.DataFrame(scaler.fit_transform(df2[predictors]),columns = df2[predictors].columns)
# df3 = pd.concat([df_scaled, df2[target_column]], axis=1, ignore_index=True)
# Why it did not work: df_scaled is built without index=..., so it gets a default
# 0..n-1 index, while df2[target_column] keeps df2's index. pd.concat(axis=1) aligns
# rows on index labels, so the two halves do not line up, and ignore_index=True
# additionally replaces the column names with 0..k-1.
res_full3 = run_OLS(df3, target_column, predictors)
res_full3.summary()

In [None]:
# Refit on the scaled frame and display the summary.
# NOTE(review): these two statements repeat the end of the previous cell verbatim.
res_full3 = run_OLS(df3, target_column, predictors)
res_full3.summary()

In [None]:
# Show the predictor list currently in use.
predictors

In [None]:
# Reload the per-gene table and refit the interaction and single-predictor models.
# NOTE(review): this cell performs the same load/transform/fit steps as the data-loading
# cell near the top of the notebook; consider keeping only one copy.
me_df = pd.read_csv('../Figures/Devreg/gene_cat_me3.csv', index_col='gene')
me_df['me3_target'] = me_df['category'].eq('updowngene')
me_df['log_deg'] = np.log10(me_df['deg_rate'])
me_df['log_deg_wins_1'] = winsorize(me_df['log_deg'], limits=(0.01, 0.01))

target_column = ['log_deg_wins_1']
predictors_long = ['TF', 'CTS', 'me3_target']
res_full = run_OLS(me_df, target_column, predictors_long, interactions=True)
res_TF_only, res_CTS_only = (run_OLS(me_df, target_column, [name]) for name in ('TF', 'CTS'))

In [None]:
# Preview df2 (decay rate merged with gene attributes).
df2.head()

In [None]:
# Preview the gene attribute table.
att_df.head()

In [None]:
# List the property names available on the matplotlib table object created in the figure cell.
list(table.properties().keys())