### Prep rate SDs
- Calculate the SDs of the synthesis and decay rates from the synthesis, total RNA, and pre-mRNA variances reported by INSPEcT
- Write supplemental file with the rates and measured quantities: synth, decay
#### TODO: 
- Figure out how processing rate is measured so that I can also calculate the SD of that.
- Should total, mature, and pre-RNA levels also be reported? Figure out whether there is actually a difference between the total TPM levels and those output by INSPEcT (I thought that there might be some scaling involved).

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats

sys.path.append('../scripts')
from plot_helpers import *
from utilities import load_dataset
%load_ext autoreload
%autoreload 2

In [None]:
# Load the INSPEcT outputs: every CSV is read into a one-column frame that is
# indexed by the first CSV column, then all frames are joined side by side.
outdir = '../Figures/Overview'
os.makedirs(outdir, exist_ok = True)


def _read_inspect_column(filename, column):
    """Read `filename` from inspect_dir as a frame whose single column is `column`."""
    path = os.path.join(inspect_dir, filename)
    return pd.read_csv(path, index_col=0, names=[column], header=0)


# Variances reported by INSPEcT
syn_var = _read_inspect_column('synth_var.csv', 'syn_var')
tot_var = _read_inspect_column('total_var.csv', 'tot_var')
pre_var = _read_inspect_column('premrna_var.csv', 'pre_var')
# Rates and levels reported by INSPEcT
syn_rates = _read_inspect_column('synth_rates.csv', 'syn_rate')
tot_levels = _read_inspect_column('tot_levels.csv', 'tot_level')
deg_rates = _read_inspect_column('deg_rates.csv', 'deg_rate')
premrna_levels = _read_inspect_column('premrna_levels.csv', 'pre_level')

# Column order of the combined table: rates, levels, then variances
df = pd.concat(
    [syn_rates, deg_rates, tot_levels, premrna_levels, syn_var, tot_var, pre_var],
    axis=1,
)

In [None]:
# Recalculate the degradation rate from its components to check that the
# formula is correct: syn_rate / (tot_level - pre_level)
df['deg_rate2'] = df['syn_rate'].div(df['tot_level'].sub(df['pre_level']))
# placeholder: df['proc_rate2'] is not computed yet

In [None]:
# Derive the variance of the decay rate by propagating the INSPEcT variances
# https://chem.libretexts.org/Bookshelves/Analytical_Chemistry/Supplemental_Modules_(Analytical_Chemistry)/Quantifying_Nature/Significant_Digits/Propagation_of_Error

# Mature RNA is a difference (total - pre-mRNA), so its variance is taken as
# the sum of the two component variances
df['mat_level'] = df['tot_level'].sub(df['pre_level'])
df['mat_var'] = df['pre_var'].add(df['tot_var'])

# The decay rate is a quotient (synthesis / mature), so the squared relative
# errors of numerator and denominator are summed and scaled by deg_rate^2
df['deg_var'] = (
    df['syn_var'] / df['syn_rate']**2
    + df['mat_var'] / df['mat_level']**2
) * df['deg_rate']**2

# Standard deviations are the square roots of the variances
df['deg_sd'] = np.sqrt(df['deg_var'])
df['syn_sd'] = np.sqrt(df['syn_var'])
# Coefficients of variation (in percent) are currently disabled:
# df['deg_cv'] = (np.sqrt(df['deg_var'])*100)/df['deg_rate']
# df['syn_cv'] = (np.sqrt(df['syn_var'])*100)/df['syn_rate']
# df['mat_cv'] = (np.sqrt(df['mat_var'])*100)/df['mat_level']

# Label the index so it is written out under the header 'gene'
df.index.name = 'gene'

In [None]:
# Build the supplemental rate table from the INSPEcT frame `df` and write it
# to <outdir>/INSPEcT_rates.csv.
outdir = '../Figures/summary_files'
os.makedirs(outdir, exist_ok = True)
# Divide the rates by 60 to convert from per hr to per min. The SDs are
# divided by the same factor because an SD scales linearly with its quantity.
rate_df = df[['syn_rate', 'deg_rate', 'syn_sd', 'deg_sd']]/60
# Half-life from the decay rate: ln(2) / deg_rate
rate_df['halflife'] = math.log(2)/rate_df['deg_rate']
# Cap half-lives at 500; clip() is the vectorised form of the row-wise cap
# and leaves NaN values untouched
rate_df['halflife_capped'] = rate_df['halflife'].clip(upper=500)
# Percentile rank (0-100) of 1/deg_rate, so slower decay ranks higher
rate_df['stab_percentile'] = (1/rate_df['deg_rate']).rank(pct=True)*100

# Add total TPM for input/total RNA to the dataframe
# Open question: is the total TPM that is input into INSPEcT different from
# the TPM values it returns?
infile = os.path.join(results_dir, 'gene_quantification','summary_abundance_by_gene_filtered.csv')
# Keep the abundance table under its own name: rebinding `df` here would
# discard the INSPEcT table, so re-running this cell (or an earlier one that
# reads df['syn_rate'] etc.) would fail with a KeyError.
abundance_df = load_dataset(infile, '../Figures/summary_files/brain4sU_passed.csv')
# Mean summed_tpm_recalc per gene over the rows with RNAtype == "input"
total_tpm = abundance_df.query('RNAtype == "input"').groupby('gene')['summed_tpm_recalc'].mean()
# Assignment aligns on the index, so genes absent from total_tpm get NaN
rate_df['total_tpm'] = total_tpm
rate_df.to_csv(os.path.join(outdir, 'INSPEcT_rates.csv'))