### Prep rate SDs
- Calculate the SD of processing and decay rates using the variances reported by INSPEcT
- Write supplemental file with the rates and measured quantities: synth, processing, decay, total, pre mRNA

In [None]:
# Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
import gffutils

# Make the project's helper modules in ../scripts importable from this notebook
sys.path.append('../scripts')
# NOTE(review): `gffutils_db`, `inspect_dir` and `geo_outdir` are used in this notebook but
# never defined in it; presumably they come from this star import -- confirm in plot_helpers
from plot_helpers import *
from utilities import load_dataset
# Open the gffutils annotation database; used further down to look up gene biotype and gene name
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

In [None]:
# Load the INSPEcT output tables and join them into one per-gene table
outdir = '../Figures/Overview'
os.makedirs(outdir, exist_ok = True)


def _read_inspect_column(filename, column):
    """Read one INSPEcT csv from inspect_dir, using its first column as the index.

    header=0 together with names= drops the file's own header row and
    labels the value column `column` instead.
    """
    return pd.read_csv(os.path.join(inspect_dir, filename), index_col=0, names=[column], header=0)


# Variances
syn_var = _read_inspect_column('synth_var.csv', 'syn_var')
tot_var = _read_inspect_column('total_var.csv', 'tot_var')
pre_var = _read_inspect_column('premrna_var.csv', 'pre_var')
# Rates
syn_rates = _read_inspect_column('synth_rates.csv', 'syn_rate')
deg_rates = _read_inspect_column('deg_rates.csv', 'deg_rate')
proc_rates = _read_inspect_column('proc_rates.csv', 'proc_rate')
# Measured levels
premrna_levels = _read_inspect_column('premrna_levels.csv', 'pre_level')
tot_levels = _read_inspect_column('tot_levels.csv', 'tot_level')

# Column-wise join on the shared index (one row per index entry)
df = pd.concat(
    [syn_rates, deg_rates, proc_rates, tot_levels, premrna_levels, syn_var, tot_var, pre_var],
    axis=1,
)
# Missing values become 0 so that deg rates can be calculated for genes without introns (pre_level=0).
# NOTE(review): this zero-fills every column, so a missing rate or variance is also reported as 0.
df = df.fillna(0)

In [None]:
# Sanity check: recalculate the rates from their components to confirm the formulas
#   deg_rate  = syn_rate / (tot_level - pre_level)   (synthesis over mature RNA level)
#   proc_rate = syn_rate / pre_level                 (synthesis over pre-mRNA level)
# Genes with pre_level = 0 get a non-finite proc_rate2 here.
df['deg_rate2'] = df['syn_rate'].div(df['tot_level'].sub(df['pre_level']))
df['proc_rate2'] = df['syn_rate'].div(df['pre_level'])

In [None]:
# Derive the variances of the decay and processing rates by error propagation
# https://chem.libretexts.org/Bookshelves/Analytical_Chemistry/Supplemental_Modules_(Analytical_Chemistry)/Quantifying_Nature/Significant_Digits/Propagation_of_Error
#   difference z = x - y : var_z = var_x + var_y
#   quotient   z = x / y : var_z = (var_x/x**2 + var_y/y**2) * z**2

# Mature RNA is total minus pre-mRNA, so its variance is the sum of the two variances
df['mat_level'] = df['tot_level'] - df['pre_level']
df['mat_var'] = df['pre_var'] + df['tot_var']

# Relative variance of the synthesis rate, the numerator of both rates
syn_rel_var = df['syn_var']/df['syn_rate']**2
mat_rel_var = df['mat_var']/df['mat_level']**2
pre_rel_var = df['pre_var']/df['pre_level']**2

# Decay rate = synthesis / mature level
df['deg_var'] = (syn_rel_var + mat_rel_var)*df['deg_rate']**2
# Processing rate = synthesis / pre-mRNA level
df['proc_var'] = (syn_rel_var + pre_rel_var)*df['proc_rate']**2

# Standard deviations are the square roots of the variances
for quantity in ('deg', 'syn', 'proc', 'tot', 'pre'):
    df[quantity + '_sd'] = np.sqrt(df[quantity + '_var'])
df.index.name = 'gene'

##### Calculating the largest half-life to report
- We will assume based on purification experiments with spike-in RNAs that the max achievable specificity is about 100X.  
- Therefore, as a rough estimate, we could expect an RNA with no transcription to be present at about 1/100 of the mature RNA level in the total RNA library
- We will use this estimate to calculate the upper limit of the half-life estimates

In [None]:
# Largest half-life to report, which corresponds to the lowest deg rate we can resolve
# Labeling time, in min
t_label = 20
# Assumed maximum achievable specificity of the purification (about 100X)
max_specificity = 100
# Lowest deg rate (per min): 1/100 of one labeling period
deg_rate = (1/t_label)/max_specificity
# Half-life for a first-order decay rate: ln(2)/rate
halflife = math.log(2)/deg_rate
print('largest halflife to report %1.0f' % halflife )
# This prints 1386; it is rounded to 1000 min to be conservative

In [None]:
# Convert the rates and their SDs to per min by dividing by 60.
# Work on a copy so that df keeps the unconverted values.
df2 = df.copy()
per_min_cols = ['syn_rate', 'deg_rate', 'proc_rate', 'syn_sd', 'deg_sd', 'proc_sd']
df2[per_min_cols] = df2[per_min_cols].div(60)

In [None]:
# Directory for the summary tables
outdir = '../Figures/summary_files'
os.makedirs(outdir, exist_ok = True)
# Half-life from the degradation rate (already converted to per min in df2): ln(2)/rate
df2['halflife'] = math.log(2)/df2['deg_rate']
# Cap the half-life at the largest value we report; a deg_rate of 0 gives inf, which is capped too
halflife_cap = 1000
df2['halflife_capped'] = df2['halflife'].clip(upper=halflife_cap)
# Calculate the stability percentile
# Note that this percentile has all the data, not only the pass filter genes
df2['stab_percentile'] = df2['halflife'].rank(pct=True)*100
# Look each gene up in the annotation db once and read both attributes from that one record
gene_attrs = [db[gene_id].attributes for gene_id in df2.index]
df2['biotype'] = [attrs['gene_biotype'][0] for attrs in gene_attrs]
df2['gene_name'] = [attrs['gene_name'][0] for attrs in gene_attrs]

In [None]:
# Write the summary file: annotation, half-lives, rates, levels and SDs, in this column order
cols_2_write = [
    'gene_name', 'biotype',
    'halflife', 'halflife_capped', 'stab_percentile',
    'syn_rate', 'proc_rate', 'deg_rate',
    'tot_level', 'pre_level',
    'syn_sd', 'proc_sd', 'deg_sd', 'tot_sd', 'pre_sd',
]
summary_path = os.path.join(outdir, 'INSPEcT_rates.csv')
df2.loc[:, cols_2_write].to_csv(summary_path)

In [None]:
# Write supplementary file for GEO -- rename column names to longer versions and remove halflife_capped
column_names = {'gene_name':'gene_symbol', 'stab_percentile':'stability_percentile', 'syn_rate':'synthesis_rate', 'proc_rate':'processing_rate', 'deg_rate':'degradation_rate', 'pre_level':'preRNA_TPM', 
                'tot_level':'totalRNA_TPM', 'syn_sd':'synthesis_sd', 'proc_sd':'processing_sd', 'deg_sd':'degradation_sd', 'tot_sd':'totalRNA_sd', 'pre_sd':'preRNA_sd'}
# The rename is done in place, so df2 carries the long column names from here on
df2.rename(columns=column_names, inplace=True)
df2.index.name = 'gene_ID'
# Same column order as the summary file, with the long names and without halflife_capped
cols_2_write2 = [column_names.get(col, col) for col in cols_2_write if col != 'halflife_capped']
# Create the output directory first, as is done for the other output directories
os.makedirs(geo_outdir, exist_ok=True)
df2[cols_2_write2].to_csv(os.path.join(geo_outdir, 'INSPEcT_rates.csv'))