#### Inc Control, S1
- Look at the changes after 60 min of incubation
- If RNA levels are largely unchanged, this suggests it is OK to treat the incubation conditions as pseudo-steady-state

In [None]:
import os
import pandas as pd
import sys
import numpy as np
import scipy.stats as stats
import matplotlib.ticker as plticker

# Add the sibling scripts directory to the import path so the two project
# modules below can be imported.
sys.path.append('../scripts')
# NOTE(review): the wildcard import presumably supplies the names later cells
# use without defining them (plt, sns, dfig, out_fmt, out_dpi,
# results_dir_inctest) -- confirm against plot_helpers.
from plot_helpers import *
from utilities import load_dataset

%load_ext autoreload
%autoreload 2

# Directory that the figures saved by this notebook are written to;
# created (including parents) if it does not exist yet.
outdir = '../Figures/IncTest'
os.makedirs(outdir, exist_ok = True)

In [None]:
# Load the per-gene abundance table. The _filtered.csv file already has the
# spike-ins and rRNA genes removed.
res_file = os.path.join(results_dir_inctest, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv')
df = load_dataset(res_file, '../Figures/summary_files/brainInc_passed.csv')

# Gene CPM: each gene's counts scaled to one million over the total counts of
# its replicate/condition/RNAtype group.
library_totals = df.groupby(['replicate', 'condition', 'RNAtype'])['summed_est_counts'].transform('sum')
df['CPM'] = df['summed_est_counts'] * 1e6 / library_totals

# Average every numeric column within each gene/condition/RNAtype group.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises
# on non-numeric columns instead of silently dropping them; it changes nothing
# if all remaining columns are already numeric.
mean_df = df.groupby(['gene', 'condition', 'RNAtype']).mean(numeric_only=True)

# log10 of a zero TPM is -inf. Convert infinities to NaN and drop those rows
# here, per (gene, condition, RNAtype) row, so that removing a gene in one
# experiment does not remove it from the others.
# The replace/dropna results are assigned back rather than applied with
# inplace=True on a column selection: that chained form does not update
# mean_df when pandas Copy-on-Write is active.
mean_df['log_tpm'] = np.log10(mean_df['summed_tpm_recalc'])
mean_df['log_tpm'] = mean_df['log_tpm'].replace([np.inf, -np.inf], np.nan)
mean_df = mean_df.dropna(subset=['log_tpm'], how='any')

In [None]:
# Build a side-by-side table of the 0 min and 60 min mock-incubated input
# libraries: one row per remaining index entry, with the 0 min values suffixed
# '_0min' and the 60 min values suffixed '_60min'.
value_cols = ['CPM', 'summed_tpm_recalc', 'log_tpm']
idx = pd.IndexSlice
df_0 = mean_df.loc[idx[:, '0mock', 'input'], value_cols].copy()
df_60 = mean_df.loc[idx[:, '60mock', 'input'], value_cols].copy()

# Move 'condition' out of the index so the two tables align on the remaining
# index levels when merged.
left = df_0.reset_index('condition')
right = df_60.reset_index('condition')
comp_df = left.merge(right, left_index=True, right_index=True, suffixes=('_0min', '_60min'))
comp_df = comp_df.dropna(subset=['summed_tpm_recalc_0min', 'summed_tpm_recalc_60min'], how='any')

# Keep only genes whose mean CPM is at least 1 in both conditions.
expressed_0min = comp_df['CPM_0min'] >= 1
expressed_60min = comp_df['CPM_60min'] >= 1
comp_df = comp_df[expressed_0min & expressed_60min].copy()

In [None]:
# Scatter plot of the correlation between the 0 min and 60 min incubation
# libraries (mean log10 TPM per gene).
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
# Select the rows labelled 'input' on the second index level of comp_df.
x = comp_df.loc[pd.IndexSlice[:, 'input'], 'log_tpm_0min']
y = comp_df.loc[pd.IndexSlice[:, 'input'], 'log_tpm_60min']
ax.scatter(x, y, s=2, color='k', alpha=0.3, ec='none')

# Squared Pearson correlation, shown on the plot; r2_val_av is also reused by
# the 2D-histogram cell further down.
rval, pval = stats.pearsonr(x, y)
r2_val_av = rval**2

# Major ticks every 5 units on both axes. Each axis gets its own locator
# instance: a Locator stores a reference to the Axis it is attached to, so one
# instance must not be shared between the x and y axis.
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
ax.yaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
ax.text(0.1, 0.9, 'r$^2$ = %1.2f' % r2_val_av, fontsize=8, transform=ax.transAxes)
ax.set_xlabel('0 min inc. RNA level\n (log$_{10}$ TPM)')
ax.set_ylabel('60 min inc. RNA level\n (log$_{10}$ TPM)')
plt.savefig('%s.%s' % (os.path.join(outdir, '60vs0_scatter'), out_fmt), dpi=out_dpi)

In [None]:
# 2D-histogram version of the 0 min vs. 60 min comparison (mean log10 TPM per
# gene), for showing point density.
fig = plt.figure(figsize=(dfig, dfig), constrained_layout=True)
ax = fig.add_subplot(111)
# Select the rows labelled 'input' on the second index level of comp_df.
x = comp_df.loc[pd.IndexSlice[:, 'input'], 'log_tpm_0min']
y = comp_df.loc[pd.IndexSlice[:, 'input'], 'log_tpm_60min']
ax = sns.histplot(x=x, y=y, cmap='rocket', ax=ax, zorder=2)

# Major ticks every 5 units on both axes. Each axis gets its own locator
# instance: a Locator stores a reference to the Axis it is attached to, so one
# instance must not be shared between the x and y axis.
ax.xaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
ax.yaxis.set_major_locator(plticker.MultipleLocator(base=5.0))
# r2_val_av is the squared Pearson r computed in the scatter-plot cell, which
# must have been run first.
ax.text(0.1, 0.9, 'r$^2$ = %1.2f' % r2_val_av, fontsize=8, transform=ax.transAxes)
ax.set_xlabel('0 min inc. RNA level\n (log$_{10}$ TPM)')
ax.set_ylabel('60 min inc. RNA level\n (log$_{10}$ TPM)')
plt.savefig('%s.%s' % (os.path.join(outdir, '60vs0_histscatter'), out_fmt), dpi=out_dpi)
