### Prep summarize exps
- Summarize experiments for GEO
- The filtered.csv files have the rRNAs and spikeins removed and the TPMs recalculated -- these are the files that were used as input to INSPEcT
- Also output the count and TPM values for the rRNAs and spikeins

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np

sys.path.append('../scripts')
from plot_helpers import *

%load_ext autoreload
%autoreload 2

In [None]:
# Build one summary table per experiment for GEO.
# Each table starts from the filtered quantification (spikeins and rRNAs removed)
# and, for completeness, appends the original quantification of the removed genes,
# which are labelled in the 'gene_type' column.
result_dirs = {
    'brain_4sU': results_dir,
    'brain_incubation_test': results_dir_inctest,
    'BG3_ph_RNAi': results_dir_pherson,
}
for exp in result_dirs:
    quant_dir = os.path.join(result_dirs[exp], 'gene_quantification')
    unfilt = os.path.join(quant_dir, 'summary_abundance_by_gene.csv')
    filt = os.path.join(quant_dir, 'summary_abundance_by_gene_filtered.csv')
    df1 = pd.read_csv(unfilt)
    df2 = pd.read_csv(filt)
    # Genes present in the unfiltered table but absent from the filtered one
    removed_gene_IDs = set(df1['gene']) - set(df2['gene'])
    df_removed = df1[df1['gene'].isin(removed_gene_IDs)].copy()
    # IDs starting with SIRV or ERCC are spikeins; every other removed gene is labelled rRNA
    df_removed['gene_type'] = df_removed['gene'].apply(
        lambda gene_id: 'spikein' if gene_id.startswith(('SIRV', 'ERCC')) else 'rRNA'
    )
    # Rows coming from the filtered table get no 'gene_type' value from this step
    combo_df = pd.concat([df2, df_removed])
    out_path = os.path.join(geo_outdir, f'{exp}_summary_abundance.csv')
    combo_df.to_csv(out_path, index=False)

In [None]:
# Test that the non spike-in, non-rRNA genes (rows with a null 'gene_type')
# add up to TPM = 1M per sample in every exported table.
# Each experiment in result_dirs is checked explicitly instead of relying on the
# `exp` variable left over from the export loop, which only covered the last one.
# NOTE(review): assumes every exported table has 'replicate', 'RNAtype' and
# 'condition' columns -- confirm for all experiments.
for exp in result_dirs:
    df3 = pd.read_csv(os.path.join(geo_outdir, f'{exp}_summary_abundance.csv'))
    # One tag per replicate/RNAtype/condition combination, used to count the samples in the table
    df3['exp_tag'] = df3[['replicate', 'RNAtype', 'condition']].astype(str).apply(lambda x: '_'.join(x), axis=1)
    num_samples = df3['exp_tag'].nunique()
    # Sum over all samples, then divide by the sample count to get the per-sample total
    total_tpm = df3.loc[pd.isnull(df3['gene_type']), 'summed_tpm_recalc'].sum() / num_samples
    print('%s total TPM %1.2f' % (exp, total_tpm))