### Prep filter low exp
- Filter libraries to remove genes with low expression levels before further analysis

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
import gffutils

# Put the project's ../scripts directory on the import path so the helper
# modules below can be imported from this notebook.
sys.path.append('../scripts')
from plot_helpers import *
from utilities import filter_low_exp, load_dataset, calc_pseudocount_val

# NOTE(review): gffutils_db is not defined in this notebook; presumably it is
# provided by the star-import of plot_helpers above -- confirm.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

#### Choice of filtering level for the experiments
- Set the filtering level to 10 counts in at least 2/3 of libraries in one condition/replicate set
- For most experimental sets, npass=2 because there were three replicates
- For the ph/mock BG3 data, npass=4 for input mock because there were 6 replicates

In [None]:
# Determine, for each experiment, which genes pass the read-count filter
outdir = '../Figures/summary_files'
os.makedirs(outdir, exist_ok=True)

# Shared across all experiments: where the quantification table sits inside a
# results directory, and the count column / cutoff given to filter_low_exp.
_quant_relpath = ('gene_quantification', 'summary_abundance_by_gene_filtered.csv')
_count_filter = {'filter_col': 'summed_est_counts', 'filter_co': 10}

# Exp1 (Brain4sU)
res_file1 = os.path.join(results_dir, *_quant_relpath)
exp1 = [
    {'RNAtype': 'input', 'condition': 1, 'npass': 2},
    {'RNAtype': 'pd', 'condition': 1, 'npass': 2},
]
passed_genes1 = filter_low_exp(
    res_file1,
    **_count_filter,
    experiments=exp1,
    outname=os.path.join(outdir, 'brain4sU'),
)

# Brain incubation control experiment: genes are filtered separately for
# a) 60mock vs. 0mock and b) 60foursu vs. 0mock.
# Sharing one passed-gene list between the two comparisons could put genes on a
# plot that have very low counts in both plotted samples, because they only
# passed thanks to a sample that is not part of that comparison.
res_file2 = os.path.join(results_dir_inctest, *_quant_relpath)

# Exp2a (BrainInc 60mock vs. 0mock)
exp2a = [
    {'RNAtype': 'input', 'condition': '0mock', 'npass': 2},
    {'RNAtype': 'input', 'condition': '60mock', 'npass': 2},
]
passed_genes2a = filter_low_exp(
    res_file2,
    **_count_filter,
    experiments=exp2a,
    outname=os.path.join(outdir, 'brainInc_mock'),
)

# Exp2b (BrainInc 60foursu vs. 0mock)
exp2b = [
    {'RNAtype': 'input', 'condition': '0mock', 'npass': 2},
    {'RNAtype': 'input', 'condition': '60foursu', 'npass': 2},
]
passed_genes2b = filter_low_exp(
    res_file2,
    **_count_filter,
    experiments=exp2b,
    outname=os.path.join(outdir, 'brainInc_foursu'),
)

# Exp3 (BG3 Ph RNAi); input mock uses npass=4, the other sets npass=2
res_file3 = os.path.join(results_dir_pherson, *_quant_relpath)
exp3 = [
    {'RNAtype': 'input', 'condition': 'mock', 'npass': 4},
    {'RNAtype': 'input', 'condition': 'ph', 'npass': 2},
    {'RNAtype': 'pd', 'condition': 'mock', 'npass': 2},
    {'RNAtype': 'pd', 'condition': 'ph', 'npass': 2},
]
passed_genes3 = filter_low_exp(
    res_file3,
    **_count_filter,
    experiments=exp3,
    outname=os.path.join(outdir, 'BG3'),
)

In [None]:
# Pseudocount value used when making scatterplots across the datasets,
# computed per experiment from the means by experiment.
val_col = 'summed_tpm_recalc'
groupbylist = ['gene', 'condition', 'RNAtype']
_pseudo_kwargs = {'val_col': val_col, 'frac': 0.1, 'groupby': groupbylist}

# Spike-ins and rRNA genes are already removed from the _filtered.csv files.

# Exp1 (Brain4sU)
df1 = load_dataset(res_file1, '../Figures/summary_files/brain4sU_passed.csv')
df1 = df1.reset_index()
ps1 = calc_pseudocount_val(df1, **_pseudo_kwargs)

# Exp2a (BrainInc 60mock vs. 0mock)
df2a = load_dataset(res_file2, '../Figures/summary_files/brainInc_mock_passed.csv')
df2a = df2a.reset_index()
ps2a = calc_pseudocount_val(df2a, **_pseudo_kwargs)

# Exp2b (BrainInc 60foursu vs. 0mock)
df2b = load_dataset(res_file2, '../Figures/summary_files/brainInc_foursu_passed.csv')
df2b = df2b.reset_index()
ps2b = calc_pseudocount_val(df2b, **_pseudo_kwargs)

# Exp3 (BG3 Ph RNAi)
df3 = load_dataset(res_file3, '../Figures/summary_files/BG3_passed.csv')
df3 = df3.reset_index()
ps3 = calc_pseudocount_val(df3, **_pseudo_kwargs)


In [None]:
#What is the CPM of the lowest-expressed genes that passed the filter?
def get_cpm_co(df, read_col):
    '''Print the equivalent CPM cutoff for the read cutoff applied for that dataset.

    The counts in ``read_col`` are converted to counts per million within each
    (replicate, condition, RNAtype) group, averaged per
    (gene, condition, RNAtype), and the largest of those means is taken for
    each gene. The smallest of the resulting per-gene values is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``read_col``. 'replicate', 'condition' and 'RNAtype'
        must be usable as groupby keys, and 'gene' must be a column or an
        index level. The frame is left unmodified.
    read_col : str
        Name of the column containing the counts to convert.

    Returns
    -------
    None
        The value is only printed.
    '''
    # Total counts of the group each row belongs to, aligned row-by-row with df
    library_totals = df.groupby(['replicate', 'condition', 'RNAtype'])[read_col].transform('sum')
    cpm = df[read_col] * 1e6 / library_totals

    # assign() builds a new frame, so the caller's df does not gain a 'CPM' column
    with_cpm = df.assign(CPM=cpm).reset_index()
    mean_cpm = with_cpm.groupby(['gene', 'condition', 'RNAtype'])['CPM'].mean()

    # Best (highest) mean CPM per gene, then the lowest of those across genes
    min_cpm_passed = mean_cpm.groupby('gene').max().min()
    print('min cpm passed gene', f'{min_cpm_passed}')

In [None]:
# Exp1 (Brain4sU): print the CPM cutoff for res_file1 loaded with brain4sU_passed.csv
get_cpm_co(load_dataset(res_file1, os.path.join(outdir,'brain4sU_passed.csv')), 'summed_est_counts')

In [None]:
# Exp2a (BrainInc mock): print the CPM cutoff for res_file2 loaded with brainInc_mock_passed.csv
get_cpm_co(load_dataset(res_file2, os.path.join(outdir,'brainInc_mock_passed.csv')), 'summed_est_counts')

In [None]:
# Exp2b (BrainInc foursu): print the CPM cutoff for res_file2 loaded with brainInc_foursu_passed.csv
get_cpm_co(load_dataset(res_file2, os.path.join(outdir,'brainInc_foursu_passed.csv')), 'summed_est_counts')

In [None]:
# Exp3 (BG3): print the CPM cutoff for res_file3 loaded with BG3_passed.csv
get_cpm_co(load_dataset(res_file3, os.path.join(outdir,'BG3_passed.csv')), 'summed_est_counts')