### Prep filter low exp
- Filter libraries to remove genes with low expression levels before further analysis

In [None]:
#Imports
import sys
import os
import pandas as pd
import seaborn as sns
import numpy as np
import math
import scipy.stats as stats
import gffutils

# Add the sibling scripts directory to the module search path so the local
# helper modules below can be imported from this notebook.
sys.path.append('../scripts')
from plot_helpers import *
from utilities import filter_low_exp, load_dataset

# NOTE(review): gffutils_db is not defined anywhere in this notebook; presumably
# it is supplied by the star import from plot_helpers -- confirm.
db = gffutils.FeatureDB(gffutils_db)

%load_ext autoreload
%autoreload 2

#### Choice of filtering level for the experiments
- Set the filtering level to 10 counts in at least 2/3 of libraries in one condition/replicate set
- For most experimental sets, npass=2 because there were three replicates
- For the ph/mock BG3 data, npass=4 for input mock because there were 6 replicates

In [None]:
# Collect the genes that pass the read-count filter for each experiment.
# NOTE(review): results_dir, results_dir_inctest and results_dir_pherson are not
# defined in this notebook; presumably they come from the plot_helpers star
# import -- confirm.
outdir = '../Figures/summary_files'
os.makedirs(outdir, exist_ok=True)

# Exp1 (Brain4sU): input and pd libraries, condition 1
res_file1 = os.path.join(
    results_dir, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv'
)
exp1 = [
    {'RNAtype': 'input', 'condition': 1, 'npass': 2},
    {'RNAtype': 'pd', 'condition': 1, 'npass': 2},
]
passed_genes1 = filter_low_exp(
    res_file1,
    filter_by='count',
    filter_co=10,
    experiments=exp1,
    outname=os.path.join(outdir, 'brain4sU'),
)

# Exp2 (BrainInc): input libraries only, 0mock and 60mock conditions
res_file2 = os.path.join(
    results_dir_inctest, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv'
)
exp2 = [
    {'RNAtype': 'input', 'condition': '0mock', 'npass': 2},
    {'RNAtype': 'input', 'condition': '60mock', 'npass': 2},
]
passed_genes2 = filter_low_exp(
    res_file2,
    filter_by='count',
    filter_co=10,
    experiments=exp2,
    outname=os.path.join(outdir, 'brainInc'),
)

# Exp3 (BG3 Ph RNAi): npass is 4 for input mock, 2 for every other set
res_file3 = os.path.join(
    results_dir_pherson, 'gene_quantification', 'summary_abundance_by_gene_filtered.csv'
)
exp3 = [
    {'RNAtype': 'input', 'condition': 'mock', 'npass': 4},
    {'RNAtype': 'input', 'condition': 'ph', 'npass': 2},
    {'RNAtype': 'pd', 'condition': 'mock', 'npass': 2},
    {'RNAtype': 'pd', 'condition': 'ph', 'npass': 2},
]
passed_genes3 = filter_low_exp(
    res_file3,
    filter_by='count',
    filter_co=10,
    experiments=exp3,
    outname=os.path.join(outdir, 'BG3'),
)

In [None]:
#What is the CPM for the lowest expression passed genes?
def get_cpm_co(df, read_col):
    '''Print the equivalent CPM cutoff for the read cutoff applied for that dataset.

    CPM is computed per library (one replicate/condition/RNAtype combination) as
    read_col * 1e6 / sum(read_col) within that library. CPM values are then
    averaged per gene/condition/RNAtype, the highest of those means is taken for
    each gene, and the smallest of these per-gene maxima is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must be groupable by 'replicate', 'condition' and 'RNAtype' (columns or
        index levels), and must expose 'gene' as a column after reset_index().
        The frame is not modified.
    read_col : str
        Name of the column holding the read counts.

    Returns
    -------
    None
        The result is printed, not returned.
    '''
    # Total reads of the library each row belongs to, aligned to df's rows
    library_totals = df.groupby(['replicate', 'condition', 'RNAtype'])[read_col].transform('sum')
    # assign() returns a new frame, so the caller's DataFrame is left untouched
    with_cpm = df.assign(CPM=df[read_col] * 1e6 / library_totals)
    # Mean CPM across replicates for each gene within each condition/RNAtype
    mean_cpm = with_cpm.reset_index().groupby(['gene', 'condition', 'RNAtype'])['CPM'].mean()
    # Highest mean per gene, then the lowest of those across all genes
    min_cpm_passed = mean_cpm.groupby('gene').max().min()
    print('min cpm passed gene', f'{min_cpm_passed}')

In [None]:
# Equivalent CPM cutoff for Exp1 (Brain4sU)
brain4sU_passed_file = os.path.join(outdir, 'brain4sU_passed.csv')
get_cpm_co(load_dataset(res_file1, brain4sU_passed_file), 'summed_est_counts')

In [None]:
# Equivalent CPM cutoff for Exp2 (BrainInc)
brainInc_passed_file = os.path.join(outdir, 'brainInc_passed.csv')
get_cpm_co(load_dataset(res_file2, brainInc_passed_file), 'summed_est_counts')

In [None]:
# Equivalent CPM cutoff for Exp3 (BG3 Ph RNAi)
BG3_passed_file = os.path.join(outdir, 'BG3_passed.csv')
get_cpm_co(load_dataset(res_file3, BG3_passed_file), 'summed_est_counts')