# Imports and data path

In [14]:
import glob
import os

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [15]:
# Input directory holding the aggregated fitness files, and the output directory
data = '../outputs/fitness/'
output = '../outputs/'

# Map the condition name used in file names to the name used in column labels
condition_map = {
    'Untreated': 'Untreated',
    'Cpd100': 'R0C100',
    'Cpd200': 'R0C200',
    'Rif7': 'R7C0',
    'Rif7Cpd100': 'R7C100',
}

# Process data 


1) Read in all aggregated CSVs from the different treatment conditions and combine them into the same dataframe

2) Calculate p-values 

3) Map back to gene names/products

In [16]:
# Collect every file in the data directory whose name contains 'Aggregated'
aggregated_pattern = f'{data}*Aggregated*'
file_list = glob.glob(aggregated_pattern)

In [17]:
# Read every aggregated fitness file and merge them into one dataframe
# indexed by Locus, with each column prefixed by its mapped condition name.


def _condition_label(path):
    """Return the mapped condition name encoded in the file name of `path`.

    The raw condition is the file-name text before the first underscore; it is
    translated through `condition_map` (a KeyError means an unexpected file).
    """
    # os.path.basename copes with both '/' and '\\' separators returned by glob
    return condition_map[os.path.basename(path).split('_')[0]]


def _load_condition(path):
    """Read one aggregated file: drop 'M', index by 'Locus', prefix the columns."""
    cond_df = pd.read_csv(path).drop(columns='M').set_index('Locus')
    label = _condition_label(path)
    cond_df.columns = [label + '_' + col for col in cond_df.columns]
    return cond_df


if not file_list:
    raise FileNotFoundError('No aggregated fitness files found in ' + data)

# glob returns matches in arbitrary order: put the untreated file first and
# sort the rest so the merged column order is reproducible between runs.
ordered_files = sorted(
    file_list,
    key=lambda path: (_condition_label(path) != 'Untreated', path),
)

# Start from the untreated file, then merge the remaining treatment conditions.
# The merge is on the Locus index with pandas' default (inner) join, so only
# loci present in every file are kept.
fitness_df = _load_condition(ordered_files[0])
for file in ordered_files[1:]:
    fitness_df = fitness_df.merge(_load_condition(file), left_index=True, right_index=True)

In [20]:
def _add_comparison(df, reference, treatment, label):
    """Add p/d/q/log10q columns comparing `treatment` against `reference`.

    Reads the '<cond>_W', '<cond>_SD' and '<cond>_Count' columns of both
    conditions from `df` and writes four new columns into `df` in place:

    - 'p_<label>':      t-test p-value computed from the summary statistics
                        (NaN replaced by 1)
    - 'd_<label>':      difference treatment_W - reference_W
    - 'q_<label>':      p-values corrected with the 'fdr_tsbky' method
    - 'log10q_<label>': -log10 of the corrected value
    """
    test_result = stats.ttest_ind_from_stats(
        df[reference + '_W'], df[reference + '_SD'], df[reference + '_Count'],
        df[treatment + '_W'], df[treatment + '_SD'], df[treatment + '_Count'])

    # Assign the filled column back instead of fillna(..., inplace=True) on a
    # column selection, which is chained assignment and does not reliably
    # modify the dataframe in recent pandas versions.
    # NaN p-values are set to 1 (presumably loci where the test could not be
    # computed; treated as non-significant).
    df['p_' + label] = test_result[1]
    df['p_' + label] = df['p_' + label].fillna(1)

    df['d_' + label] = df[treatment + '_W'] - df[reference + '_W']

    # Multiple-testing correction must use the p-values of THIS comparison
    corrected = multipletests(df['p_' + label], alpha=0.05, method='fdr_tsbky')
    df['q_' + label] = corrected[1]
    df['log10q_' + label] = -np.log10(df['q_' + label])


# Compare each treatment condition against untreated
for cond in ['R0C100', 'R0C200', 'R7C0', 'R7C100']:
    _add_comparison(fitness_df, 'Untreated', cond, cond)

# Compare the combination (R7C100) against RIF only (R7C0)
_add_comparison(fitness_df, 'R7C0', 'R7C100', 'R7C100vsR7C0')

In [21]:
# Load the ATCC annotation table and give its four columns fixed names
noteATCC = pd.read_excel('../ATCC annotations.xlsx')
noteATCC.columns = ['Locus', 'Contig', 'Gene', 'Product']

# Attach the annotations to the fitness results, matching rows on Locus
fitness_df = pd.merge(noteATCC, fitness_df, on=['Locus'])

# Write the annotated table to the output directory
fitness_df.to_csv(f'{output}aggregated_tn-seq.csv')