In [1]:
from matplotlib import pyplot as pl
import numpy as np
import seaborn as sns
from fitness_assay_debug import inferFitness, inverseVarAve
import pandas as pd
from collections import defaultdict
from matplotlib.collections import LineCollection

## Importing BFA files

In [2]:
dbfa2 = pd.read_csv('../Final_Count_Pipeline/BFA_data/Combined_Counts/dBFA2_counts.csv')
hbfa1 = pd.read_csv('../Final_Count_Pipeline/BFA_data/Combined_Counts/hBFA1_counts.csv')
hbfa2 = pd.read_csv('../Final_Count_Pipeline/BFA_data/Combined_Counts/hBFA2_counts.csv')
nd = {'dBFA2': dbfa2, 'hBFA1': hbfa1, 'hBFA2': hbfa2}

## Labeling putative environments

In [3]:
env_file = '../env_bc_scratch/Environment_BC_calls_Feb_2018_simple.csv'
ed = {i[0]: i[1] for i in pd.read_csv(env_file).as_matrix(['Environment.BC', 'Putative.Environment'])}
for n in nd:
    td = nd[n]
    td['Putative.Environment'] = td.apply(lambda row: ed.setdefault(row['Environment.BC'], 'unknown'), axis=1)

## For dbfa2, I will use the subpool info for the neutral barcodes, which is not in this combined file, but is in the first harvard lane. 'Ancestor_YPD_2N_R1_1' is at about 4 times the frequency of 'Ancestor_YPD_2N_R1_2' and there are ~180 bcs in each

In [4]:
d_w_pools = pd.read_csv('../Final_Count_Pipeline/BFA_data/dBFA2_Harvard_1/dBFA2_Harvard_1_bc_counts_clustered.csv')
d_w_pools['Full.BC'] = d_w_pools['Diverse.BC'] + d_w_pools['Environment.BC']
anc_ypd_1 = d_w_pools.loc[d_w_pools['Ancestor_YPD_2N_R1_1']>5]['Full.BC']
anc_ypd_2 = d_w_pools.loc[d_w_pools['Ancestor_YPD_2N_R1_2']>5]['Full.BC']

In [5]:
putative_neuts = dict()
putative_neuts['hBFA1'] = list(hbfa1.loc[hbfa1['Putative.Environment'] == 'YPD_alpha']['Full.BC'])
putative_neuts['hBFA2'] = list(hbfa2.loc[hbfa2['Putative.Environment'] == 'CLM_2N']['Full.BC'])
putative_neuts['dBFA2'] = list(dbfa2.loc[dbfa2['Full.BC'].isin(anc_ypd_1)]['Full.BC'])

for n in putative_neuts:
    print(n, len(putative_neuts[n]))

hBFA1 302
hBFA2 277
dBFA2 177


## Reading timepoint exclusion info from file

In [6]:
tp_exclusion = pd.read_csv('bfa_timepoint_exclusion_list.csv')
tp_ex = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) # dict like tp_ex[bfa_name][env_name][replicate] = list of excluded tps
for row in tp_exclusion.as_matrix(['ASSAY', 'ENV', 'REP', 'TIME']):
    tp_ex[row[0]][row[1]][row[2]] = row[3].split(';')
print('Example', tp_ex['hBFA1']['FLC4'])

Example defaultdict(<class 'list'>, {'R2': ['16']})


In [14]:
fit_data = defaultdict(dict)
c_times = [1, 2, 3, 4, 5]
for bfa_name in nd:
    td = nd[bfa_name]
    bcs = list(td['Full.BC'])
    envs = set([i.split('-')[1] for i in td.columns if 'Time' in i and i.split('-')[1] not in ['Pre', 'T0_Pool']])
    for env in envs:
        read_dat = dict()
        reps = sorted(set([i.split('-')[2] for i in td.columns if 'Time' in i and env in i]))
        for rep in reps:
            if not 'EXCLUDE ALL' in tp_ex[bfa_name][env][rep]:
                excluded_tps = tp_ex[bfa_name][env][rep]
                tps = [bfa_name + '-' + env + '-' + rep + '-Time' + str(i*8) for i in c_times]
                # to exclude timepoints I will just zero out the counts so they will be caught by the low coverage thresh
                for tp in tps:
                    if tp[tp.index('Time')+4:] in excluded_tps:
                        td[tp] = np.zeros(len(td))
                tmp_read_dat = np.nan_to_num(td.as_matrix([bfa_name + '-' + env + '-' + rep + '-Time' + str(i*8) for i in c_times]))
                included_tps = [i for i in c_times if np.sum(tmp_read_dat, axis=0)[i-1] > 1e5]
                if len(included_tps) < 2:
                    print(bfa_name, env, rep, 'not enough tps')
                else:
                    print(bfa_name, env, rep, 'included tps:', included_tps)
                    read_dat[rep] = tmp_read_dat
        if len(read_dat) > 0:
            fit_data[bfa_name][env] = inferFitness(bcs, c_times, read_dat, outputFolder = 'test_Atish_out/', 
                                              experimentName = bfa_name+'-'+env+'-', lowCoverageThresh=1e5, 
                                              neutralBarcodes=putative_neuts[bfa_name])


dBFA2 YPD R1 included tps: [1, 2, 3, 4, 5]
dBFA2 YPD R2 included tps: [1, 2, 3, 4, 5]
dBFA2 YPD R3 included tps: [1, 2, 3, 4, 5]


  zScores = zScores*np.power(expectedReads,-0.5)
  zScores = zScores*np.power(expectedReads,-0.5)
  allTimeFitness = np.log(allReads[repName][:,1:]/totReads[1:])-np.log(allReads[repName][:,0:-1]/totReads[0:-1])
  allTimeFitness = np.log(allReads[repName][:,1:]/totReads[1:])-np.log(allReads[repName][:,0:-1]/totReads[0:-1])
  allTimeErrors = np.sqrt(np.power(allReads[repName][:,1:]/kappas,-1)
  weightedMeans = np.sum(meanVals*np.power(standardDevs,-2),axis=1)/np.sum(np.power(standardDevs,-2),axis=1)
  weightedStandardDevs = np.power(np.sum(np.power(standardDevs,-2),axis=1),-0.5)


dBFA2 37C_Stan R1 included tps: [1, 2]
dBFA2 37C_Stan R2 included tps: [1, 2, 3, 4]
dBFA2 37C_Stan R3 included tps: [1, 2, 3, 4]
dBFA2 FLC4 R1 not enough tps
dBFA2 FLC4 R2 included tps: [1, 2]
dBFA2 FLC4 R3 included tps: [1, 2]
dBFA2 SC R1 included tps: [1, 3, 4, 5]
dBFA2 SC R2 included tps: [1, 2, 3, 4]
dBFA2 SC R3 included tps: [1, 2, 3, 4, 5]
dBFA2 CLM R1 included tps: [1, 2]
dBFA2 CLM R2 included tps: [1, 2]
dBFA2 CLM R3 included tps: [1, 2]
dBFA2 21C R1 included tps: [1, 2, 3, 4]
dBFA2 21C R2 included tps: [1, 4, 5]
dBFA2 21C R3 included tps: [1, 2, 3, 5]
dBFA2 pH7_3 R1 included tps: [1, 2, 3, 4, 5]
dBFA2 pH7_3 R2 included tps: [1, 2, 3, 4, 5]
dBFA2 pH7_3 R3 included tps: [1, 2, 3, 4, 5]
dBFA2 pH3_8 R1 included tps: [2, 3, 5]
dBFA2 pH3_8 R2 included tps: [2, 4, 5]
dBFA2 pH3_8 R3 included tps: [1, 2, 3, 5]
dBFA2 GlyEtOH R1 included tps: [1, 2, 3, 4, 5]
dBFA2 GlyEtOH R2 included tps: [1, 2, 3]
dBFA2 GlyEtOH R3 included tps: [1, 2]
dBFA2 48Hr R1 not enough tps
dBFA2 48Hr R3 not enoug

## For each barcode, I will take both the inverse variance weighted average and an unweighted average of s measurements:

In [15]:
fdat_short = dict()
for bfa_name in nd:
    td = nd[bfa_name]
    td.sort_values(by='Full.BC', inplace=True)
    td['putative_neutral'] = td['Full.BC'].isin(putative_neuts[bfa_name])
    for env in fit_data[bfa_name]:
        tmp = fit_data[bfa_name][env][0]
        reps = [i for i in tmp.keys() if 'R' in i]
        s_aves = np.array([tmp[r]['aveFitness'] for r in reps]).T
        s_errs = np.array([tmp[r]['aveError'] for r in reps]).T
        tmp['iva_s'], tmp['iva_s_err'] = inverseVarAve(s_aves, s_errs)
        tmp['ave_s'] = np.mean(s_aves, axis=1)
        tmp['ave_err'] = np.power(np.mean(np.power(s_errs, 2),axis=1), 0.5)
        for r in reps:
            td[env + '-' + r + '-aveFitness'] = tmp[r]['aveFitness']
            td[env + '-' + r + '-aveError'] = tmp[r]['aveError']
        for key in ['iva_s', 'iva_s_err', 'ave_s', 'ave_err']:
            td[env + '-' + key] = tmp[key]
        td['used_as_neutral_in_' + env] = fit_data[bfa_name][env][1]

  weightedStandardDevs = np.power(np.sum(np.power(standardDevs,-2),axis=1),-0.5)


In [25]:
for bfa_name in nd:
    td = nd[bfa_name]
    cols = ['Full.BC', 'Diverse.BC', 'Environment.BC', 'Putative.Environment'] + [i for i in td.columns if 'ave' in i or 'iva' in i or 'neutral' in i]
    td.to_csv('03_07_18_fitness_estimates/' + bfa_name + '_03_07_18_s.csv', index=False)

KeyboardInterrupt: 

## Adding frequency information for plotting later:

In [17]:
for n in nd:
    td = nd[n]
    all_tps = [i for i in td.columns if 'Time' in i and 'logfreq' not in i]
    for tp in all_tps:
        tmp_sum = sum(td[tp])
        td[tp + '.logfreq.plot'] = np.log10(np.clip((td[tp] / tmp_sum), 10**(-6), 1))

# Plotting s correlations between replicates

In [18]:
for bfa_name in nd:
    for env in fit_data[bfa_name]:
        tmp = fit_data[bfa_name][env][0]
        reps = [i for i in tmp.keys() if 'R' in i]
        if len(reps) > 1:
            fig, subps = pl.subplots(len(reps), len(reps), figsize=(5*len(reps), 5*len(reps)), sharex=True, sharey=True)
            mn = max([i for i in tmp['ave_s']/8 if i < 10])
            mx = min([i for i in tmp['ave_s']/8 if i > -10])
            for i in range(len(reps)):
                for j in range(len(reps)):
                    subps[i][j].plot([mn, mx], [mn, mx], linestyle='dashed', color='k', alpha=0.5)
                    if i == j:
                        subps[i][j].scatter(tmp[reps[i]]['aveFitness']/8, tmp['ave_s']/8)
                        subps[i][j].set_xlabel(reps[i])
                        subps[i][j].set_ylabel('replicate average')
                    else:
                        subps[i][j].scatter(tmp[reps[i]]['aveFitness']/8, tmp[reps[j]]['aveFitness']/8)
                        subps[i][j].set_xlabel(reps[i])
                        subps[i][j].set_ylabel(reps[j])
                    
                    
            fig.savefig('prelim_graphs/s_correlations/' + bfa_name + '-' + env + '_unweighted_ave.png')
            pl.close("all")   

In [19]:
for bfa_name in nd:
    for env in fit_data[bfa_name]:
        tmp = fit_data[bfa_name][env][0]
        reps = [i for i in tmp.keys() if 'R' in i]
        if len(reps) > 1:
            fig, subps = pl.subplots(len(reps), len(reps), figsize=(5*len(reps), 5*len(reps)), sharex=True, sharey=True)
            mn = max([i for i in tmp['iva_s']/8 if i < 10])
            mx = min([i for i in tmp['iva_s']/8 if i > -10])
            for i in range(len(reps)):
                for j in range(len(reps)):
                    subps[i][j].plot([mn, mx], [mn, mx], linestyle='dashed', color='k', alpha=0.5)
                    if i == j:
                        subps[i][j].scatter(tmp[reps[i]]['aveFitness']/8, tmp['iva_s']/8)
                        subps[i][j].set_xlabel(reps[i])
                        subps[i][j].set_ylabel('replicate average')
                    else:
                        subps[i][j].scatter(tmp[reps[i]]['aveFitness']/8, tmp[reps[j]]['aveFitness']/8)
                        subps[i][j].set_xlabel(reps[i])
                        subps[i][j].set_ylabel(reps[j])
                    
                    
            fig.savefig('prelim_graphs/s_correlations/' + bfa_name + '-' + env + '_iva.png')
            pl.close("all")  

In [21]:
bfa_name = 'dBFA2'    
td = nd[bfa_name]
fig, subps = pl.subplots(6, 2, figsize=(16, 20))
envs = [i for i in fit_data[bfa_name]]
c = 0
bc_list = sorted(list(nd[bfa_name]['Full.BC']))
for subarr in subps:
    for sub in subarr:
        if c < len(envs):
            env = envs[c]
            bcs_inferred_neut = [bc_list[i] for i in range(len(bc_list)) if fit_data[bfa_name][env][1][i]]
            c += 1
            tmp = fit_data[bfa_name][env][0]
            use_s = [tmp['iva_s'][i] for i in range(len(bc_list)) if bc_list[i] in list(anc_ypd_1)]
            use_s_2 = [tmp['ave_s'][i] for i in range(len(bc_list)) if bc_list[i] in list(anc_ypd_1)]
            #use_s_2 = [tmp['iva_s'][i] for i in range(len(bc_list)) if bc_list[i] in list(anc_ypd_2)]
            #use_s_2 = [tmp['ave_s'][i] for i in range(len(bc_list)) if bc_list[i] in list(bcs_inferred_neut)]
            #sub.hist(np.clip(np.nan_to_num(tmp['ave_s'])/8, -0.05, 0.05), bins=50, color='k', alpha=0.5, normed=True)
            sub.hist(np.clip(np.nan_to_num(use_s)/8, -0.05, 0.05), bins=50, color='r', alpha=0.5, normed=True)
            sub.hist(np.clip(np.nan_to_num(use_s_2)/8, -0.05, 0.05), bins=50, color='y', alpha=0.5, normed=True)
            sub.set_title(env, y=0.8, x=0.15)
fig.savefig('prelim_graphs/neut_s_distribs/dBFA2_neut_s_distrib_.png')

for r in ['R1', 'R2', 'R3']:
    fig, subps = pl.subplots(6, 2, figsize=(16, 20))
    c = 0
    for subarr in subps:
        for sub in subarr:
            if c < len(envs):
                env = envs[c]
                bcs_inferred_neut = [bc_list[i] for i in range(len(bc_list)) if fit_data[bfa_name][env][1][i]]
                c += 1
                tmp = fit_data[bfa_name][env][0]
                if r in tmp:
                    use_s = [tmp[r]['aveFitness'][i] for i in range(len(bc_list)) if bc_list[i] in list(anc_ypd_1)]
                    sub.hist(np.clip(np.nan_to_num(use_s)/8, -0.05, 0.05), bins=50, color='r', alpha=0.5, normed=True)
                    sub.set_title(env, y=0.8, x=0.15)
    fig.savefig('prelim_graphs/neut_s_distribs/by_rep/dBFA2_' + r + '_neut_s_distrib_.png')
    pl.close("all")

In [22]:
for bfa_name in ['hBFA1', 'hBFA2' ]:   
    td = nd[bfa_name]
    fig, subps = pl.subplots(5, 2, figsize=(16, 20))
    envs = [i for i in fit_data[bfa_name]]
    c = 0
    bc_list = sorted(list(nd[bfa_name]['Full.BC']))
    for subarr in subps:
        for sub in subarr:
            if c < len(envs):
                env = envs[c]
                c += 1
                tmp = fit_data[bfa_name][env][0]
                use_s = [tmp['iva_s'][i] for i in range(len(bc_list)) if bc_list[i] in putative_neuts[bfa_name]]
                use_s_2 = [tmp['ave_s'][i] for i in range(len(bc_list)) if bc_list[i] in putative_neuts[bfa_name]]
                sub.hist(np.clip(np.nan_to_num(use_s)/8, -0.05, 0.05), bins=50, color='r', alpha=0.5, normed=True)
                sub.hist(np.clip(np.nan_to_num(use_s_2)/8, -0.05, 0.05), bins=50, color='y', alpha=0.5, normed=True)
                sub.set_title(env, y=0.8, x=0.15)
    fig.savefig('prelim_graphs/neut_s_distribs/' + bfa_name + '_neut_s_distrib.png')
    
    reps = ['R1', 'R2', 'R3']
    if bfa_name == 'hBFA1':
        reps = ['R1', 'R2']
    for r in reps:
        fig, subps = pl.subplots(5, 2, figsize=(16, 20))
        c = 0
        for subarr in subps:
            for sub in subarr:
                if c < len(envs):
                    env = envs[c]
                    bcs_inferred_neut = [bc_list[i] for i in range(len(bc_list)) if fit_data[bfa_name][env][1][i]]
                    c += 1
                    tmp = fit_data[bfa_name][env][0]
                    if r in tmp:
                        use_s = [tmp[r]['aveFitness'][i] for i in range(len(bc_list)) if bc_list[i] in putative_neuts[bfa_name]]
                        sub.hist(np.clip(np.nan_to_num(use_s)/8, -0.05, 0.05), bins=50, color='r', alpha=0.5, normed=True)
                        sub.set_title(env, y=0.8, x=0.15)
        fig.savefig('prelim_graphs/neut_s_distribs/by_rep/' + bfa_name + '_' + r + '_neut_s_distrib_.png')
        pl.close("all")
    

In [23]:
for bfa_name in nd:   
    td = nd[bfa_name]
    envs = [i for i in fit_data[bfa_name]]
    fig, subps = pl.subplots(len(envs), 3, figsize=(16, 4*len(envs)))
    c = 0
    bc_list = sorted(list(td['Full.BC']))
    for e in range(len(envs)):
        bcs_inferred_neut = [bc_list[i] for i in range(len(bc_list)) if fit_data[bfa_name][envs[e]][1][i]]
        td_highlight = td.loc[td['Full.BC'].isin(bcs_inferred_neut)]
        td_given_neuts = td.loc[td['Full.BC'].isin(putative_neuts[bfa_name])]
        for r in range(3):
            if not 'EXCLUDE ALL' in tp_ex[bfa_name][envs[e]]['R'+str(r+1)]:
                excluded_tps = [bfa_name + '-' + envs[e] + '-R' + str(r+1) + '-Time' + i + '.logfreq.plot' for i in tp_ex[bfa_name][envs[e]]['R'+str(r+1)]]
                #print(bfa_name, envs[e], r+1, excluded_tps)
                rep = envs[e] + '-R' + str(r+1)
                tps = [i for i in td.columns if 'Time' in i and rep in i and '.logfreq' in i and i not in excluded_tps]
                if len(tps) != 0:
                    times = [int(x[x.index('Time')+4:x.index('.log')]) for x in tps]
                    rows = td.as_matrix(tps)
                    all_lines = np.zeros((len(rows), len(times), 2), float)
                    for j in range(len(rows)):
                        all_lines[j, :, 1] = rows[j]
                        all_lines[j, :, 0] = times
                    lines = LineCollection(all_lines, color='k', alpha=0.1, linewidths=1)
                    subps[e][r].add_collection(lines)

                    rows = td_given_neuts.as_matrix(tps)
                    all_lines = np.zeros((len(rows), len(times), 2), float)
                    for j in range(len(rows)):
                        all_lines[j, :, 1] = rows[j]
                        all_lines[j, :, 0] = times
                    lines = LineCollection(all_lines, color='b', alpha=0.25, linewidths=1)
                    subps[e][r].add_collection(lines)

                    rows = td_highlight.as_matrix(tps)
                    all_lines = np.zeros((len(rows), len(times), 2), float)
                    for j in range(len(rows)):
                        all_lines[j, :, 1] = rows[j]
                        all_lines[j, :, 0] = times
                    lines = LineCollection(all_lines, color='r', alpha=0.25, linewidths=1)
                    subps[e][r].add_collection(lines)
                    subps[e][r].plot(times, np.median(rows, axis=0), color='g')
                    subps[e][r].set_ylim([-6, 0])
                    subps[e][r].set_xlim([8, max(times)])
                    subps[e][r].set_title(envs[e])

    fig.savefig('prelim_graphs/neut_trajectories/' + bfa_name + '_neutral_inferred.png')
    pl.close("all")  
            