In [1]:
import pandas as pd
from collections import defaultdict, Counter

In [2]:
bcd = pd.read_csv('../BC_extraction/bc_assignments.csv')
bcd.head(2)

Unnamed: 0,Barcode,File,BFAs
0,AAATAAAATTAAAACAGAGTTTGGTCCCATGAACATATTTGTCAAT...,FLC_2N_Plate3_FLC-P3-E8-CGAGGCTG-TACTCCTT_S26,dBFA2;hBFA2
1,AGGAAAATGGCTAATTGTCTTAATGGTTCCCAAGCGAATTATATTT...,CLM_clones_1N_P1-G12-ATCTCAGG-ATTAGACG_S84,hBFA1;hBFA2


In [3]:
short_names = {
    'ReAnn_Dip_variants.xlsx': 'Dip_clone_Env',
    'ReAnn_CLM_R2_1N_variants.xlsx': 'CLM_R2_clones_1N_',
    'CLM_2N_Batch2_Filter_InWork_ReAnn.xlsx': 'CLM_2N_Batch2',
    'ReAnn_GlyEtOH_variants_InWork.xlsx': 'GlyEtOH_2N_20170616',
    'ReAnn_GlyEtOH_1N_variants.xlsx': 'GlyEtOH_1N',
    'ReAnn_CLM_variants_InWork.xlsx': 'CLM_2N',
    'ReAnn_FLC_R1R2_1N_variants.xlsx': 'FLC_R2_clones_1N',
    'ReAnn_FLC_variants_InWork.xlsx': 'FLC_2N_Plate2',
    'ReAnn_MiloAddFiles_variants.xlsx': 'GlyEtOH_1N',
    'ReAnn_CLM_1N_variants.xlsx': 'CLM_clones_1N'
}

sheets = {'CLM_2N_Batch2_Filter_InWork_ReAnn.xlsx': 'ReAnn_Manual_Variants',
          'CLM_2N_Batch2_SNP_Filter_InWork_short.xlsx': 'ReAnn_Manual_Variants',
          'ReAnn_CLM_1N_variants.xlsx': 'Curated_Variants',
          'ReAnn_CLM_R2_1N_variants.xlsx': 'Curated_Variants',
          'ReAnn_FLC_R1R2_1N_variants.xlsx': 'Curated_Variants',
          'ReAnn_FLC_variants_InWork.xlsx': 'Variants'}

fds = dict()
for f in short_names:
    if f in sheets:
        fds[f.split('.')[0]] = pd.read_excel('../Mutation_data/'+f, sheet_name=sheets[f])
    else:
        fds[f.split('.')[0]] = pd.read_excel('../Mutation_data/'+f)
    fds[f.split('.')[0]]['Short_name'] = [short_names[f]]*len(fds[f.split('.')[0]])
    fds[f.split('.')[0]]['File'] = fds[f.split('.')[0]]['Short_name']+'_'+fds[f.split('.')[0]]['Strain']
    
        
dd = []
for f in fds:
    if 'Manual.Annotation' in fds[f]:
        fds[f] = fds[f].rename(columns={'Manual.Annotation': 'Manual.Verification'})
    if 'Manual.annotation' in fds[f]:
        fds[f] = fds[f].rename(columns={'Manual.annotation': 'Manual.Verification'})
    fds[f]['Run'] = [f]*len(fds[f])
    dd.append(fds[f][['Run', 'File', 'GENE', 'EFFECT', 'IMPACT', 'CHROM', 'POS', 'REF', 'ALT', 'Manual.Verification', 'AD', 'DP', 'QUAL']])

    
def heterozygous_call(ad):
    ref, alt = int(ad.split(',')[0]), int(ad.split(',')[1])
    tot = ref+alt
    if alt < 2:
        return np.nan
    elif tot < 4:
        return np.nan
    else:
        if alt/tot >= 0.15 and alt/tot <= 0.85:
            return 'Heterozygous'
        elif alt/tot < 0.15:
            return np.nan
        else:
            return 'Homozygous'

muts = pd.concat(dd)
muts['Call'] = muts['AD'].apply(heterozygous_call)

In [4]:
muts.Call.value_counts()

Homozygous      1247
Heterozygous    1140
Name: Call, dtype: int64

In [5]:
for run in sorted(set(muts.Run)):
    print(run)
    print(muts[muts.Run==run].Call.value_counts())

CLM_2N_Batch2_Filter_InWork_ReAnn
Heterozygous    122
Homozygous        2
Name: Call, dtype: int64
ReAnn_CLM_1N_variants
Homozygous      392
Heterozygous      6
Name: Call, dtype: int64
ReAnn_CLM_R2_1N_variants
Homozygous      165
Heterozygous      3
Name: Call, dtype: int64
ReAnn_CLM_variants_InWork
Heterozygous    73
Name: Call, dtype: int64
ReAnn_Dip_variants
Heterozygous    295
Homozygous        2
Name: Call, dtype: int64
ReAnn_FLC_R1R2_1N_variants
Homozygous      442
Heterozygous     19
Name: Call, dtype: int64
ReAnn_FLC_variants_InWork
Heterozygous    334
Homozygous        4
Name: Call, dtype: int64
ReAnn_GlyEtOH_1N_variants
Homozygous      209
Heterozygous     11
Name: Call, dtype: int64
ReAnn_GlyEtOH_variants_InWork
Heterozygous    276
Homozygous        7
Name: Call, dtype: int64
ReAnn_MiloAddFiles_variants
Homozygous      24
Heterozygous     1
Name: Call, dtype: int64


In [14]:
# merging with barcode data also pulled from sequencing files
mu = muts.merge(bcd[['File', 'Barcode']], on='File', how='inner')
# Filter out synonymous / noncoding mutations and mutations that didn't pass our het/homozygous calling
mu = mu[(mu['IMPACT'].isin(['MODERATE', 'HIGH'])) & (mu['Call'].notnull()) & (mu['Call']!='unknown')]
# Adding mutation ID column
#mu['mut_ID'] = mu['GENE']+'_'+mu['EFFECT']+'_'mu['CHROM']+'_'+mu['POS'].apply(lambda p: str(p))+'_'+mu['ALT'].apply(lambda p: str(p))+'_'+mu['Call']
#mu.to_csv('../Mutation_data/PLT_mutation_calls_simple.csv', index=False)

#                                     #
# Combining mutations near each other #
#                                     #
bc_to_mut_ids = defaultdict(list)
# like bc_to_rows[bcs] = [[mut_id,mut_id], [mut_id], [mut_id,mut_id]] mut_ids combined in the inner lists if they are within 35 bp of another mut_id

for jnk, row in mu.iterrows():
    mid_new = (row['CHROM'], row['POS'], row['ALT'], row['Call'], row['GENE'], row['EFFECT'])
    merged = []
    mids = bc_to_mut_ids[row['Barcode']]
    for i in range(len(mids)):
        for mid_old in mids[i]:
            if mid_old[0] == mid_new[0] and np.abs(int(mid_old[1])-int(mid_new[1]))<=35:
                merged.append(i)
    all_mids = []
    mid_to_add = [mid_new]
    for i in range(len(mids)):
        if i in merged:
            mid_to_add += mids[i]
        else:
            all_mids.append(mids[i])
    all_mids.append(mid_to_add)
    bc_to_mut_ids[row['Barcode']] = all_mids

# Counting up how many times we see each gene hit
gene_counts = Counter()
for bc in bc_to_mut_ids:
    for mid_group in bc_to_mut_ids[bc]:
        gene_counts[mid_group[0][4]] += 1
    
#
#  Creating a dataframe like BC, Gene_list (';'-separated), Effect_list (';'-sep), mut_ID_list (';'-sep, like CHROM_POS_ALT_Call)
#  Genes, Effects, mut_IDs sorted by # hit (gene_counts)
#  For merged mutations (within 35 bp of each other), Effects and mut_IDs are separated by spaces. For genes, we assume they are shared
#
mat = []
for bc in bc_to_mut_ids:
    tmprec = bc_to_mut_ids[bc]
    tmp = [bc]
    gene_list = []
    #effect_list = []
    mut_id_list = []
    for entry_list in sorted(tmprec, key=lambda x: -1*gene_counts[x[0][4]]):
        entry_list = list(set(entry_list)) # This deduplicates the same exact mutations found in multiple files associated with the same barcode
        gene_list.append(entry_list[0][4])
        #effect_list.append(' '.join([m[5] for m in entry_list]))
        mut_id_list.append(' '.join([m[4]+'_'+m[0]+'_'+str(m[1])+'_'+m[2]+'_'+m[3]+'_'+m[5] for m in entry_list]))
    
    mat.append([bc, ';'.join(gene_list), ';'.join(mut_id_list)])


md = pd.DataFrame(mat, columns=['Barcode', 'Genes', 'mut_IDs']) 

def putative_drivers(gene_list):
    if len(gene_list) == 1:
        return gene_list[0]
    else:
        drivers = [g for g in gene_list if gene_counts[g]>1]
        return ';'.join(drivers)
    
def get_full_driver_mutations(row):
    drivers = row['Putative_Drivers'].split(';')
    genes = row['Genes'].split(';')
    mutids = row['mut_IDs'].split(';')
    return ';'.join(sorted([mutids[i] for i in range(len(genes)) if genes[i] in drivers]))
    
md['Putative_Drivers'] = md['Genes'].apply(lambda g: putative_drivers(g.split(';')))
md['Putative_Driver_Mutations'] = md.apply(get_full_driver_mutations, axis=1)
md.to_csv('Final_data_sets/Barcode_mutation_association.csv', index=False)
md.head(2)

Unnamed: 0,Barcode,Genes,mut_IDs,Putative_Drivers,Putative_Driver_Mutations
0,TGCACAAAATAAAATATCCTTCCTGAGTGTCAATCGATTTTTTTGT...,ACS2;BIO3;GAS3;VPS34,ACS2_XII_446579_T_Heterozygous_NON_SYNONYMOUS_...,,
1,TTGTTAAACCAGAATCCTCTTATACAAACGAAACAGTGTTGTAATT...,KTR2,KTR2_XI_558029_C_Heterozygous_NON_SYNONYMOUS_C...,KTR2,KTR2_XI_558029_C_Heterozygous_NON_SYNONYMOUS_C...


## Methods
* Call heterozygous or homozygous

```
if alt < 2:
    return np.nan
elif tot < 4:
    return np.nan
else:
    if alt/tot >= 0.15 and alt/tot <= 0.85:
        return 'Heterozygous'
    elif alt/tot < 0.25:
        return np.nan
    else:
        return 'Homozygous'
```

* Filter out synonymous / noncoding mutations and mutations that didn't pass our het/homozygous calling
    * ` mu = mu[(mu['IMPACT'].isin(['MODERATE', 'HIGH'])) & (mu['Call'].notnull()) & (mu['Call']!='unknown')] `
* Mutations get merged if they are within 35 bp of each other and associated with the same barcode (all mutations info from merged mutations will be separated by spaces)  
* Call a mutations a putative driver if they are in a gene in which we observe at least two unique nonsyn mutations (very lenient definition)
* Now, combine it with BFA data...

## OK Let's combine it with the BFA data

In [8]:
def get_identifier(row):
    if pd.notnull(row['Putative_Driver_Mutations']):
        return row['Putative_Driver_Mutations']
    elif pd.notnull(row['mut_IDs']):
        return row['mut_IDs']
    else:
        return row['Barcode']

    
fd = pd.read_csv('Final_data_sets/All_fitness_tidy.csv')
bfa_dats = dict()
good_envs = dict()
envs_use = ['YPD', 'SC', '37C', 'pH3_8', 'pH7_3', 'GlyEtOH', 'FLC4', 'CLM', '21C', '02M_NaCl']
icols = ['Barcode', 'Home_Environment', 'Putative_Neutral']
for bfa in ['hBFA1_alpha', 'hBFA2_alpha', 'hBFA2_2N', 'dBFA2_2N']:
    td = fd[(fd['BFA']==bfa.split('_')[0]) & (fd['ploidy']==bfa.split('_')[1])]
    bfa_dats[bfa] = td[['Barcode', 'Test_Environment', 's']].pivot(index='Barcode', columns='Test_Environment', values='s')
    good_envs[bfa] = [i for i in envs_use if len([j for j in bfa_dats[bfa][i] if pd.isnull(j)])!=len(bfa_dats[bfa])]
    bfa_dats[bfa] = bfa_dats[bfa].merge(td[icols].drop_duplicates(), on='Barcode', how='left')[icols+good_envs[bfa]]

# Adding mutation data
# md = pd.read_csv('Final_data_sets/Barcode_mutation_association.csv')
mut_based = dict()
for bfa in bfa_dats:
    bfa_dats[bfa] = bfa_dats[bfa].merge(md, on='Barcode', how='left')
    bfa_dats[bfa]['Lineage_ID'] = bfa_dats[bfa].apply(get_identifier, axis=1)
    # combining data by driver mutation(s), mutations, or barcode
    mut_based[bfa] = bfa_dats[bfa][['Home_Environment', 'Lineage_ID']+good_envs[bfa]].groupby(['Home_Environment', 'Lineage_ID']).mean().reset_index()
    mut_based[bfa] = mut_based[bfa].merge(bfa_dats[bfa][['Lineage_ID', 'Putative_Drivers']].drop_duplicates(), on='Lineage_ID', how='left')
    mut_based[bfa]['Has_Mutation_Data'] = mut_based[bfa]['Lineage_ID'].apply(lambda x: '_' in x)
    mut_based[bfa].to_csv('Final_data_sets/'+bfa+'_with_mutation_data.csv', index=False)

In [13]:
# Showing which envs. have mutation data in each bfa
for bfa in mut_based:
    print(bfa, ':  ', len(mut_based[bfa]), 'lineages,', len(mut_based[bfa][mut_based[bfa]['Has_Mutation_Data']]), 'with mutation data. Envs:')
    print(mut_based[bfa][mut_based[bfa]['Has_Mutation_Data']]['Home_Environment'].value_counts())

hBFA1_alpha :   1810 lineages, 101 with mutation data. Envs:
GlyEtOH_alpha    40
CLM_alpha        33
FLC4_alpha       28
Name: Home_Environment, dtype: int64
hBFA2_alpha :   1528 lineages, 41 with mutation data. Envs:
CLM_alpha        33
GlyEtOH_alpha     6
FLC4_alpha        2
Name: Home_Environment, dtype: int64
hBFA2_2N :   840 lineages, 42 with mutation data. Envs:
GlyEtOH_2N    27
FLC4_2N       15
Name: Home_Environment, dtype: int64
dBFA2_2N :   4097 lineages, 88 with mutation data. Envs:
CLM_2N         21
21C_2N         17
FLC4_2N        15
GlyEtOH_2N     10
37C_2N         10
02M_NaCl_2N     9
pH7_3_2N        6
Name: Home_Environment, dtype: int64
