In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [28]:
# Gene annotation table, plus a lookup from the 'ORF' column to the 'Gene_ORF' column.
gene_info = pd.read_csv('../accessory_files/yeast_gene_annotations.tsv', delimiter='\t')
orf_to_gene = dict(zip(gene_info['ORF'], gene_info['Gene_ORF']))

# Gene-hit table: well names are taken from the part of each '*_present'
# column name that precedes the first underscore.
orf_hits = pd.read_csv('../../Output/WGS/combined_option/gene_hit_data.tsv', delimiter='\t')
wells = [i.split('_')[0] for i in orf_hits if '_present' in i]

# Map each well to the 'Gene_ORF' values of the rows whose '<well>_present' value is > 0.
well_orfs_present = defaultdict(list)
for well in wells:
    hit_genes = orf_hits.loc[orf_hits[well + '_present'] > 0, 'Gene_ORF']
    # Wells without any hit must stay absent from the dict: downstream code
    # treats "has a key here" as the marker for a well of interest.
    if len(hit_genes) > 0:
        well_orfs_present[well].extend(hit_genes)

# One excluded autodiploid well is not in this data, so I'll add it manually:
td = pd.read_csv('../../Output/WGS/combined_option/processed_well_output/P1B03_processed.tsv', delimiter='\t')
# dropna().unique() keeps first-appearance order, so the gene list is the same
# on every run (iterating a set of strings changes order between sessions).
well_orfs_present['P1B03'] = [orf_to_gene.get(i, i) for i in td['ORF_hit'].dropna().unique()]

In [36]:
# Assemble one per-well table (fitness columns, well metadata, mutated genes)
# and write it out for the browser.
plates = ['P1', 'P2', 'P3']
fds = []
for p in plates:
    td = pd.read_csv('../../Output/Fitness/' + p + '_freq_and_s_data.csv')
    td['plate'] = p  # a scalar is broadcast to every row
    # Keep the '_s_scaled' columns whose name contains no 'R'
    # (presumably this drops per-replicate columns -- TODO confirm).
    s_cols = [c for c in td if '_s_scaled' in c and 'R' not in c]
    fds.append(td[['plate', 'Well'] + s_cols])

fit_data = pd.concat(fds)
fit_data['platewell'] = fit_data['plate'] + fit_data['Well']

# Attach contamination / strain info for each well.
td = pd.read_csv('../accessory_files/VLTE_by_well_info.csv')[['platewell', 'contam', 'strain']]
fit_data = fit_data.merge(td, on='platewell', how='left')

# One row per well in well_orfs_present, with its gene names joined by ';'.
pg = pd.DataFrame(
    [(w, ';'.join(genes)) for w, genes in well_orfs_present.items()],
    columns=['platewell', 'genes_w_nonsyn_muts'],
)
fit_data = fit_data.merge(pg, on='platewell', how='left')

# 'focal' is True exactly for the wells that matched a row of pg.
fit_data['focal'] = pd.notnull(fit_data['genes_w_nonsyn_muts'])
fit_data.to_csv('../../Output/Browser/well_fitness_info_etc.tsv', sep='\t', index=False)

In [34]:
def get_mutation_info(row):
    """Return the 2nd and 3rd '|'-separated fields of row['ANN'] as 'a, b'.

    row: any mapping-like object supporting row['ANN'] (e.g. a DataFrame row).
    The value is passed through str() first, so a missing value (NaN) becomes
    the text 'nan', which has a single field.

    Returns '' when the value has fewer than three '|'-separated fields.
    """
    fields = str(row['ANN']).split('|')
    if len(fields) <= 2:
        return ''
    return ', '.join(fields[1:3])

# Write one trimmed variant table per well (every well listed in pg) for the browser.
wells = list(pg['platewell'])
seq_gens = [70, 1410, 2640, 5150, 7530, 10150]
# Columns to export, in output order. The per-generation allele-count names are
# derived from seq_gens so the two lists cannot drift apart.
base_use_cols = ['CHROM', 'POS', 'REF', 'ALT', 'QUAL', 'SVTYPE', 'ANN_simpler', 'af_trajectory', 'perc_of_alt', 'mutation_group',
                 'ORF_hit', 'Gene_ORF', 'briefDescription', 'info'] + ['G' + str(gen) + '_allele_counts' for gen in seq_gens]
for well in sorted(wells):
    td = pd.read_csv('../../Output/WGS/combined_option/processed_well_output/' + well + '_processed.tsv', delimiter='\t')
    td['info'] = td.apply(get_mutation_info, axis=1)
    # Bring in the 'Gene_ORF' and 'briefDescription' annotation for each row's ORF_hit.
    td = td.merge(gene_info[['ORF', 'Gene_ORF', 'briefDescription']], left_on='ORF_hit', right_on='ORF', how='left')
    for gen in seq_gens:
        prefix = 'G' + str(gen)
        # Not every well has every generation; skip the ones without counts.
        if prefix + '_alt_counts' in td:
            # Column-wise "ref,alt" text; map(str) formats each value exactly as str() would.
            td[prefix + '_allele_counts'] = td[prefix + '_ref_counts'].map(str) + ',' + td[prefix + '_alt_counts'].map(str)
    # Export only the columns this well actually has.
    use_cols = [c for c in base_use_cols if c in td]
    td[use_cols].to_csv('../../Output/Browser/Allele_freqs/' + well + '.tsv', sep='\t', index=False)
