In [1]:
import pandas as pd
import itertools
import xlwt

In [2]:
def quality_filter(in_table):
    """Return the rows of a barcode/ORF summary table that pass every quality check.

    A row is kept only when all of the following hold:

    * the dominant downtag and the dominant uptag sequence are each a string
      of exactly 25 characters that contains no ``'N'``;
    * ``' Freq Domint Downtag Seq'`` and ``' Freq Domint Uptag Seq'`` are
      both at least 0.25;
    * ``' % Domint ORF'`` is at least 30 and ``' % 2nd Domint ORF'`` is at
      most 5.

    The two ``% Reads Mapped to ... Constant Seq`` columns are converted to
    numbers in the result (unparseable entries become NaN); they take no part
    in the filtering.

    Parameters
    ----------
    in_table : pandas.DataFrame
        Table holding the columns referenced below. The column names include
        their leading spaces. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the passing rows, keeping their original index
        labels, so callers can add or overwrite columns without pandas'
        SettingWithCopyWarning.
    """
    def is_clean_barcode(seq):
        # Missing sequences arrive as float NaN; reject them instead of
        # letting len() raise TypeError.
        return isinstance(seq, str) and len(seq) == 25 and 'N' not in seq

    # Work on a copy so the caller's frame keeps its original column values.
    table = in_table.copy()
    for column in ('  % Reads Mapped to Downtag Constant Seq',
                   '  % Reads Mapped to Uptag Constant Seq'):
        table[column] = pd.to_numeric(table[column], errors='coerce')

    # Downtag
    downtag_ok = (
        table['  Domint Downtag Seq'].apply(is_clean_barcode).astype(bool)
        & (table[' Freq Domint Downtag Seq'] >= 0.25)
    )

    # Uptag
    uptag_ok = (
        table['  Domint Uptag Seq'].apply(is_clean_barcode).astype(bool)
        & (table[' Freq Domint Uptag Seq'] >= 0.25)
    )

    # ORF filter. An additional ">= 20" test on ' % Domint ORF' used to sit
    # here; it is implied by the ">= 30" test and was therefore dropped.
    orf_ok = (table[" % Domint ORF"] >= 30) & (table[" % 2nd Domint ORF"] <= 5)

    return table.loc[downtag_ok & uptag_ok & orf_ok].copy()


def decode_well(well_id):
    """Split a well label such as ``'B12'`` into a numeric ``(row, column)`` pair.

    The first character becomes the row number by subtracting 64 from its
    code point (``'A'`` -> 1, ``'B'`` -> 2, ...); every remaining character
    is parsed as the integer column.
    """
    row_letter = well_id[0]
    column_digits = well_id[1:]
    row_number = ord(row_letter) - ord('A') + 1
    return row_number, int(column_digits)


class WorkBook():
    """Single-sheet xlwt workbook that starts with a fixed five-row header.

    Attributes:
        name:   sheet name, also used as the file stem when saving.
        line:   zero-based index of the next sheet row to be written.
        book:   the underlying ``xlwt.Workbook``.
        sheet1: the only sheet of ``book``.
    """

    def __init__(self, name):
        """Create the workbook and sheet, then write the five header rows."""
        self.name = name
        self.line = 0
        self.book = xlwt.Workbook(encoding="utf-8")
        self.sheet1 = self.book.add_sheet(name)

        # NOTE(review): presumably the plate-definition header expected by the
        # consuming instrument/software -- confirm before changing any value.
        header_rows = (
            ['ID', 902],
            ['Name', '384gl'],
            ['Type', 384],
            ['Category', 'Gel Plate with Lid'],
            ['c', 'r', 'Gene'],
        )
        for header_row in header_rows:
            self.write_line(header_row)

    def write_line(self, line_list):
        """Write ``line_list`` across the current row, then advance to the next row."""
        row_index = self.line
        for column_index, value in enumerate(line_list):
            self.sheet1.write(row_index, column_index, value)
        self.line = row_index + 1

    def write_destination_line(self, gname):
        """Append ``gname`` with the column/row position derived from the current line.

        The five header rows are discounted and positions are laid out 24 per
        row, both 1-based; the written row is ``[column, row, gname]``.
        """
        row_offset, column_offset = divmod(self.line - 5, 24)
        self.write_line([column_offset + 1, row_offset + 1, gname])

    def saveit(self, location=''):
        """Save the workbook as ``<location><name>.xls``."""
        target = "{prefix}{stem}.xls".format(prefix=location, stem=self.name)
        self.book.save(target)

class Summary_file():
    """Tab-separated transfer summary written to ``<location><name>.tsv``.

    The file is opened (and the header row written) on construction. Use the
    instance as a context manager so the handle is closed even when an error
    interrupts the writes::

        with Summary_file('summary', './output/') as summary:
            summary.writeline((plate, row, column), gene, dest_plate, dest_line)

    Calling ``closeit()`` explicitly remains supported.
    """

    def __init__(self, name, location=''):
        """Open ``location + name + '.tsv'`` for writing and emit the header row."""
        self.w_file = open(location + name +'.tsv','w')
        print('Source Plate\tSource Row\tSource Column\tDestination Plate'
              '\tDestination Row\tDestination Column\tGene', file=self.w_file)

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; exceptions propagate."""
        self.closeit()
        return False

    def writeline(self, source_loc, gene, dest_plate, dest_line):
        """Append one transfer record.

        Args:
            source_loc: indexable whose first three items are written under
                the Source Plate / Source Row / Source Column headers.
            gene: value for the Gene column.
            dest_plate: value for the Destination Plate column.
            dest_line: integer line number; after discounting 5 lines it is
                converted to a 1-based row and column at 24 positions per row.
        """
        dest_row, dest_column = divmod(dest_line - 5, 24)
        print(source_loc[0], source_loc[1], source_loc[2], dest_plate,
              dest_row + 1, dest_column + 1, gene,
              sep='\t', file=self.w_file)

    def closeit(self):
        """Close the underlying file; closing an already closed file is a no-op."""
        self.w_file.close()
        
def well_format(well):
    """Convert a ``'<quadrant>_<well>'`` label into an ``'x_<row letter><column>'`` label.

    The row letter (``'A'`` -> 1, ...) and the column number of the well part
    are each mapped to ``2 * n - 1``. Quadrants ``'C'`` and ``'D'`` then shift
    the row down by one, and quadrants ``'B'`` and ``'D'`` shift the column
    right by one.

    NOTE(review): presumably this interleaves four smaller plates (A-D) into
    one plate of twice the rows and columns -- confirm against the input data.
    """
    fields = well.split('_')
    quadrant = fields[0]
    position = fields[1]

    row = 2 * (ord(position[0]) - 64) - 1
    col = 2 * int(position[1:]) - 1

    if quadrant in ('C', 'D'):
        row += 1
    if quadrant in ('B', 'D'):
        col += 1
    return 'x_{}{}'.format(chr(64 + row), col)

In [3]:
# Load the summary table and keep only the rows that pass the quality filter.
db_filtered = quality_filter(pd.read_csv('./input/finalYeastOrfeomeDBSummaryFile_v2.txt', sep='\t'))
# assign() builds a new frame, so the filtered result is never written to in
# place (this avoids pandas' SettingWithCopyWarning).
db_source = db_filtered.assign(Well=db_filtered['Well'].map(well_format))

# One candidate record per row: (ORF, downtag, uptag, plate, well).
name_data = list(zip(db_source["  Domint ORF"], db_source["  Domint Downtag Seq"], db_source["  Domint Uptag Seq"],
                    db_source["Plate"], db_source["Well"]))

# Order candidates by how often their ORF label occurs, rarest first. The
# occurrences are counted once up front instead of calling list.count() per
# record, which was quadratic in the number of rows. The sort is stable, so
# ties keep their table order.
# NOTE(review): presumably rare ORFs go first so that they get first pick of
# the barcodes -- confirm.
orf_counts = db_source["  Domint ORF"].value_counts().to_dict()
name_data.sort(key=lambda record: orf_counts.get(record[0], 0))

all_bcs1 = set()
all_bcs2 = set()
all_orfs = set()
bc_orf_dict = dict()

# Greedy selection: accept a candidate only if its gene, its downtag and its
# uptag have not been used by an earlier accepted candidate.
for gene_name, bc1, bc2, xplate, xwell in name_data:
    gene_name = gene_name.split('_')[0]
    if gene_name not in all_orfs and bc1 not in all_bcs1 and bc2 not in all_bcs2:
        all_bcs1.add(bc1)
        all_bcs2.add(bc2)
        all_orfs.add(gene_name)
        bc_orf_dict[gene_name] = [bc1, bc2, 'pass1_' + xplate + '_' + xwell.split('_')[1]  ]

# Rows of [orf, downtag, uptag, location], ordered by the integer that follows
# the first '-' in the location string (taken from the Plate value).
orf_mappings = sorted([[orf] + items for orf, items in bc_orf_dict.items()], key=lambda x: int(x[-1].split('-')[1].split('_')[0]))
print('We have {} transferable colonies.'.format(len(orf_mappings)))

We have 2925 transferable colonies.


## Make Source file

In [4]:
# Index every selected ORF by its (plate number, row, column) source position.
locations = {}
for orf, _downtag, _uptag, location in orf_mappings:
    # The plate number is the integer between the first '-' and the next '_';
    # the well label is whatever follows the last '_'.
    plate_field = location.split('-')[1]
    plate_number = int(plate_field.split('_')[0])
    well_label = location.split('_')[-1]
    locations[(plate_number,) + decode_well(well_label)] = orf

In [5]:
# Write one source workbook per plate (1-30), covering rows 1-16 and columns
# 1-24. Positions without a selected ORF get a dummy name; the dummy counter
# runs on across plates so every dummy name is unique.
empty_colony_count = 1
for plate in range(1, 31):
    source_wb = WorkBook('DB_pass1_SPlate_{}'.format(plate))
    # product() visits rows in the outer position and columns in the inner one.
    for row, column in itertools.product(range(1, 17), range(1, 25)):
        position = (plate, row, column)
        if position in locations:
            label = locations[position]
        else:
            label = 'dummy_colony_{}'.format(empty_colony_count)
            empty_colony_count += 1
        source_wb.write_line([column, row, label])
    source_wb.saveit('./output/ORF_selection_20160519/')

In [6]:
# Write the destination workbooks (384 picks each) together with a TSV that
# lists, for every pick, its destination well and the full ORF mapping.
header = '\t'.join(['Location', 'ORF', 'Downtag Sequence', 'Uptag Sequence', 'Source']) + '\n'

# The context manager closes the TSV even if a write fails part-way through.
with open('./output/ORF_selection_20160519/DB_barcodeORFeome_20160519.tsv', 'w') as w_file:
    w_file.write(header)

    wb_count = 1
    dest_wb = WorkBook('DB_BFG_yeastORFeome_Plate{}'.format(wb_count))

    for line in orf_mappings:
        # Roll over before writing, not after: 389 = 5 header rows + 384 picks.
        # Checking after the write left a spurious empty workbook behind when
        # the number of picks was an exact multiple of 384.
        if dest_wb.line == 389:
            dest_wb.saveit('./output/ORF_selection_20160519/')
            wb_count += 1
            dest_wb = WorkBook('DB_BFG_yeastORFeome_Plate{}'.format(wb_count))

        dest_wb.write_destination_line(line[0])
        # Sheet line that was just written; discount the 5 header rows to get
        # the 0-based pick index within the plate (24 picks per row).
        dest_line = dest_wb.line - 1
        row = chr((dest_line - 5) // 24 + 65)
        column = (dest_line - 5) % 24 + 1

        print('DB{}_{}{}\t'.format(wb_count, row, column) + '\t'.join(line), file=w_file)

# Save the last (possibly partially filled) workbook.
dest_wb.saveit('./output/ORF_selection_20160519/')