In [1]:
import os
import subprocess

import pandas as pd

### Build Reference Fasta

In [2]:
# Load the ORFeome collection sheet; only the ORF_NAME and SEQ columns are used.
orig_seqs = pd.read_excel('./input/Yeast ORFeome HIP ORFeome collection_v3 Details inc 384-format.xlsx')

# Drop rows lacking a name or a sequence. An empty cell is read as NaN, and
# str(NaN) == 'nan' would otherwise end up in the FASTA as a bogus record.
complete_rows = orig_seqs.dropna(subset=['ORF_NAME', 'SEQ'])
seq_source = {
    str(seq_name): str(seq)
    for seq_name, seq in zip(complete_rows['ORF_NAME'], complete_rows['SEQ'])
}

# Make sure the scratch directory exists so the cell works on a fresh checkout.
os.makedirs('./temp', exist_ok=True)
with open('./temp/ref.fasta', 'w') as handle:
    for seq_name, seq in seq_source.items():
        handle.write('>{}\n{}\n'.format(seq_name, seq))

### Build Query Fasta

In [3]:
# Gather every *.seq file in ./input/ into one query FASTA and remember the
# normalised colony name of each file for the match check further down.
all_quey_names = []

# Make sure the scratch directory exists so the cell works on a fresh checkout.
os.makedirs('./temp', exist_ok=True)

# The context manager closes the output even if reading an input file fails.
with open('./temp/query.fasta', 'w') as qw_file:
    # Sorted so the record order does not depend on the file system.
    for file in sorted(os.listdir('./input/')):
        if not file.endswith('.seq'):
            continue
        # Colony name = part of the file name before the first '_', with
        # '--' collapsed to '-', '-' replaced by '_', and upper-cased.
        name = file.replace('--', '-')
        name = '_'.join(name.split('_')[0].split('-')).upper()
        all_quey_names.append(name)
        with open('./input/{}'.format(file), 'r') as handle:
            next(handle, None)  # skip the first line (presumably a header -- TODO confirm)
            # Only the second line is used as the sequence; strip its line
            # ending so the FASTA does not get a blank line after each record.
            sequence = next(handle, '').strip()
        if not sequence:
            # A file with fewer than two lines has nothing to align. The name
            # stays in all_quey_names so the check cell reports it.
            print('WARNING: No sequence found in file "{}"'.format(file))
            continue
        qw_file.write('>{}\n{}\n'.format(name, sequence))

### Blasting

In [4]:
# Align the query FASTA against the reference FASTA with blastn, writing
# tabular output (-outfmt 6). check=True raises CalledProcessError when blastn
# exits with a non-zero status, so a failed run is not silently followed by
# parsing a stale or missing results file.
subprocess.run(
    [
        'blastn',
        '-query', './temp/query.fasta',
        '-subject', './temp/ref.fasta',
        '-outfmt', '6',
        '-out', './temp/blastresults.txt',
    ],
    check=True,
)

# Map each query id (first column) to the list of subject ids (second column)
# it hit, in the order the lines appear in the results file.
blast_hits = {}
with open('./temp/blastresults.txt', 'r') as handle:
    for line in handle:
        fields = line.split()
        if len(fields) < 2:
            # Blank or truncated line: nothing to record.
            continue
        blast_hits.setdefault(fields[0], []).append(fields[1])


### Check matches

In [5]:
# Build the spot -> expected gene lookup from both transfer summaries.
# Each entry pairs the spot-name template with the table it applies to.
summaries = (
    ('AD_{}_{}_{}', './input/Transfer_summary_AD.tsv'),
    ('DB_{}_{}_{}', './input/Transfer_summary_DB.tsv'),
)

source_dict = {}
for name_template, summary_path in summaries:
    my_table = pd.read_table(summary_path)
    columns = [
        my_table[column_name]
        for column_name in ('Destination Plate', 'Destination Row', 'Destination Column', 'Gene')
    ]
    # Walk the four columns in lockstep; later rows overwrite earlier ones
    # that resolve to the same spot name.
    for plate, row, column, gene in zip(*columns):
        source_dict[name_template.format(plate, row, column)] = gene

In [6]:
# Compare, spot by spot, the gene expected from the transfer summaries with
# the reference sequences the colony's read actually hit.
for query in sorted(all_quey_names):
    if query not in blast_hits:
        print('WARNING: No blast hits for colony "{}"'.format(query))
    elif query not in source_dict:
        # A sequenced colony that is absent from the transfer summaries used to
        # raise KeyError and abort the whole report; flag it and carry on.
        print('WARNING: Colony "{}" is not listed in the transfer summaries, got "{}"'.format(query, blast_hits[query]))
    elif source_dict[query] in blast_hits[query]:
        print('OK: Gene "{}" is in spot {}'.format(source_dict[query], query))
    else:
        print('MISSMATCH: Spot "{}" differs, expected "{}" got "{}"'.format(query, source_dict[query], blast_hits[query]))


MISSMATCH: Spot "AD_2_14_11" differs, expected "YMR160W" got "['YMR074C']"
OK: Gene "YAL058W" is in spot AD_2_7_18
OK: Gene "YGL050W" is in spot AD_3_16_15
MISSMATCH: Spot "AD_4_13_23" differs, expected "YOR212W" got "['YDR186C']"
MISSMATCH: Spot "AD_4_1_3" differs, expected "YDR186C" got "['YBR058C']"
OK: Gene "YBR063C" is in spot AD_5_16_11
OK: Gene "YKL157W" is in spot AD_5_1_20
OK: Gene "YBL061C" is in spot AD_6_11_23
OK: Gene "YGR261C" is in spot AD_6_8_24
OK: Gene "YKL047W" is in spot AD_7_3_7
OK: Gene "YGR103W" is in spot DB_1_10_9
OK: Gene "YLR233C" is in spot DB_2_16_13
OK: Gene "YBR227C" is in spot DB_3_12_19
OK: Gene "YER149C" is in spot DB_3_7_17
OK: Gene "YJR116W" is in spot DB_4_15_4
OK: Gene "YPL243W" is in spot DB_4_5_10
OK: Gene "YOR056C" is in spot DB_5_5_10
OK: Gene "YPL160W" is in spot DB_5_9_18
MISSMATCH: Spot "DB_6_3_22" differs, expected "YKR067W" got "['YLR383W']"
OK: Gene "YOR178C" is in spot DB_6_4_24
