##### Set up packages and directories

In [18]:
# Master switch for the notebook: when True the cells below regenerate the ORF tables,
# BLAST inputs/outputs and reciprocal-best-hit tables; when False they reload the
# previously pickled results instead.
full_run = False

In [19]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.fft import fft, fftfreq
from Bio import Entrez, SeqIO, AlignIO, pairwise2, Align, Seq, motifs
from Bio.Seq import Seq
from Bio.SeqFeature import SeqFeature, FeatureLocation
from scipy.stats import binom
import math
from tqdm.auto import tqdm
from Comparative_Analysis import Sequence_Analysis_Routines as sar
from Comparative_Analysis import Utilities as util
from Comparative_Analysis import Alignment as align
from Comparative_Analysis import ORF_Functions as orffn
import random
import copy
from joblib import Parallel, delayed
import os
import subprocess
from Bio.Blast import NCBIWWW, NCBIXML
from Bio.Align.Applications import MuscleCommandline
muscle_exe = 'C:/Users/nicho/Muscle/muscle3.8.31_i86win32.exe'

In [20]:
# Input locations: RefSeq genome download and data taken from publications
refseq_dir = 'F:/Datasets/NCBI_Refseq_Mycobacteriaceae_All_Levels/ncbi_dataset/data'
literature_datasets_dir = 'F:/Datasets/Data_From_Publications'
# Project working directory and its output sub-directory
project_dir = 'F:/Project_Data/Project_10'
output_dir = f'{project_dir}/Output'
# Number of work subsets and the subset numbers (1..num_cores) handed to the workers
num_cores = 8
core_numbers = [core_number for core_number in range(1, num_cores + 1)]

In [21]:
# Accession of the reference genome; every other directory under refseq_dir is a comparison species
reference_species = 'GCF_000195955.2'
species_list = util.list_dirs(refseq_dir)
species_list_excl_ref = [species_id for species_id in species_list if species_id != reference_species]

In [53]:
# Load the reference genome as a plain string.  The FIRST GenBank record is used, the same
# record generate_myco_info reads, so that the ORF coordinates computed there index into
# this sequence (the previous loop kept whichever record happened to come last).
record = next(SeqIO.parse(refseq_dir + '/'+reference_species+'/genomic.gbff', "genbank"))
full_sequence = str(record.seq)

In [22]:
# Minimum ORF length in nucleotides (measured from start codon up to, but not including,
# the stop codon); passed as min_orf_length to find_all_maximal_orfs.
min_nts = 60

In [115]:
# Load mutation counts produced by a separate run (Project_11 / Thoth_Full_Run).
# The object is later sliced with genome coordinates of the reference ORFs, so it is
# presumably one count per reference-genome position - TODO confirm.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('F:/Project_Data/Project_11/Thoth_Full_Run/zero_and_non_zero_mutation_counts.pkl', 'rb') as f:
    full_sample_zero_and_non_zero_mutation_counts = pickle.load(f)   

In [116]:
def bin_formula(position_3_counts, tot_bin_counts):
    """Return P(X >= position_3_counts) for X ~ Binomial(tot_bin_counts, 1/3).

    This is the upper-tail probability of seeing at least ``position_3_counts``
    events in one of three equally likely bins out of ``tot_bin_counts`` events.

    The survival function is used rather than ``1 - cdf`` because the latter
    loses all precision (and collapses to 0.0) once the tail probability drops
    below floating-point epsilon, which is exactly the region of interest for
    strongly significant results.
    """
    # sf(k) = P(X > k), so sf(k - 1) = P(X >= k)
    return binom.sf(position_3_counts - 1, tot_bin_counts, 1/3)
def mutation_bin_probability(mutation_counts):
    """Bin counts by position modulo 3 and score the excess in the third bin.

    ``mutation_counts`` may be any iterable of numbers.  Element ``k`` is added
    to bin ``k % 3`` (each element capped at 10,000,000).  Returns a tuple
    ``(bin_counts, probability)`` where ``probability`` is
    ``bin_formula(third_bin, total)``; when every bin is empty the sentinel
    value 2 is returned in place of a probability.
    """
    cap = 10000000
    totals = [0, 0, 0]
    for offset, count in enumerate(mutation_counts):
        slot = offset % 3
        totals[slot] = totals[slot] + min(count, cap)
    grand_total = sum(totals)
    if grand_total != 0:
        return (totals, bin_formula(totals[2], grand_total))
    # No counts at all: 2 is outside [0, 1] and marks "no data"
    return (totals, 2)

In [23]:
def produce_blast_file(record_list, output_filename):
    """Write records to ``output_filename`` in FASTA format.

    Each record is indexable with the name at position 0 and the sequence at
    position 1.  A record is written as a ``>name`` header line followed by the
    sequence wrapped at 60 characters per line.  The file is opened with
    ``newline=''`` so line endings are written as ``\\n`` on every platform.
    """
    line_length = 60
    with open(output_filename, 'w',  newline='') as outfile:
        for record in tqdm(record_list):
            header = ">" +record[0]+ "\n"
            sequence = record[1]
            wrapped = [
                sequence[offset:offset + line_length] + "\n"
                for offset in range(0, len(sequence), line_length)
            ]
            # One write per record: header plus all wrapped sequence lines
            outfile.write(header + ''.join(wrapped))

##### Function to find maximal open reading frame between two co-ordinates 

In [24]:
def find_all_maximal_orfs(sequence, seq_start, seq_stop, output_all_orfs = False, min_orf_length = 0):
    """Scan ``sequence[seq_start:seq_stop]`` on both strands for open reading frames.

    In each of the three reading frames of each strand the scan looks for a
    start codon (ATG/GTG/TTG), then for the first in-frame stop codon
    (TAG/TGA/TAA) after it, and resumes scanning at that stop codon.  Start
    codons with no downstream in-frame stop are skipped.

    Each ORF is described by ``(start, end, strand, length)``: ``start``/``end``
    are coordinates in ``sequence`` (suitable for slicing) spanning start codon
    through stop codon, ``strand`` is 1 or -1, and ``length`` is the distance in
    nucleotides from the start codon to the stop codon (stop codon excluded).

    Returns:
        If ``output_all_orfs == True``: the list of all ORFs with
        ``length >= min_orf_length``, longest first.
        Otherwise: the single longest such ORF (first one found wins ties), or
        ``(0, 0, 0, 0)`` when none qualifies.
    """
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAG', 'TGA', 'TAA')
    not_found = -999
    longest = 0
    best_start = not_found
    best_end = not_found
    collected = []
    for direction in (1, -1):
        if direction == 1:
            window = sequence[seq_start: seq_stop]
        else:
            window = align.reverse_complement(sequence[seq_start: seq_stop])
        window_len = len(window)
        for offset in range(3):
            pos = offset
            while pos < window_len - 2:
                stop_at = None
                if window[pos: pos + 3] in start_codons:
                    # First in-frame stop codon downstream of this start codon, if any
                    for scan in range(pos + 3, window_len - 2, 3):
                        if window[scan: scan + 3] in stop_codons:
                            stop_at = scan
                            break
                if stop_at is None:
                    pos += 3
                    continue

                span = stop_at - pos
                if direction == 1:
                    hit = (seq_start + pos, seq_start + stop_at + 3, 1, span)
                else:
                    # Map positions in the reverse-complemented window back to the forward strand
                    hit = (seq_start + window_len - (stop_at + 3), seq_start + window_len - pos, -1, span)

                if span >= min_orf_length:
                    collected.append(hit)
                    if span > longest:
                        longest = span
                        best_start, best_end, best_strand = hit[0], hit[1], hit[2]

                # Resume at the stop codon; any start codon inside this ORF is not maximal
                pos = stop_at
    if output_all_orfs == True:
        return sorted(collected, key=lambda x: x[3], reverse=True)
    if best_start == not_found:
        return (0, 0, 0, 0)
    return (best_start, best_end, best_strand, longest)

##### Function to find nearest larger and smaller ORFs enclosing region with same stop codon

In [25]:
def find_nearest_upstream_orf_sequence(sequence, seq_start, seq_stop, strand, max_lookback = 100):
    """Translate the region extended upstream to the nearest in-frame start codon.

    Starting at the first codon of ``sequence[seq_start:seq_stop]`` (read on
    ``strand``) and walking upstream one codon at a time, for at most
    ``max_lookback`` codons:

    * if a stop codon is met first, the original region is translated;
    * if a start codon (ATG/GTG/TTG) is met first, the region extended back to
      that start codon is translated.

    Returns the translation, or ``''`` when neither codon type is found within
    the look-back window.

    The look-back is clamped to the codons actually available before the start
    of ``sequence`` (forward strand) or after its end (reverse strand).  Without
    the clamp a negative slice start wraps around to the end of the sequence,
    and a truncated reverse-strand window misaligns every codon index.

    NOTE(review): ``translate_sequence`` is not defined in the visible part of
    this file; other code calls ``util.Translator().translate_sequence`` with
    the same argument shape - confirm the bare name is in scope.
    """
    start_codons = ('ATG', 'GTG', 'TTG')
    stop_codons = ('TAG', 'TGA', 'TAA')
    out_seq = ''
    if strand == 1:
        lookback_codons = max(0, min(max_lookback, seq_start // 3))
        temp = sequence[seq_start - lookback_codons * 3: seq_stop]
    else:
        lookback_codons = max(0, min(max_lookback, (len(sequence) - seq_stop) // 3))
        temp = align.reverse_complement(sequence[seq_start: seq_stop + lookback_codons * 3])
    # temp[lookback_codons * 3:] is the original region; smaller indices lie further upstream
    for lookback in range(lookback_codons, -1, -1):
        codon = temp[lookback * 3: (lookback + 1) * 3]
        if codon in stop_codons:
            # Extension not possible without first encountering a stop - return the original region
            out_seq = translate_sequence(temp[lookback_codons * 3:], 1, 0)
            break
        if codon in start_codons:
            out_seq = translate_sequence(temp[lookback * 3:], 1, 0)
            break
    return out_seq

In [26]:
def find_nearest_downstream_orf_sequence(sequence, seq_start, seq_stop, strand):
    """Translate the region trimmed to the nearest in-frame start codon inside it.

    The region ``sequence[seq_start:seq_stop]`` (reverse-complemented unless
    ``strand == 1``) is read codon by codon from its first codon.  At most
    ``max(1, number_of_codons - 4)`` codons are examined:

    * if a stop codon is met first, the whole region is translated;
    * if a start codon (ATG/GTG/TTG) is met first, the region from that codon
      onwards is translated.

    Returns the translation, or ``''`` if neither codon type is found.
    """
    codon_limit = max(1, int((seq_stop - seq_start) / 3) - 4)
    region = sequence[seq_start:seq_stop]
    if strand != 1:
        region = align.reverse_complement(region)
    for codon_index in range(codon_limit):
        codon = region[codon_index * 3: (codon_index + 1) * 3]
        if codon in ('TAG', 'TGA', 'TAA'):
            # Trimming not possible without first encountering a stop - return the original region
            return translate_sequence(region, 1, 0)
        if codon in ('ATG', 'GTG', 'TTG'):
            return translate_sequence(region[codon_index * 3:], 1, 0)
    return ''

#####  Function to process output from BLAST into dataframe with looked up values

In [27]:
def process_blast_output(infile_loc, outfile_loc):
    """Load a blastp CSV, annotate each hit and keep the best hit per query and subject species.

    Reads the header-less CSV at ``infile_loc`` (14 columns, in the order named
    below), adds ORF/feature/species look-ups taken from the module-level
    dictionaries ``protein_reference_dict``, ``orf_feature_dict`` and
    ``names_dict``, drops hits within the same species, converts alignment
    positions into genome coordinates, keeps the highest bit-score hit for each
    (query, subject species) pair and counts the hits remaining per query.

    The resulting DataFrame is pickled to ``outfile_loc`` and returned.
    """
    trans = util.Translator()
    blast_results = pd.read_csv(infile_loc, header = None)
    blast_results.columns = ['query_accession_ver', 'subject_accession_ver', 'query_length', 'subject_length', 'percent_identical_matches','alignment_length', 'number_mismatches', 'number_of_gap_openings', 'query_start_alignment', 'query_end_alignment', 'subject_start_alignment', 'subject_end_alignment', 'e_value', 'bit_score']
    # *_info holds the ORF tuple (start, end, strand, length); *_feature the matching annotated feature, if any
    blast_results['query_info']=  blast_results['query_accession_ver'].map(protein_reference_dict)
    blast_results['subject_info']=  blast_results['subject_accession_ver'].map(protein_reference_dict)
    blast_results['query_feature']=  blast_results['query_accession_ver'].map(orf_feature_dict)
    blast_results['subject_feature']=  blast_results['subject_accession_ver'].map(orf_feature_dict)
    # The species id is the first two '_'-separated parts of the protein name
    for i, r in blast_results.iterrows():
        blast_results.at[i, 'query_species'] = '_'.join(r.query_accession_ver.split('_')[0:2])
        blast_results.at[i, 'subject_species'] = '_'.join(r.subject_accession_ver.split('_')[0:2])
    # Discard hits between proteins of the same species
    blast_results = blast_results.query('not (query_species == subject_species)')
    blast_results['query_species_name'] = blast_results['query_species'].map(names_dict)
    blast_results['subject_species_name'] = blast_results['subject_species'].map(names_dict)
    # Convert 1-based residue positions of the alignment into genome coordinates (3 nt per residue).
    # NOTE(review): the forward-strand branches extend the end by an extra codon (+6 rather than +3)
    # for the stop codon, whereas the reverse-strand branches do not extend the start in the same
    # way - confirm whether this asymmetry is intended.
    for i, r in blast_results.iterrows():
        if r.query_info[2] == 1:
            blast_results.at[i, 'query_start_pos'] = r.query_info[0] + (r.query_start_alignment - 1) * 3
            blast_results.at[i, 'query_end_pos'] = r.query_info[0] + (r.query_end_alignment - 1) * 3 + 6 # doesn't include stop codon in blast
        else:
            blast_results.at[i, 'query_start_pos'] = r.query_info[1] - (r.query_end_alignment) * 3
            blast_results.at[i, 'query_end_pos'] = r.query_info[1] - (r.query_start_alignment - 1) * 3

        if r.subject_info[2] == 1:
            blast_results.at[i, 'subject_start_pos'] = r.subject_info[0] + (r.subject_start_alignment - 1) * 3
            blast_results.at[i, 'subject_end_pos'] = r.subject_info[0] + (r.subject_end_alignment - 1) * 3 + 6 # doesn't include stop codon in blast
        else:
            blast_results.at[i, 'subject_start_pos'] = r.subject_info[1] - (r.subject_end_alignment) * 3
            blast_results.at[i, 'subject_end_pos'] = r.subject_info[1] - (r.subject_start_alignment - 1) * 3
    # Keep only the highest bit-score hit for each (query protein, subject species) pair
    blast_results = blast_results.loc[blast_results.groupby(['query_accession_ver','subject_species'])['bit_score'].idxmax()]
    # Number of subject species in which each query protein still has a hit
    blast_results['species_count'] = blast_results.groupby('query_accession_ver')['query_accession_ver'].transform('size')
    # NOTE(review): with the assignments below commented out this loop has no effect on blast_results
    for i, r in blast_results.iterrows():
        subject_strand = r.subject_info[2]
        #blast_results.at[i,'sequence'] = translate_orf(myco_info_dict[r.subject_species][1], int(r.subject_start_pos), int(r.subject_end_pos), subject_strand)
        #blast_results.at[i,'nearest_upstream_orf_sequence'] = find_nearest_upstream_orf_sequence(myco_info_dict[r.subject_species][1], int(r.subject_start_pos), int(r.subject_end_pos), subject_strand, max_lookback = 100)
        #blast_results.at[i,'nearest_downstream_orf_sequence'] = find_nearest_downstream_orf_sequence(myco_info_dict[r.subject_species][1], int(r.subject_start_pos), int(r.subject_end_pos), subject_strand)
    with open(outfile_loc, 'wb') as f:
        pickle.dump(blast_results, f)
    return blast_results

In [28]:
def keep_reciprocal_best_hits(query_df, reverse_query_df, outfile_loc):
    """Keep the rows of ``query_df`` that are reciprocal best hits.

    A row (query Q, subject S) is a reciprocal best hit when ``reverse_query_df``
    contains a row whose query is S and whose subject is Q.  If S appears more
    than once in ``reverse_query_df`` the last occurrence is used.

    The check is made per row using that row's own subject.  (Previously the
    subject was looked up through a dictionary keyed by query only, so when a
    query had hits in several species every one of its rows was judged on
    whichever subject happened to be stored last.)

    Side effects: a ``reciprocal_best_hit`` column ('Y'/'N') is added to
    ``query_df`` in place, and the filtered frame is pickled to ``outfile_loc``.

    Returns the filtered DataFrame (rows flagged 'Y'); empty input yields an
    empty result rather than an error.
    """
    reverse_best = dict(zip(reverse_query_df['query_accession_ver'],
                            reverse_query_df['subject_accession_ver']))
    flags = [
        'Y' if subject in reverse_best and reverse_best[subject] == query else 'N'
        for query, subject in zip(query_df['query_accession_ver'],
                                  query_df['subject_accession_ver'])
    ]
    query_df['reciprocal_best_hit'] = flags
    output = query_df[query_df['reciprocal_best_hit'] == 'Y']
    with open(outfile_loc, 'wb') as f:
        pickle.dump(output, f)
    return output

##### Extract the full sequence of each organism and create a directory of start and stop positions for each annotated CDS (use Mycobrowser for MTb)

In [29]:
def generate_myco_info(num_subsets, subset_num, species_master_list):
    """Build ORF and annotation tables for one subset of the species list.

    The subset is selected with ``util.chunk_list(species_master_list,
    num_subsets, subset_num)``.  For every species in it the first GenBank
    record of ``genomic.gbff`` is read; genomes containing any character other
    than A, C, G or T are skipped.

    Returns a list with one 12-tuple per processed species:
        (species, organism, full_sequence, features, maximal_orfs,
         non_overlapping_maximal_orfs, maximal_orf_proteins,
         non_overlapping_maximal_orf_proteins, protein_references,
         morf_feature_map, non_overlapping_orfs, non_overlapping_orf_proteins)

    Relies on the module-level names ``refseq_dir``, ``reference_species``,
    ``literature_datasets_dir`` and ``min_nts``.
    """
    trans = util.Translator()
    output = []
    species_list = util.chunk_list(species_master_list, num_subsets, subset_num)
    for species in species_list:
        features = []
        genome_record = next(SeqIO.parse(refseq_dir + '/'+species+'/genomic.gbff', "genbank"))
        full_sequence = str(genome_record.seq)
        # Skip genomes that contain anything other than unambiguous A/C/G/T
        if full_sequence.count('A') + full_sequence.count('C') + full_sequence.count('G') + full_sequence.count('T') < len(full_sequence):
            continue
        organism = genome_record.annotations['organism']
        
        #  Read feature information
        if species == reference_species:
            # Reference species: annotation comes from the Mycobrowser spreadsheet; Start is shifted by -1
            mycobrowser_df = pd.read_excel(literature_datasets_dir+'/Mycobrowser_Release_4.xlsx')
            for i, r in mycobrowser_df.iterrows():
                if r['Strand'] == '+':
                    strand = 1
                else:
                    strand = -1
                features.append((r['Locus'],r['Start']-1, r['Stop'], strand))
            
        else:
            
            # Other species: 'gene' features with a locus_tag, taken from the GenBank record
            for feature in genome_record.features:
                    a = feature.qualifiers
                    if feature.type == 'gene' and a.get("locus_tag")!= None and int(feature.location.end) - int(feature.location.start) < 100000:  #  Exclude strange Biopython parsing where starts with complement join and looks like a CDS is full length of genome!   
                        locus_tag = a.get("locus_tag")[0]
                        features.append([locus_tag, int(feature.location.start), int(feature.location.end), int(feature.location.strand)])
        
        # Each feature is (locus, start, end, strand); order by start coordinate
        features.sort(key=lambda x: x[1])
        #  Find maximal orfs, non-overlapping orfs and their protein sequences, and assign a reference to each
        maximal_orfs = find_all_maximal_orfs(full_sequence, 0, len(full_sequence), True, min_nts)
        protein_references = []
        maximal_orf_proteins = []
        morf_feature_map = []
        non_overlapping_maximal_orfs = []
        non_overlapping_maximal_orf_proteins = []
        non_overlapping_orfs = []
        non_overlapping_orf_proteins = []
        for i, orf in enumerate(maximal_orfs):
            #translation = translate_orf(full_sequence, orf[0], orf[1], orf[2])[:-1]
            # [:-1] drops the final character of the translation (presumably the stop-codon symbol - confirm)
            translation = trans.translate_sequence(full_sequence[orf[0]:orf[1]], orf[2],0)[:-1]
            morf_name = species+'_'+str(i)
            maximal_orf_proteins.append([morf_name, translation])
            protein_references.append((morf_name, orf))
            overlap = False
            # Link the ORF to every feature on the same strand that shares its stop-codon end
            # and whose start lies at or inside the ORF's start
            for cds in features:
                if ((orf[2] == 1 and orf[0] <= cds[1] and orf[1] == cds[2]) or (orf[2] == -1 and orf[0] == cds[1] and orf[1] >= cds[2])) and orf[2] == cds[3]:
                    morf_feature_map.append((morf_name, cds[0]))
            # An ORF "overlaps" when any feature (either strand) covers more than 30% of its length
            for cds in features:    
                if min(cds[2], orf[1]) - max(cds[1], orf[0]) > 0.3 * (orf[1] - orf[0]):
                    overlap = True
                    break
            if overlap == False:
                nomorf_name = species+'_NOM_'+str(i)
                non_overlapping_maximal_orfs.append(orf)
                non_overlapping_maximal_orf_proteins.append([nomorf_name, translation])
                protein_references.append((nomorf_name, orf))
        
        non_overlapping_features = []

        # Keep features that end beyond the end of the feature immediately before them.
        # NOTE(review): because of "i > 0" the first feature is never kept, so the gap
        # after it is not searched below - confirm this is intended.
        for i, cds in enumerate(features):
            if i > 0 and cds[2] > features[i-1][2]:
                non_overlapping_features.append(cds)
        # Search the gap between each kept feature and the next one for ORFs
        for i, cds in enumerate(non_overlapping_features):
            if i < len(non_overlapping_features) - 1:
                temp = find_all_maximal_orfs(full_sequence, cds[2], non_overlapping_features[i+1][1], True, min_nts)
                for orf in temp:
                    non_overlapping_orfs.append(orf)
        for i, orf in enumerate(non_overlapping_orfs):
            #translation = translate_orf(full_sequence, orf[0], orf[1], orf[2])[:-1]
            translation = trans.translate_sequence(full_sequence[orf[0]:orf[1]], orf[2],0)[:-1]
            noorf_name = species+'_NO_'+str(i)
            non_overlapping_orf_proteins.append([noorf_name, translation])
            protein_references.append((noorf_name, orf))
            
        output.append((species, organism, full_sequence, features, maximal_orfs, non_overlapping_maximal_orfs, maximal_orf_proteins, non_overlapping_maximal_orf_proteins, protein_references, morf_feature_map, non_overlapping_orfs, non_overlapping_orf_proteins))
    return output

In [30]:
if full_run:
    # Per-species results keyed by species id:
    #   myco_info_dict[species] = every field of the generate_myco_info tuple after the species id
    #   names_dict[species]     = organism name
    myco_info_dict = {}
    names_dict = {}
    parallel_output = Parallel(n_jobs=-1)(
        delayed(generate_myco_info)(num_cores, core_number, species_list)
        for core_number in core_numbers
    )
    for core_output in parallel_output:
        for results in core_output:
            species_id = results[0]
            myco_info_dict[species_id] = tuple(results[1:12])
            names_dict[species_id] = results[1]
    # Cache both dictionaries so later sessions can skip the full run
    for pickle_name, payload in (('myco_info_dict.pkl', myco_info_dict), ('names_dict.pkl', names_dict)):
        with open(project_dir + '/' + pickle_name, 'wb') as f:
            pickle.dump(payload, f)

In [31]:
if not full_run:
    # Reload the dictionaries cached by an earlier full run
    with open(f'{project_dir}/myco_info_dict.pkl', 'rb') as f:
        myco_info_dict = pickle.load(f)
    with open(f'{project_dir}/names_dict.pkl', 'rb') as f:
        names_dict = pickle.load(f)

In [32]:
# Lookup tables built from the per-species results:
#   protein_reference_dict : protein name            -> ORF tuple
#   orf_protein_dict       : (species, ORF tuple)    -> protein name
#   orf_feature_dict       : maximal ORF name        -> annotated feature (locus)
#   feature_morf_dict      : annotated feature       -> maximal ORF name
protein_reference_dict = {}
orf_protein_dict = {}
orf_feature_dict = {}
feature_morf_dict = {}
for species in species_list:
    if species not in myco_info_dict:
        continue
    species_info = myco_info_dict[species]
    for entry in species_info[7]:
        protein_name, orf = entry[0], entry[1]
        protein_reference_dict[protein_name] = orf
        orf_protein_dict[(species, orf)] = protein_name
    for entry in species_info[8]:
        morf_name, locus = entry[0], entry[1]
        orf_feature_dict[morf_name] = locus
        feature_morf_dict[locus] = morf_name

##### Output BLAST FASTA files - one (subject) containing all translated mORFs of the non-reference species, and three (query) for the reference species: all mORFs, non-overlapping mORFs and non-overlapping ORFs

In [33]:
if full_run:
    # Subject file: translated maximal ORFs of every non-reference species
    subject_protein_list = []
    for species in species_list_excl_ref:
        if species in myco_info_dict:
            subject_protein_list.extend(myco_info_dict[species][5])
    produce_blast_file(subject_protein_list, project_dir + '/subject_proteins.faa')

    # Query files: three protein sets of the reference species, written in this order
    reference_info = myco_info_dict[reference_species]
    for info_index, fasta_name in (
        (6, '/no_overlap_morf_query_proteins.faa'),
        (10, '/no_overlap_orf_query_proteins.faa'),
        (5, '/morf_query_proteins.faa'),
    ):
        query_protein_list = reference_info[info_index]
        produce_blast_file(query_protein_list, project_dir + fasta_name)

##### Create blast databases

In [34]:
if full_run == True:
    # Build one protein BLAST database per FASTA file, running makeblastdb inside project_dir.
    # Arguments are passed as a list with cwd= instead of a "cd ... & ..." shell string, and a
    # non-zero exit status is raised rather than being captured and silently discarded.
    blast_db_inputs = [
        ('subject_proteins.faa', 'subj_prot'),
        ('morf_query_proteins.faa', 'query_prot'),
        ('no_overlap_morf_query_proteins.faa', 'no_overlap_morf_query_prot'),
        ('no_overlap_orf_query_proteins.faa', 'no_overlap_orf_query_prot'),
    ]
    for fasta_name, db_name in blast_db_inputs:
        completed = subprocess.run(
            ['makeblastdb', '-in', fasta_name, '-dbtype', 'prot', '-out', db_name],
            cwd=project_dir, capture_output=True, text=True)
        if completed.returncode != 0:
            raise RuntimeError('makeblastdb failed for ' + fasta_name + ': ' + completed.stderr)

In [35]:
###  To do - copy database files into BLAST subfolders - currently done manually 

In [36]:
if full_run == True:
    # Run blastp for every (database, query file) pair.  Each database lives in a sub-folder of
    # blast_root with the same name as the database, and the results CSV is written there.
    # Arguments are passed as a list with cwd= instead of a "cd ... & ..." shell string, and a
    # non-zero exit status is raised rather than being captured and silently discarded.
    blast_root = 'f:\\Datasets\\BLAST'
    blast_outfmt = '10 qaccver saccver qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore'
    blast_jobs = [
        # (database / folder, query FASTA, results CSV)
        ('subj_prot', 'morf_query_proteins.faa', 'blastp_results_query_subject.csv'),
        ('subj_prot', 'no_overlap_morf_query_proteins.faa', 'blastp_results_no_morf_query_subject.csv'),
        ('subj_prot', 'no_overlap_orf_query_proteins.faa', 'blastp_results_no_orf_query_subject.csv'),
        ('query_prot', 'subject_proteins.faa', 'blastp_results_subject_query.csv'),
        ('no_overlap_morf_query_prot', 'subject_proteins.faa', 'blastp_results_subject_no_morf_query.csv'),
        ('no_overlap_orf_query_prot', 'subject_proteins.faa', 'blastp_results_subject_no_orf_query.csv'),
    ]
    for db_name, query_fasta, results_csv in blast_jobs:
        completed = subprocess.run(
            ['blastp', '-query', query_fasta, '-db', db_name, '-out', results_csv,
             '-evalue', '1e-7', '-seg', 'no', '-outfmt', blast_outfmt, '-num_threads', '16'],
            cwd=blast_root + '\\' + db_name, capture_output=True, text=True)
        if completed.returncode != 0:
            raise RuntimeError('blastp failed for ' + query_fasta + ' against ' + db_name + ': ' + completed.stderr)

In [37]:
if full_run == True:
#if 1==1:
    # Post-process each blastp CSV.  Naming: "qs" = reference query vs subject database,
    # "sq" = the reverse search; "nom" = non-overlapping maximal ORFs, "noo" = non-overlapping ORFs.
    blast_results_qs = process_blast_output('F:\\Datasets\\BLAST\\subj_prot\\blastp_results_query_subject.csv', project_dir + '/blast_results_qs.pkl')
    blast_results_sq = process_blast_output('F:\\Datasets\\BLAST\\query_prot\\blastp_results_subject_query.csv', project_dir + '/blast_results_sq.pkl')
    blast_results_nomqs = process_blast_output('F:\\Datasets\\BLAST\\subj_prot\\blastp_results_no_morf_query_subject.csv', project_dir + '/blast_results_nomqs.pkl')
    blast_results_snomq = process_blast_output('F:\\Datasets\\BLAST\\no_overlap_morf_query_prot\\blastp_results_subject_no_morf_query.csv', project_dir + '/blast_results_snomq.pkl')
    blast_results_nooqs = process_blast_output('F:\\Datasets\\BLAST\\subj_prot\\blastp_results_no_orf_query_subject.csv', project_dir + '/blast_results_nooqs.pkl')
    blast_results_snooq = process_blast_output('F:\\Datasets\\BLAST\\no_overlap_orf_query_prot\\blastp_results_subject_no_orf_query.csv', project_dir + '/blast_results_snooq.pkl')
    # Pair each forward search with its reverse search and keep only reciprocal best hits
    rbh_orf = keep_reciprocal_best_hits(blast_results_qs, blast_results_sq, project_dir + '/rbh_orf_results.pkl')
    rbh_non_overlap_morf = keep_reciprocal_best_hits(blast_results_nomqs, blast_results_snomq, project_dir + '/rbh_non_overlap_morf_results.pkl')
    rbh_non_overlap_orf = keep_reciprocal_best_hits(blast_results_nooqs, blast_results_snooq, project_dir + '/rbh_non_overlap_orf_results.pkl')

In [38]:
if not full_run:
    # Reload the reciprocal-best-hit table for non-overlapping maximal ORFs from an earlier full run
    with open(f'{project_dir}/rbh_non_overlap_morf_results.pkl', 'rb') as f:
        rbh_non_overlap_morf = pickle.load(f)

#####   Filter promising candidates and record alternative start codons

In [120]:
trans = util.Translator()
# Keep hits whose alignment ends within 10 residues of the end of both the query and the
# subject protein, for queries originally seen in more than 5 species ...
query_end_gap = (rbh_non_overlap_morf['query_end_alignment'] - rbh_non_overlap_morf['query_length']).abs()
subject_end_gap = (rbh_non_overlap_morf['subject_end_alignment'] - rbh_non_overlap_morf['subject_length']).abs()
temp_df = rbh_non_overlap_morf[
    (query_end_gap < 10) & (subject_end_gap < 10) & (rbh_non_overlap_morf['species_count'] > 5)
]
# ... then recount the species per query on the filtered rows and require more than 10
temp_df['species_count'] = temp_df.groupby('query_accession_ver')['query_accession_ver'].transform('size')
temp_df = temp_df[temp_df['species_count'] > 10]

In [121]:
# For each candidate, record the 1-based residue positions (in line with BLAST) at which the
# translation contains 'Z'.  'Z' presumably marks an alternative start codon when the
# translator is called with the two trailing True flags - confirm against util.Translator.
start_codon_positions_dict = {}
for _, row in temp_df.iterrows():
    start, stop, strand, length = row['query_info']
    marked_translation = trans.translate_sequence(full_sequence[start:stop], strand, 0, True, True)
    start_codon_positions_dict[row['query_accession_ver']] = [
        residue_index + 1
        for residue_index, residue in enumerate(marked_translation)
        if residue == 'Z'
    ]

In [122]:
# Collect [locus_tag, product, start, end, strand] for every CDS in the first record of annot.gbk
features = []
genome_record = next(SeqIO.parse(project_dir + '/annot.gbk', "genbank"))
for feature in genome_record.features:
    a = feature.qualifiers
    if feature.type != 'CDS':
        continue
    locus_tags = a.get("locus_tag")
    if locus_tags is None:
        continue
    feature_start = int(feature.location.start)
    feature_end = int(feature.location.end)
    #  Exclude strange Biopython parsing where starts with complement join and looks like a CDS is full length of genome!
    if feature_end - feature_start >= 100000:
        continue
    product_values = a.get("product")
    product = product_values[0] if product_values is not None else ''
    features.append([locus_tags[0], product, feature_start, feature_end, int(feature.location.strand)])

In [123]:
# Index the annotation by one coordinate per feature: the end for strand 1,
# the start for any other strand
new_annotation_features_dict = {}
for (locus_tag, product, start, end, strand) in features:
    anchor = end if strand == 1 else start
    new_annotation_features_dict[anchor] = (locus_tag, product, start, end, strand)

In [129]:
# For every candidate: score the mutation counts of its interior (3 nt trimmed from each end,
# read in the direction of the ORF's strand) and look up the product of the annotated feature
# indexed at the same anchor coordinate (ORF end for strand 1, ORF start otherwise).
for i, r in temp_df.iterrows():
    (start, stop, strand, length) = r['query_info']
    interior_counts = full_sample_zero_and_non_zero_mutation_counts[start+3:stop-3]
    if strand == 1:
        anchor = stop
    else:
        anchor = start
        interior_counts = reversed(interior_counts)
    prob = mutation_bin_probability(interior_counts)
    matched_feature = new_annotation_features_dict.get(anchor)
    temp_df.at[i,'product'] = matched_feature[1] if matched_feature is not None else 'Unmatched'
    temp_df.at[i, 'prob'] = prob[1]

In [126]:
# Attach, per query ORF, the list of 1-based start-codon positions collected in start_codon_positions_dict
temp_df['target_start_codon_positions']=  temp_df['query_accession_ver'].map(start_codon_positions_dict)

In [127]:
# Export the annotated candidate table for inspection outside the notebook
temp_df.to_csv(project_dir + '/temp_df.csv')

In [128]:
# Number of distinct query ORFs remaining after filtering
len(temp_df['query_accession_ver'].unique())

48