In [1]:
import re
import csv

In [1]:
#Create a Python script that finds the best bidirectional hits (BBH) between the two species
#Species 1, A_cellulolyticus, 2139 sequences
#Species 2, R_kristinae, 2022 sequences

#I have two blastp result files: one from running the A_cellulolyticus protein sequences against a database made from
#R_kristinae, and one from the reverse search. I will use these files to find the BBH.

In [3]:
# ------------------------------ Save to CSV ---------------------------------
def save_to_cvs(variable, name):
    """Write a mapping to ``<name>.csv`` with a ``queryid,hitid`` header.

    Args:
        variable: Mapping whose items are written one per row, key in the
            first column and value in the second. Non-string values are
            written using their ``str()`` form by the csv writer.
        name: Output path without the ``.csv`` extension; an existing file
            with that name is overwritten.
    """
    header = ["queryid", "hitid"]
    # newline="" lets the csv writer control line endings itself; without it
    # an extra blank row appears between records on Windows.
    with open(f"{name}.csv", "w", newline="") as outfile:
        writer = csv.writer(outfile)
        writer.writerow(header)
        writer.writerows(variable.items())

In [4]:
# ------------------------------ Parse BLAST results ---------------------------------

def parse_blast_results(input):
    """Return the first-listed hit for every query in a tabular BLAST file.

    Each data line is split on tabs; column 0 is taken as the query ID and
    column 1 as the hit ID. Only the first line seen for a query is kept.
    NOTE(review): this assumes the file lists each query's hits best-first,
    as BLAST tabular output normally does - confirm for the "modified" files.

    Args:
        input: Path to the tab-separated BLAST result file.

    Returns:
        dict mapping query ID -> hit ID of the first line for that query.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than two
            tab-separated fields.
    """
    best_hits = {}  # query ID -> first hit ID encountered
    # The context manager guarantees the handle is closed, even on error.
    with open(input, "r") as file_total_hits:
        for hit in file_total_hits:
            # Drop the line terminator so a hit ID in the last column does
            # not carry a trailing newline.
            hit = hit.rstrip("\r\n")
            # Skip blank lines and '#' comment lines (e.g. BLAST outfmt 7
            # headers) instead of failing on them.
            if not hit or hit.startswith("#"):
                continue
            element = hit.split("\t")
            queryid = element[0]
            hitid = element[1]
            if queryid not in best_hits:
                best_hits[queryid] = hitid  # keep only the first hit per query
    return best_hits

# A. cellulolyticus queries searched against the R. kristinae database.
AC_input = "modified_A_cel_vs_R_kris_blast_results.tab"
with open(AC_input) as AC_file:  # close the handle once the lines are counted
    print("A. cel total hits: ", len(AC_file.readlines()))

AC_best_hits = parse_blast_results(AC_input)
print("A. cel best hits: ", len(AC_best_hits))

# R. kristinae queries searched against the A. cellulolyticus database.
RK_input = "modified_R_kris_vs_A_cel_blast_results.tab"
with open(RK_input) as RK_file:  # close the handle once the lines are counted
    print("R. kris total hits: ", len(RK_file.readlines()))

RK_best_hits = parse_blast_results(RK_input)
print("R. kris best hits: ", len(RK_best_hits))

# Persist both first-hit tables as <name>.csv for inspection.
save_to_cvs(AC_best_hits, "AC_best_hits")
save_to_cvs(RK_best_hits, "RK_best_hits")

A. cel total hits:  10959
A. cel best hits:  886
R. kris total hits:  14889
R. kris best hits:  1284


In [5]:
# ------------------------------ Parse paralogs ---------------------------------

def parse_paralogs(input):
    """Return, for every query, the first listed hit that is not the query itself.

    Each data line is split on tabs; column 0 is taken as the query ID and
    column 1 as the hit ID. Lines where the two IDs are equal (self-hits) are
    ignored, and only the first remaining line per query is kept.
    NOTE(review): this assumes each query's hits are listed best-first, as
    BLAST tabular output normally is - confirm for these files.

    Args:
        input: Path to the tab-separated BLAST result file.

    Returns:
        dict mapping query ID -> first non-self hit ID. Queries that only
        hit themselves are absent.

    Raises:
        IndexError: if a non-blank, non-comment line has fewer than two
            tab-separated fields.
    """
    best_hits = {}  # query ID -> first non-self hit ID encountered
    # The context manager guarantees the handle is closed, even on error.
    with open(input, "r") as file_total_hits:
        for hit in file_total_hits:
            # Strip the line terminator first: otherwise a hit ID in the last
            # column ends in "\n" and a self-hit is not recognised as one.
            hit = hit.rstrip("\r\n")
            # Skip blank lines and '#' comment lines instead of failing.
            if not hit or hit.startswith("#"):
                continue
            element = hit.split("\t")
            queryid = element[0]
            hitid = element[1]
            if queryid not in best_hits and queryid != hitid:
                best_hits[queryid] = hitid  # keep only the first non-self hit
    return best_hits

# BLAST table used to look for A. cel paralogs
# (presumably A. cel searched against itself - confirm how the file was made).
AC_paralogs = "blast_paralogs_AC.tab"
AC_parsed_paralogs = parse_paralogs(AC_paralogs)
print("A. cel parsed paralogs: ", len(AC_parsed_paralogs))

# Same for R. kris (presumably a self-search as well - confirm).
RK_paralogs = "blast_paralogs_RK.tab"
RK_parsed_paralogs = parse_paralogs(RK_paralogs)
print("R. kris parsed paralogs: ", len(RK_parsed_paralogs))

# Write each query -> first non-self hit table to <name>.csv.
save_to_cvs(AC_parsed_paralogs, "AC_parsed_paralogs")
save_to_cvs(RK_parsed_paralogs, "RK_parsed_paralogs")

A. cel parsed paralogs:  1542
R. kris parsed paralogs:  2022


In [6]:
# ------------------------------ Find paralogs ---------------------------------

#Find paralogs in the file
def find_paralogs(parsed_paralogs):
    """Keep the query/hit pairs whose IDs share their first two characters.

    Args:
        parsed_paralogs: Mapping of query ID -> hit ID.

    Returns:
        A new dict holding only the entries where the two-character prefix
        of the query ID equals that of the hit ID, in the original order.
    """
    return {
        query: partner
        for query, partner in parsed_paralogs.items()
        if query[:2] == partner[:2]
    }

# Filter the A. cel query -> hit table down to pairs whose IDs share a two-character prefix.
AC_found_paralogs = find_paralogs(AC_parsed_paralogs)
print("A. cel # paralogs: ", len(AC_found_paralogs))

# Same filter for the R. kris table.
RK_found_paralogs = find_paralogs(RK_parsed_paralogs)
print("R. kris # paralogs: ", len(RK_found_paralogs))

A. cel # paralogs:  625
R. kris # paralogs:  677


In [7]:
# ------------------------------ BBH ---------------------------------

#Find the best bidirectional hits (BBH)
def find_BBH(best_hits_A, best_hits_B):
    """Collect the reciprocal pairs between two best-hit tables.

    A pair (query, hit) from ``best_hits_A`` is kept when the IDs differ and
    ``best_hits_B`` maps ``hit`` straight back to ``query``.

    Args:
        best_hits_A: Mapping of query ID -> hit ID for one search direction.
        best_hits_B: Mapping of query ID -> hit ID for the opposite direction.

    Returns:
        dict mapping each reciprocal query from ``best_hits_A`` to a
        one-element list containing its hit.
    """
    reciprocal = {}
    for candidate, partner in best_hits_A.items():
        if candidate == partner:
            continue  # a sequence paired with itself is ignored
        if partner not in best_hits_B:
            continue  # the partner has no entry in the other direction
        if best_hits_B[partner] != candidate:
            continue  # the partner points at a different sequence
        reciprocal[candidate] = [partner]  # stored as a list
    return reciprocal
 
# BBHs keyed by A. cel query (A. cel -> R. kris direction checked against the reverse table).
best_bidirectional_hits = find_BBH(AC_best_hits, RK_best_hits)
# The same search with the tables swapped, so the result is keyed by R. kris query.
best_bidirectional_hits_2 = find_BBH(RK_best_hits, AC_best_hits)
# Only the A. cel-keyed result is counted here.
print("Best birectional hits:", len(best_bidirectional_hits))

#Save the A. cel-keyed BBHs to BBHs.csv
save_to_cvs(best_bidirectional_hits, "BBHs")

Best birectional hits: 310


In [8]:
# ------------------------------ Find orthologs ---------------------------------

def find_orthologs(best_bidirectional_hits, paralogs):
    """Attach each query's paralog, when it has one, to its list of hits.

    The inputs are left untouched: a new dict with new lists is returned, so
    calling this repeatedly (e.g. re-running a notebook cell) always gives
    the same result instead of appending the paralog again.

    Args:
        best_bidirectional_hits: Mapping of query ID -> list of hit IDs.
        paralogs: Mapping of query ID -> paralog ID.

    Returns:
        dict mapping every query in ``best_bidirectional_hits`` to a copy of
        its hit list, with ``paralogs[query]`` appended when present.
    """
    orthologs = {}
    for query, hits in best_bidirectional_hits.items():
        combined = list(hits)  # copy so the caller's list is never mutated
        if query in paralogs:
            combined.append(paralogs[query])
        orthologs[query] = combined
    return orthologs

# A. cel-keyed BBHs combined with the A. cel paralogs.
orthologs = find_orthologs(best_bidirectional_hits, AC_found_paralogs)
# R. kris-keyed BBHs combined with the R. kris paralogs.
orthologs_2 = find_orthologs(best_bidirectional_hits_2, RK_found_paralogs)

# Write both tables to orthologs.csv and orthologs_2.csv.
save_to_cvs(orthologs, "orthologs")
save_to_cvs(orthologs_2, "orthologs_2")