In [2]:
import subprocess
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
Entrez.email = "o.william.white@gmail.com"
import pandas as pd

In [3]:
# input parameters
reference_accession = "NC_000932.1"
query_accession = "NC_000932.1"

In [4]:
## download genbank of reference data and write annotated genes to fasta

# efetch accession
handle = Entrez.efetch(db="nucleotide", id=reference_accession, rettype="gb", retmode="text")
record = SeqIO.read(handle, "gb")
handle.close()

# list and dictionary to store annotations
list_genes, dict_genes = [], {}

# loop through sequence feature and append to list
for feature in record.features:
    # cds features
    if feature.type == "CDS" and "gene" in feature.qualifiers.keys():
        # get annotation info
        gene_name = str(feature.qualifiers["gene"][0] + "_" + reference_accession)
        gene_description = feature.qualifiers["product"][0]
        gene_seq = feature.extract(record)
        gene_seq.id = gene_name
        gene_seq.name = gene_name
        gene_seq.description = gene_description
        # add gene to dictionary and list if not already present
        if dict_genes.get(gene_name) == None:
            list_genes.append(gene_seq)
            dict_genes[gene_name] = gene_seq.seq
            
        # identify duplicate annotations
        else:
            if dict_genes.get(gene_name) == gene_seq.seq:
                print(f"Ignoring duplicate annotation for {gene_name}")
        
# write reference fasta
with open("reference_cds.fasta", "w") as ref:
    SeqIO.write(list_genes, ref, "fasta")

Ignoring duplicate annotation for rps12_NC_000932.1
Ignoring duplicate annotation for rps7_NC_000932.1
Ignoring duplicate annotation for ndhB_NC_000932.1
Ignoring duplicate annotation for ycf2_NC_000932.1
Ignoring duplicate annotation for rpl23_NC_000932.1
Ignoring duplicate annotation for rpl2_NC_000932.1


In [58]:
## download query accession and write to fasta

# efetch accession
handle = Entrez.efetch(db="nucleotide", id=query_accession, rettype="fasta", retmode="text")
record = SeqIO.read(handle, "fasta")
handle.close()

# write reference fasta
with open("query.fasta", "w") as que:
    SeqIO.write(record, que, "fasta")

In [66]:
# run blat
subprocess.run(["blat", "-out=blast8", "reference.fasta",  "query.fasta", "output.txt"])

Loaded 79482 letters in 85 sequences
Searched 154478 bases in 1 sequences


CompletedProcess(args=['blat', '-out=blast8', 'reference.fasta', 'query.fasta', 'output.txt'], returncode=0)

In [72]:
blat_output = SearchIO.read("output.txt", "blast-tab")

In [73]:
print(blat_output)


Program: <unknown program> (<unknown version>)
  Query: NC_000932.1
         <unknown description>
 Target: <unknown target>
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      2  ycf2_2  <unknown description>
            1      2  ycf2  <unknown description>
            2      2  ycf1_2  <unknown description>
            3      1  rpoC2  <unknown description>
            4      1  rpoB  <unknown description>
            5      2  psaA  <unknown description>
            6      2  ndhF  <unknown description>
            7      2  psaB  <unknown description>
            8      2  rpoC1  <unknown description>
            9      1  psbB  <unknown description>
           10      1  atpA  <unknown description>
           11      1  matK  <unknown description>
           12      1  atpB  <unknown description>
           13     

In [62]:
for hit in blat_output:
    print(hit)

Query: NC_000932.1
       <unknown description>
  Hit: rps19 (279)
       <unknown description>
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         ?          ?       ?  [154365:154478]                [0:113]
          1         ?          ?       ?    [84004:84283]                [0:279]
Query: NC_000932.1
       <unknown description>
  Hit: rpl2_2 (825)
       <unknown description>
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         ?          ?       ?  [152805:154312]                [0:825]
          1         ?          ?       ?    [84336:85843]                [0:82

In [80]:
blast_fmt = pd.read_csv("output.txt", sep="\t", 
            names=["qseqid", "sseqid", "pident", "length", "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"])
blast_fmt

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,NC_000932.1,ycf2_2,100.0,6885,0,0,86474,93358,1,6885,0.000000e+00,13318.0
1,NC_000932.1,ycf2_2,100.0,6885,0,0,145291,152175,6885,1,0.000000e+00,13318.0
2,NC_000932.1,ycf2,100.0,6885,0,0,86474,93358,1,6885,0.000000e+00,13318.0
3,NC_000932.1,ycf2,100.0,6885,0,0,145291,152175,6885,1,0.000000e+00,13318.0
4,NC_000932.1,ycf1_2,100.0,5361,0,0,123884,129244,5361,1,0.000000e+00,10284.0
...,...,...,...,...,...,...,...,...,...,...,...,...
116,NC_000932.1,psbI,100.0,111,0,0,7583,7693,1,111,1.100000e-55,214.0
117,NC_000932.1,psbM,100.0,105,0,0,28707,28811,105,1,5.400000e-52,202.0
118,NC_000932.1,psbT,100.0,102,0,0,74082,74183,1,102,3.400000e-50,196.0
119,NC_000932.1,petL,100.0,96,0,0,65712,65807,1,96,7.000000e-47,185.0


In [93]:
best_hits = blast_fmt.loc[blast_fmt.groupby('sseqid')['bitscore'].idxmax()]
best_hits

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
21,NC_000932.1,accD,100.0,1467,0,0,57075,58541,1,1467,0.000000e+00,2827.0
17,NC_000932.1,atpA,100.0,1524,0,0,9938,11461,1524,1,0.000000e+00,2957.0
19,NC_000932.1,atpB,100.0,1497,0,0,52660,54156,1497,1,0.000000e+00,2910.0
72,NC_000932.1,atpE,100.0,399,0,0,52265,52663,399,1,4.300000e-224,773.0
68,NC_000932.1,atpF,100.0,411,0,0,11529,11939,555,145,1.000000e-229,792.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4,NC_000932.1,ycf1_2,100.0,5361,0,0,123884,129244,5361,1,0.000000e+00,10284.0
2,NC_000932.1,ycf2,100.0,6885,0,0,86474,93358,1,6885,0.000000e+00,13318.0
0,NC_000932.1,ycf2_2,100.0,6885,0,0,86474,93358,1,6885,0.000000e+00,13318.0
98,NC_000932.1,ycf3,100.0,226,0,0,43526,43751,352,127,1.300000e-122,436.0
