In [None]:
import subprocess
from Bio import Entrez
from Bio import SeqIO
from Bio import SearchIO
# contact address Biopython attaches to Entrez (NCBI E-utilities) requests
# NOTE(review): a personal address is hard-coded here — consider loading it
# from configuration before sharing or committing this notebook
Entrez.email = "o.william.white@gmail.com"

In [20]:
# input parameters: NCBI nucleotide accessions fetched by the cells below
#   reference_accession - GenBank record whose annotated CDS genes become the blat database
#   query_accession     - sequence downloaded as FASTA and searched against that database
# NOTE(review): both values are currently the same accession, so the record is
# searched against its own genes — confirm this is intended
reference_accession = "NC_000932.1"
query_accession = "NC_000932.1"

In [None]:
## download genbank of reference data and write annotated genes to fasta

# efetch accession; the handle is closed even if parsing the record fails
handle = Entrez.efetch(db="nucleotide", id=reference_accession, rettype="gb", retmode="text")
try:
    record = SeqIO.read(handle, "gb")
finally:
    handle.close()

# extracted gene records, and how many times each gene name has been seen
gene_list = []
gene_count = {}

# loop through sequence features and collect every CDS that carries a gene name
for feature in record.features:
    if feature.type != "CDS" or "gene" not in feature.qualifiers:
        continue

    gene_name = feature.qualifiers["gene"][0]

    # blat requires reference data with unique names: the first occurrence
    # keeps the plain gene name, later occurrences get a "_<n>" suffix
    gene_count[gene_name] = gene_count.get(gene_name, 0) + 1
    if gene_count[gene_name] > 1:
        gene_name = gene_name + "_" + str(gene_count[gene_name])

    gene_seq = feature.extract(record)
    gene_seq.id = gene_name
    gene_seq.description = ""
    gene_list.append(gene_seq)

# the suffix scheme cannot rule out a clash with a gene literally named
# "<name>_<n>", so fail loudly here rather than hand blat duplicate ids
assert len({gene.id for gene in gene_list}) == len(gene_list), "duplicate gene ids in reference"

# write reference fasta
with open("reference.fasta", "w") as ref:
    SeqIO.write(gene_list, ref, "fasta")


In [58]:
## download query accession and write to fasta

# efetch accession; the handle is closed even if parsing the record fails
# NOTE: this rebinds `record`, replacing the GenBank reference record
# loaded by the previous cell
handle = Entrez.efetch(db="nucleotide", id=query_accession, rettype="fasta", retmode="text")
try:
    record = SeqIO.read(handle, "fasta")
finally:
    handle.close()

# write query fasta
with open("query.fasta", "w") as que:
    SeqIO.write(record, que, "fasta")

In [60]:
# run blat (database = reference genes, query = downloaded sequence).
# check=True raises CalledProcessError when blat exits non-zero, instead of
# silently leaving a stale or missing output.psl for the parsing cell
subprocess.run(["blat", "reference.fasta", "query.fasta", "output.psl"], check=True)

Loaded 79482 letters in 85 sequences
Searched 154478 bases in 1 sequences


CompletedProcess(args=['blat', 'reference.fasta', 'query.fasta', 'output.psl'], returncode=0)

In [61]:
# parse the PSL file written by blat; SearchIO.read expects the file to hold
# results for exactly one query sequence
blat_output = SearchIO.read("output.psl", "blat-psl")

In [25]:
# summary table of the query: one row per hit (gene) with its HSP count
print(blat_output)


Program: blat (<unknown version>)
  Query: NC_000932.1 (154478)
         <unknown description>
 Target: <unknown target>
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      2  rps19  <unknown description>
            1      2  rpl2_2  <unknown description>
            2      2  rpl23_2  <unknown description>
            3      2  rpl23  <unknown description>
            4      2  rpl2  <unknown description>
            5      2  rps7_2  <unknown description>
            6      2  rps7  <unknown description>
            7      3  rps12_2  <unknown description>
            8      3  rps12  <unknown description>
            9      2  ndhF  <unknown description>
           10      1  rpl32  <unknown description>
           11      1  ccsA  <unknown description>
           12      2  ycf1  <unknown description>
           13 

In [62]:
# per-hit detail: print each hit's HSP table (query range and hit range)
for hit in blat_output:
    print(hit)

Query: NC_000932.1
       <unknown description>
  Hit: rps19 (279)
       <unknown description>
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         ?          ?       ?  [154365:154478]                [0:113]
          1         ?          ?       ?    [84004:84283]                [0:279]
Query: NC_000932.1
       <unknown description>
  Hit: rpl2_2 (825)
       <unknown description>
 HSPs: ----  --------  ---------  ------  ---------------  ---------------------
          #   E-value  Bit score    Span      Query range              Hit range
       ----  --------  ---------  ------  ---------------  ---------------------
          0         ?          ?       ?  [152805:154312]                [0:825]
          1         ?          ?       ?    [84336:85843]                [0:82