Common URL API: https://ncbi.github.io/blast-cloud/dev/api.html  
Biopython BLAST code: https://github.com/biopython/biopython/blob/171697883aca6894f8367f8f20f1463ce7784d0c/Bio/Blast/NCBIWWW.py  
Useful comments: https://github.com/biopython/biopython/blob/a338d04074585afe7e1f8f418c9107c05dd5e38d/Doc/Tutorial/chapter_blast.tex  
Example use: http://fenyolab.org/presentations/Introduction_Biostatistics_Bioinformatics_2014/tutorials/week3/BLAST%20with%20BioPython.pdf

In [None]:
# Copyright 1999 by Jeffrey Chang.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
#
# Patched by Brad Chapman.
# Chris Wroe added modifications for work in myGrid

"""
Code to invoke the NCBI BLAST server over the internet.
This module provides code to work with the WWW version of BLAST
provided by the NCBI. https://blast.ncbi.nlm.nih.gov/
"""

___

In [None]:
%load_ext blackcellmagic

___

In [None]:
import time
import sys

from urllib.request import urlopen
from urllib.parse import urlencode
from urllib.request import Request
import requests

import re
from bs4 import BeautifulSoup
from bs4 import Comment

def blast(
    program,
    database,
    sequence,
    ncbi_gi=False,
    descriptions=500,
    alignments=500,
    hitlist_size=50,
    expect=10.0,
    low_comp_filt=False,
    megablast=True,
    format_type="XML",
):
    """
    Run a BLAST search on NCBI's QBLAST server and return the text report.

    The search is submitted with CMD=Put, the server is polled until the
    search finishes, and the results are then requested in Text format.
    Please note that NCBI uses the Common URL API for BLAST searches
    on the internet (https://ncbi.github.io/blast-cloud/dev/api.html). Some
    of the parameters sent by this function may not (or no longer) be
    officially supported by NCBI.

    Parameters:
     - program        blastn, blastp, blastx, tblastn, or tblastx (case-insensitive).
     - database       nt, nr, refseq_rna, refseq_protein, swissprot, pdbaa, or pdbnt
                      (More info: https://ncbi.github.io/blast-cloud/blastdb/available-blastdbs.html)
     - sequence       The sequence to search. If the string contains ".fa", it is
                      treated as the path to a FASTA file and the sequence is read
                      from that file with Biopython.
     - ncbi_gi        True/False whether to return NCBI GI identifiers. Default False.
     - descriptions   int or None. Limit number of descriptions to show. Default 500.
     - alignments     int or None. Limit number of alignments to show. Default 500.
     - hitlist_size   int or None. Limit number of hits to return. Default 50.
     - expect         float or None. An expect value cutoff. Default 10.0.
     - low_comp_filt  True/False whether to low complexity filter. Default False.
     - megablast      True/False whether to use the MEGABLAST algorithm (blastn only). Default True.
     - format_type    Accepted for compatibility but currently ignored: the
                      results are always requested with FORMAT_TYPE=Text.

    Parameters set to None (and boolean flags that are off) are omitted from
    the request. Apart from 'program', values are passed to the server as is.

    Returns the first <pre> element of the results page (a BeautifulSoup
    object), or None if the page has no <pre> element or if the server
    reported the search as failed or expired.

    Raises ValueError if 'program' is not one of the supported programs or if
    the submission page cannot be parsed, and RuntimeError if a status or
    results request returns a status code other than 200.
    """
    # Server rules:
    # 1. Do not contact the server more often than once every 10 seconds.
    # 2. Do not poll for any single RID more often than once a minute.
    # 3. Use the URL parameter email and tool, so that the NCBI
    #    can contact you if there is a problem.
    # 4. Run scripts weekends or between 9 pm and 5 am Eastern time
    #    on weekdays if more than 50 searches will be submitted.
    # Reference: https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=DeveloperInfo

    # Define server URL, content type and the headers shared by all GET requests
    url_base = "https://blast.ncbi.nlm.nih.gov/Blast.cgi"
    content_type = "application/x-www-form-urlencoded"
    client = "ggetClient"
    get_headers = {"Content-Type": content_type, "User-agent": client}

    ## Clean up parameters
    # If the path to a fasta file was provided as sequence,
    # read the file and extract the sequence
    if ".fa" in sequence:
        from Bio import SeqIO

        sequence = SeqIO.read(sequence, format="fasta").seq

    # Program names are compared in lower case
    program = program.lower()
    programs = ["blastn", "blastp", "blastx", "tblastn", "tblastx"]
    if program not in programs:
        raise ValueError(
            "Program specified is %s. Expected one of %s"
            % (program, ", ".join(programs))
        )

    # Boolean flags are sent as their server-side value when on and are
    # left out of the request entirely (None) when off.
    parameters = [
        ("PROGRAM", program),
        ("DATABASE", database),
        ("QUERY", sequence),
        ("NCBI_GI", "T" if ncbi_gi else None),
        ("DESCRIPTIONS", descriptions),
        ("ALIGNMENTS", alignments),
        ("HITLIST_SIZE", hitlist_size),
        ("EXPECT", expect),
        ("FILTER", "T" if low_comp_filt else None),
        ("MEGABLAST", "on" if megablast else None),
        ("CMD", "Put"),
    ]

    ## Define query
    query = [(key, value) for key, value in parameters if value is not None]
    message = urlencode(query).encode()

    ## Submit search
    request = Request(url_base, message, {"User-Agent": client})
    # The context manager closes the HTTP response once it has been parsed.
    with urlopen(request) as handle:
        # Fetch Request ID (RID) and estimated time to completion (RTOE)
        rid, rtoe = _parse_qblast_ref_page(handle)
    sys.stderr.write(f"Request ID: {rid}.\n")

    # Wait for search to complete
    # (At least 10 seconds to comply with server rule 1.)
    sys.stderr.write(f"Estimated time to completion: {rtoe} seconds.\n")
    time.sleep(max(10, rtoe))

    ## Poll NCBI until the results are ready
    status_query = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID="
    first_poll = True
    while True:
        if not first_poll:
            # Sleep for 65 seconds between polls to comply with server rule 2
            time.sleep(65)
        first_poll = False

        # Query for search status
        r_status = requests.get(status_query + rid, headers=get_headers)
        # Raise errors if status code is not 200
        if r_status.status_code != 200:
            raise RuntimeError(f"HTTPS request response status code {r_status.status_code}. Please try again.\n")

        # The status is reported inside an HTML comment containing a
        # "QBlastInfoBegin ... Status=<VALUE> ... QBlastInfoEnd" section.
        soup_status = BeautifulSoup(r_status.content, "html.parser")
        comments = soup_status.find_all(string=lambda text: isinstance(text, Comment))
        info = next((c for c in comments if "QBlastInfoBegin" in c), "")
        status_match = re.search(r"Status=(\w+)", info)
        # None (no status found) falls through to the "unexpected" branch below
        status = status_match.group(1) if status_match else None

        if status == "WAITING":
            sys.stderr.write("Searching...\n")
            continue
        elif status == "FAILED":
            sys.stderr.write(f"Search {rid} failed; please report to blast-help@ncbi.nlm.nih.gov.\n")
            # There are no results to retrieve for a failed search
            return None
        elif status == "UNKNOWN":
            sys.stderr.write(f"Search {rid} expired.\n")
            # There are no results to retrieve for an expired search
            return None
        elif status == "READY":
            sys.stderr.write("Search complete, retrieving results...\n")
        else:
            # Unrecognised status: warn, then still attempt to retrieve results
            sys.stderr.write(f"""
            Something unexpected happened. \n
            Search {rid} possibly failed; please report to blast-help@ncbi.nlm.nih.gov\n
            or post an issue on Github: https://github.com/lauraluebbert/gget\n
            """)
        break

    ## Retrieve search results
    results_query = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&RID="

    r_results = requests.get(results_query + rid, headers=get_headers)
    # Raise errors if status code is not 200
    if r_results.status_code != 200:
        raise RuntimeError(f"HTTPS request response status code {r_results.status_code}. Please try again.\n")

    soup = BeautifulSoup(r_results.content, "html.parser")

    # The text report is taken from the first <pre> element of the page
    return soup.find("pre")

def _parse_qblast_ref_page(handle):
    """
    Extract a tuple of RID, RTOE from the NCBI 'please wait' page.
    RTOE = 'Estimated time to completion.'
    RID = 'Request ID'.

    'handle' must provide a read() method returning bytes. The RID is
    returned as a string and the RTOE as an int.

    Raises ValueError if the RID and/or RTOE cannot be found (including the
    error message from NCBI when one can be extracted from the page), or if
    the RTOE is not an integer.
    """
    s = handle.read().decode()

    def _field(label):
        # Return the stripped text between 'label' and the end of its line,
        # or None if 'label' does not occur in the page. partition() also
        # handles a final line without a trailing newline, where slicing up
        # to find("\n") == -1 would cut off the last character.
        start = s.find(label)
        if start == -1:
            return None
        return s[start + len(label) :].partition("\n")[0].strip()

    rid = _field("RID =")
    rtoe = _field("RTOE =")

    if not rid and not rtoe:
        # Can we reliably extract the error message from the HTML page?
        # e.g.  "Message ID#24 Error: Failed to read the Blast query:
        #       Nucleotide FASTA provided for protein sequence"
        # or    "Message ID#32 Error: Query contains no data: Query
        #       contains no sequence data"
        #
        # Known error markups, tried in order: the message used to occur
        # inside a <div class="error msInf"> entry; in spring 2010 it was
        # inside a <p class="error"> entry.
        known_markups = (
            ('<div class="error msInf">', "</div>"),
            ('<p class="error">', "</p>"),
        )
        for opening, closing in known_markups:
            i = s.find(opening)
            if i != -1:
                msg = s[i + len(opening) :].strip()
                msg = msg.split(closing, 1)[0].split("\n", 1)[0].strip()
                if msg:
                    raise ValueError("Error message from NCBI: %s" % msg)
        # Generic search based on the way the error messages start:
        i = s.find("Message ID#")
        if i != -1:
            # Break the message at the first HTML tag
            msg = s[i:].split("<", 1)[0].split("\n", 1)[0].strip()
            raise ValueError("Error message from NCBI: %s" % msg)
        # If we cannot recognise the error layout:
        raise ValueError(
            "No RID and no RTOE found in the 'please wait' page, "
            "there was probably an error in your request but we "
            "could not extract a helpful error message."
        )
    elif not rid:
        # Can this happen?
        raise ValueError(
            "No RID found in the 'please wait' page. (Although RTOE = %r)" % rtoe
        )
    elif not rtoe:
        # Can this happen?
        raise ValueError(
            "No RTOE found in the 'please wait' page. (Although RID = %r)" % rid
        )

    try:
        return rid, int(rtoe)
    except ValueError:
        # 'from None' hides the uninformative int() traceback
        raise ValueError(
            "A non-integer RTOE found in the 'please wait' page, %r" % rtoe
        ) from None

___

In [None]:
# Example search settings: nucleotide BLAST against the "nt" database.
program, db = "blastn", "nt"

# Query given as a file name; blast() reads the sequence from the file
# because the name contains ".fa".
seq = "seq_results.fa"
# seq = "TTTGTAGTTACATAGCAAAATGCGCGTTTATTTCGGCTCAGTAAATTAAGAACATTTTCGTTACACGTTGCCACGCCCCCCCATCCAACGGCAAACCACCCCCCGCCTCCGCCTTCAAGACAGAGAGAGAGATAGAATAAAACAGAGAGAGAGCAACAACAACAATATATAACCAAAACGAAAGATTTCCGCAAGAAACCACAAAAAAAAACAACAATTGCTTTGGTTCTGATGATTCTCGTAACAGCAAAAACAACAAGATTTATAATCTAAATCAAAAGAAACTGTGCTCTCCGCTGTGCGTGGCTGTGAGTGCGTTTGTGCGGTACGGTGCGTGCATTATAACAATTATTGCACAAAAGGCATGACAAACGTGGGGCGGGGGTGTCTGTAAATGTGAGTGCGATAGAGCGCGACATATTCAAACAAGCTAGGGACGGAGTAGGTGAGCGAGGGAGAGAGAGAACGCGACGGTGGCAACAACAAAGGGCTGTGATTTTACTTTGCCCCCTCCACCCTGTACCCCGTCCCATCCATACACTTGCGTATTTACCAAATAAACGGAACATACATAAGCCAAAATAGGCAGAACAACAACAACATTAAGAGTCAACTAAACAGCAAATCGAATAGTTTCGTGGGAGAAAGGACAGCGTAGCAGAGTCATTGGAAAACTGGCCGCGTAAGGGAGCGCAATGGATGTCCTGGAAATGCTGCGCGCCAGCGCCAGCGGCAGCTACAACACAATATTCTCGGACGCGTGGTGCCAATATGTCTCCAAGCAGATCACAGCAACGGTAAGTGCACTTCACCCTTTCTTTGTCTTTTTATTTGCTTTGACTTTGGCTTGCTGTTTGTTTGCAATATGAGTGAGTCAGTTGGAAGACAATATGTGCGAGAGCGAGAGGA"

In [None]:
# Run the example search with the settings defined above.
blast(program=program, database=db, sequence=seq)