In [1]:
import os
import time
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.display.max_rows = 50
%matplotlib inline

# Biopython Entrez
Provides code to access NCBI over the WWW.  

[Package info](https://biopython.org/DIST/docs/api/Bio.Entrez-module.html)

* `api_key`: Personal API key from NCBI. Without one, only 3 queries per second are allowed; with a valid API key, the limit is 10 queries per second.

### 1. Basics

In [2]:
from Bio import Entrez

In [3]:
from Bio import SeqIO

In [4]:
# Fetch nucleotide record NC_045512 as GenBank flat text and display only its
# first (LOCUS) line.
handle = Entrez.efetch(db="nucleotide", id="NC_045512",
                       rettype="gb", retmode="text")
try:
    locus_line = handle.readline().strip()
finally:
    # Close the remote handle instead of leaving it open in the kernel.
    handle.close()
locus_line

'LOCUS       NC_045512              29903 bp ss-RNA     linear   VRL 18-JUL-2020'

### 2. Search via `esearch`

In [6]:
# Search the nucleotide database for "SARS-CoV-2", asking for at most ten
# hits and for accession-style identifiers (idtype="acc").
handle = Entrez.esearch(
    db="nucleotide",
    term="SARS-CoV-2",
    idtype="acc",
    retmax=10,
)
record = Entrez.read(handle)
handle.close()

# Dump the parsed result (hit count, id list, query translation).
print(record)

{'Count': '12266', 'RetMax': '10', 'RetStart': '0', 'IdList': ['NC_045512.2', 'NM_021804.3', 'NM_001371415.1', 'MT773134.1', 'MT773133.1', 'MT772580.1', 'MT772579.1', 'MT772578.1', 'MT772577.1', 'MT772576.1'], 'TranslationSet': [{'From': 'SARS-CoV-2', 'To': '"Severe acute respiratory syndrome coronavirus 2"[Organism] OR SARS-CoV-2[All Fields]'}], 'TranslationStack': [{'Term': '"Severe acute respiratory syndrome coronavirus 2"[Organism]', 'Field': 'Organism', 'Count': '11145', 'Explode': 'Y'}, {'Term': 'SARS-CoV-2[All Fields]', 'Field': 'All Fields', 'Count': '12266', 'Explode': 'N'}, 'OR', 'GROUP'], 'QueryTranslation': '"Severe acute respiratory syndrome coronavirus 2"[Organism] OR SARS-CoV-2[All Fields]'}


### 3. Method from `coronaversing`
This method fetches the latest data from NCBI using search keywords and a genome size range.

In [7]:
# ---- data folders ----
root_folder = "./data/genbank/"
viruses = ["coronaviridae"]

# ---- cache file names ----
# These files are periodically removed to update with the latest data.
cache_vrs_file = ".cache_vrs.pkl"
cache_cds_file = ".cache_cds.pkl"
cache_regions_file = ".cache_regions.pkl"

# ---- column groups ----
# Presumably the columns left out when tables are displayed; confirm against
# the code that consumes these lists.
hidden_columns = [
    "sequence",
    "file_path",
    "strain",
    "collection_date",
]
hidden_cds_columns = [
    "translation",
    "file_path",
    "location",
]
hidden_regions_columns = [
    "id",
    "rna",
    "protein",
    "path",
]


# ---- download target ----
# NOTE(review): this differs from root_folder above ("../" vs "./"); the
# download loop builds its paths from base_folder.
base_folder = "../data/genbank/"
# TODO: set a contact address before querying NCBI, e.g.
# Entrez.email = 'you@example.com'

# ---- search query ----
# Sequence-length filter (25000 to 35000). Despite the variable name, no date
# restriction is included.
size_and_date = '("25000"[SLEN] : "35000"[SLEN])'
genbank_search_query = {
    "coronaviridae": [f'("Coronaviridae"[Organism]) AND {size_and_date}'],
}

def fetch_gb(path, gid):
    """Download one GenBank record from NCBI and save it as ``<path>/<gid>.gb``.

    Parameters
    ----------
    path : str
        Existing directory that receives the file.
    gid : str
        Nucleotide identifier passed to ``Entrez.efetch``; also used as the
        file name stem.

    Raises
    ------
    Exception
        Any error raised while fetching, reading or writing is propagated to
        the caller. The remote handle is closed in every case.
    """
    # Request GenBank flat text explicitly, matching the other efetch call in
    # this notebook, rather than relying on the server's default retmode.
    handle = Entrez.efetch(db="nucleotide", id=gid, rettype="gb", retmode="text")
    try:
        # Read the whole response before touching the disk: if the download
        # fails, no empty .gb file is created that a later existence check
        # would mistake for a finished download.
        genbank_text = handle.read()
    finally:
        handle.close()

    with open(path + "/" + gid + ".gb", 'w') as local_file:
        local_file.write(genbank_text)

# Expose the query table under the name the download loop indexes
# (genome_files[vrs]) and iterate over its keys as the set of viruses.
# This replaces the list assigned to `viruses` earlier in the cell.
genome_files = genbank_search_query
viruses = genome_files.keys()

In [8]:
# List the virus keys that the download loop below will iterate over.
for vrs in viruses:
    print(vrs)

coronaviridae


In [None]:
# Download every GenBank record matched by the configured search queries.
# Files already on disk are skipped, so the cell can be re-run to resume an
# interrupted download.
MAX_RETRIES = 5          # extra attempts after the first failed download
RETRY_DELAY_SECONDS = 3  # pause between attempts
SEARCH_RETMAX = 5000     # maximum number of ids requested per search

count = 0  # number of .gb files present on disk (pre-existing or just fetched)
for vrs in viruses:

    data_folder = base_folder + vrs
    os.makedirs(data_folder, exist_ok=True)

    for keywords in genome_files[vrs]:
        handle = Entrez.esearch(db="nucleotide", retmax=SEARCH_RETMAX, term=keywords, idtype="acc")
        try:
            record = Entrez.read(handle)
        finally:
            # Close the search handle even if parsing the response fails.
            handle.close()
        ids = record["IdList"]
        print("[+] got", len(ids), "records")
        if int(record["Count"]) > len(ids):
            # The search matched more records than were returned; say so
            # instead of silently working on a truncated list.
            print("[+] warning:", record["Count"], "records match, only", len(ids), "returned")

        for genbank_id in ids:
            file_name = data_folder + "/" + genbank_id + ".gb"
            if os.path.exists(file_name):
                count += 1
                continue

            print("[+]  --- * fetching (", vrs, ")", file_name)
            for attempt in range(MAX_RETRIES + 1):
                try:
                    fetch_gb(data_folder, genbank_id)
                except Exception as err:
                    # Catch Exception rather than everything, so that
                    # Ctrl-C / kernel interrupt still stops the cell.
                    # Remove any partial file so that a re-run retries it.
                    if os.path.exists(file_name):
                        os.remove(file_name)
                    if attempt == MAX_RETRIES:
                        print("[+]  --- * couldn't get (", vrs, ")", genbank_id, ":", err)
                    else:
                        time.sleep(RETRY_DELAY_SECONDS)
                else:
                    # Count only files that were actually written.
                    count += 1
                    break

print("[+] Collected", count, " gb files")