In [7]:
import os
import time

from Bio import Entrez
from Bio import SeqIO

# Test query: search nuccore for insect (txid50557) opsin records annotated as
# "complete cds", excluding voucher entries, and inspect the raw search result.
Entrez.email = "wwyh.2017@gmail.com"     # Always tell NCBI who you are
# Read the API key from the environment instead of committing it to the notebook.
# When NCBI_API_KEY is unset this is None and requests are sent without a key.
Entrez.api_key = os.environ.get("NCBI_API_KEY")  # using api key allows 10 fetch per second
handle = Entrez.esearch(db="nuccore",
                        term="txid50557[Organism:exp] AND (opsin[All Fields] AND complete[All Fields] AND cds[All Fields] NOT voucher)",
                        retmax=1300)  # ask for more than the default 20 IDs
try:
    record = Entrez.read(handle)
finally:
    # always release the HTTP response, even if parsing fails
    handle.close()

print(record)

{'Count': '1265', 'RetMax': '20', 'RetStart': '0', 'IdList': ['2512316970', '2512316968', '2512316966', '2447590709', '2441950478', '2441950476', '2441950474', '2441535910', '2441535908', '2441535906', '2441535902', '2441535900', '2441535898', '2441535896', '2441535894', '2441535892', '2441535890', '2441535888', '2441535886', '2441535884'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'txid50557[Organism:exp]', 'Field': 'Organism', 'Count': '41121869', 'Explode': 'Y'}, {'Term': 'opsin[All Fields]', 'Field': 'All Fields', 'Count': '38288', 'Explode': 'N'}, {'Term': 'complete[All Fields]', 'Field': 'All Fields', 'Count': '57639424', 'Explode': 'N'}, 'AND', {'Term': 'cds[All Fields]', 'Field': 'All Fields', 'Count': '169440166', 'Explode': 'N'}, 'AND', {'Term': 'voucher[All Fields]', 'Field': 'All Fields', 'Count': '11290204', 'Explode': 'N'}, 'NOT', 'GROUP', 'AND'], 'QueryTranslation': 'txid50557[Organism:exp] AND (opsin[All Fields] AND complete[All Fields] AND cds[All Fields] NOT

In [11]:
# create a function that returns the collection of all NCBI fetch results

def ncbi_fetch(email, term, ncbi_db="nuccore", rettype="gb", format="genbank", api_key=None):
    """Search an NCBI database and download every matching record.

    Parameters
    ----------
    email : str
        Contact address assigned to ``Entrez.email`` (NCBI asks callers to identify themselves).
    term : str
        Entrez search term passed to ``Entrez.esearch``.
    ncbi_db : str
        Entrez database used for both the search and the fetch.
    rettype : str
        ``rettype`` passed to ``Entrez.efetch``.
    format : str
        Format name passed to ``SeqIO.read`` to parse each fetched record.
    api_key : str or None
        NCBI API key. When None, the ``NCBI_API_KEY`` environment variable is
        used; if that is unset too, requests are sent without a key.

    Returns
    -------
    list
        One parsed record (whatever ``SeqIO.read`` returns) per ID found by the
        search, in the order the search listed the IDs.

    Notes
    -----
    NOTE(review): NCBI documents an upper bound on ``retmax`` for ESearch
    (10,000 UIDs per request at the time of writing); result sets larger than
    that would need ``retstart`` paging - confirm before using broad terms.
    """
    Entrez.email = email    # Always tell NCBI who you are
    if api_key is None:
        api_key = os.environ.get("NCBI_API_KEY")
    # Only send the api_key parameter when a key is actually configured.
    auth = {"api_key": api_key} if api_key else {}

    def search(**extra):
        # Run one esearch and always close the response handle.
        handle = Entrez.esearch(db=ncbi_db, term=term, **auth, **extra)
        try:
            return Entrez.read(handle)
        finally:
            handle.close()

    # The first search reports the total hit count but lists only the default
    # page of IDs; when there are more hits, search again asking for all of them
    # and use THAT result's IdList.
    record = search()
    full_res = int(record["Count"])
    if full_res > len(record["IdList"]):
        record = search(retmax=full_res)
    q_list = record["IdList"]

    # create a list for all the sequences fetched from NCBI
    NCBI_seq = []
    for x in q_list:
        # fetch one record found by the search above
        fet = Entrez.efetch(db=ncbi_db, id=x, rettype=rettype, **auth)
        try:
            seq = SeqIO.read(fet, format)
        finally:
            fet.close()

        NCBI_seq.append(seq)

        # pause between requests to stay well inside NCBI's rate limits
        time.sleep(0.5)

    return NCBI_seq
  

In [None]:
# Download every record matching the insect opsin "complete cds" query
# (voucher entries excluded); one request is made per record, so this is slow.
NCBI_seq = ncbi_fetch(
    email="wwyh.2017@gmail.com",
    term="txid50557[Organism:exp] AND (opsin[All Fields] AND complete[All Fields] AND cds[All Fields] NOT voucher)",
)

In [39]:
# create empty lists (one entry is appended to each of them per record,
# so they all end up the same length and can become DataFrame columns)
Accession = []
DNA = []
Genus = []
Species = []
gene_des = []
version = []
Protein = []

# loop through the result list obtained from the NCBI search
# (original author's timing notes: may take over 10 minutes;
#  took 16m 19.3s in 1st complete run)
for seq in NCBI_seq:
    # get genus and species name separately
    spe_name = seq.annotations["organism"]
    g_s_name = spe_name.split()

    # get the protein sequence from the CDS feature's "translation" qualifier.
    # Reset per record so a record without a translated CDS gets None rather
    # than the previous record's protein; if several CDS features carry a
    # translation, the last one wins.
    pro_seq = None
    for feature in seq.features:
        if feature.type == "CDS" and "translation" in feature.qualifiers:
            pro_seq = feature.qualifiers['translation'][0]

    # attach them to the lists
    Accession.append(seq.name)
    DNA.append(seq.seq)
    # organism names with fewer than two words get an empty string instead of raising IndexError
    Genus.append(g_s_name[0] if g_s_name else "")
    Species.append(g_s_name[1] if len(g_s_name) > 1 else "")
    gene_des.append(seq.description)
    version.append(seq.id)
    Protein.append(pro_seq)


In [42]:
import pandas as pd

# Assemble the per-record lists into a single table: one column per list,
# one row per fetched record.
ncbi_insecta_op = pd.DataFrame(data={
    'Genus': Genus,
    'Species': Species,
    'Accession': Accession,
    'DNA': DNA,
    'Protein': Protein,
    'gene_des': gene_des,
    'version': version,
})


Unnamed: 0,Genus,Species,Accession,DNA,Protein,gene_des,version
0,Anopheles,stephensi,LC710140,"(A, T, G, T, T, T, C, T, G, G, T, G, A, A, T, ...","(M, F, L, V, N, E, T, M, A, E, G, A, M, L, L, ...",Anopheles stephensi Asop9 mRNA for SWS visual ...,LC710140.1
1,Anopheles,stephensi,LC710139,"(A, T, G, G, G, T, A, T, T, G, T, C, C, A, G, ...","(M, G, I, V, Q, L, D, N, Q, T, A, Y, R, P, E, ...",Anopheles stephensi Asop8 mRNA for UV visual o...,LC710139.1
2,Anopheles,stephensi,LC710138,"(A, T, G, G, C, A, G, C, C, T, T, C, G, T, T, ...","(M, A, A, F, V, E, P, H, F, D, A, W, T, Q, G, ...",Anopheles stephensi Asop1 mRNA for LWS visual ...,LC710138.1
3,Zeugodacus,cucurbitae,ON693226,"(A, T, C, A, T, T, C, A, C, A, G, C, A, T, C, ...","(I, I, H, S, I, Q, S, I, A, G, N, F, K, T, M, ...","Zeugodacus cucurbitae opsin Rh4 mRNA, complete...",ON693226.1
4,Phauda,flammans,ON383210,"(A, T, G, A, A, A, A, T, G, G, C, A, A, C, C, ...","(M, K, M, A, T, N, F, S, E, Y, G, I, G, P, V, ...","Phauda flammans opsin Blue mRNA, complete cds",ON383210.1
...,...,...,...,...,...,...,...
1260,Antheraea,pernyi,AB073299,"(A, G, T, A, G, C, C, T, C, C, A, C, T, G, C, ...","(S, S, L, H, C, L, G, V, P, A, A, Q, *, D, W, ...","Antheraea pernyi mRNA for anceropsin, complete...",AB073299.1
1261,Bombyx,mori,AB064496,"(A, G, T, A, G, C, C, T, C, C, A, C, T, G, C, ...","(S, S, L, H, C, L, G, N, P, A, T, P, R, T, T, ...","Bombyx mori mRNA for Boceropsin, complete cds",AB064496.1
1262,Papilio,xuthus,AB007425,"(G, C, G, G, C, C, G, C, T, A, G, C, G, C, A, ...","(A, A, A, S, A, L, R, A, S, K, M, A, L, N, Y, ...","Papilio xuthus PxRh3 mRNA for opsin, complete cds",AB007425.1
1263,Papilio,xuthus,AB007424,"(A, A, C, G, A, T, T, T, T, C, T, T, C, A, G, ...","(N, D, F, L, Q, Y, F, C, Y, *, P, S, Q, A, N, ...","Papilio xuthus PxRh2 mRNA for opsin, complete cds",AB007424.1


In [43]:
# Save the table as CSV (row index omitted) for further comparison.
ncbi_insecta_op.to_csv(path_or_buf="ncbi_insecta_op.csv", index=False)

In [None]:
# create a list of every distinct species' full name that appeared in the NCBI search.
# A set removes the duplicates; sorting makes the order identical on every run
# (plain set order of strings changes between kernel restarts), so anything
# derived from f_name is reproducible.
f_name = sorted({seq.annotations["organism"] for seq in NCBI_seq})
len(f_name)

In [None]:
# create a function that outputs each species' NCBI popularity in a dataframe alongside the species name

def sp_pop(f_name, email, ncbi_db="nuccore", api_key=None):
    """Rate each name by how many records an NCBI search for it returns.

    Each name is used as-is as the search term. The hit count reported by the
    search is classified into 3 levels: 3 for >= 10,000 results, 2 for
    1,000-9,999 results, 1 for 0-999 results.

    Parameters
    ----------
    f_name : iterable of str
        Names to look up (any iterable; it is copied into a list first).
    email : str
        Contact address assigned to ``Entrez.email``.
    ncbi_db : str
        Entrez database to search.
    api_key : str or None
        NCBI API key. When None, the ``NCBI_API_KEY`` environment variable is
        used; if that is unset too, requests are sent without a key.

    Returns
    -------
    pandas.DataFrame
        Columns ``name`` and ``popularity``, one row per input name, in input order.
    """
    Entrez.email = email     # Always tell NCBI who you are
    if api_key is None:
        api_key = os.environ.get("NCBI_API_KEY")
    # Only send the api_key parameter when a key is actually configured.
    auth = {"api_key": api_key} if api_key else {}

    # Materialise once so a generator is not exhausted before the frame is built.
    names = list(f_name)
    pop_level = []

    for name in names:
        handle = Entrez.esearch(db=ncbi_db, term=name, **auth)
        try:
            record = Entrez.read(handle)
        finally:
            handle.close()

        hits = int(record["Count"])
        if hits >= 10000:
            pop_level.append(3)
        elif hits >= 1000:
            pop_level.append(2)
        else:
            pop_level.append(1)

    # combine the two lists to a df and return
    return pd.DataFrame(
        {"name": names,
         "popularity": pop_level
        })

In [None]:
# Rate every insect species found above by its number of NCBI search results.
insect_pop = sp_pop(f_name=f_name, email="wwyh.2017@gmail.com", ncbi_db="nuccore")

In [None]:
# select the most popular species (popularity level 3)
pop_3 = insect_pop.loc[insect_pop['popularity'] == 3]
# only a cell's last expression is displayed, so print the count explicitly
print(len(pop_3))
pop_3

In [None]:
# Write the level-3 species table to a CSV file (row index omitted).
pop_3.to_csv(path_or_buf="hotspot_insect_species.csv", index=False)