In [1]:
from Bio import Entrez
from pprint import pprint

import pandas as pd

# Contact e-mail set on Bio.Entrez before any request is issued
# (NCBI asks E-utilities users to identify themselves).
Entrez.email = "a.zabelkin@itmo.ru"

### 1. Query the nucleotide database for all sequences matching a gene name

In [2]:
# Search the nucleotide database for human GSPT1 records and keep their UIDs.
# NOTE(review): ESearch caps the number of returned UIDs by default (retmax) --
# confirm the result list is complete if more hits are expected.
esearch_handle = Entrez.esearch(db="nucleotide", term='"human"[Organism] AND GSPT1[Gene]')
try:
    ids = Entrez.read(esearch_handle)['IdList']
finally:
    # Always release the network handle, even if parsing fails.
    esearch_handle.close()

# Fetch the document summaries for all UIDs in a single request
# (the UIDs are passed as one comma-separated string).
esummary_handle = Entrez.esummary(db="nucleotide", id=','.join(ids))
try:
    summaries = Entrez.read(esummary_handle)
finally:
    esummary_handle.close()

### 2. Return the results as a table (UID, accession number, sequence length)

In [3]:
def get_table(summaries):
    """Build a three-column DataFrame from a sequence of summary records.

    Args:
        summaries: iterable of mapping-like records, each providing the
            keys 'Id', 'Caption' and 'Length'.

    Returns:
        pandas.DataFrame with one row per record and the columns
        'UID', 'accession number' and 'sequence length' (in that order).
    """
    source_fields = ('Id', 'Caption', 'Length')
    column_names = ['UID', 'accession number', 'sequence length']

    rows = []
    for summary in summaries:
        # Pick the fields in the same order as the output columns.
        rows.append([summary[field] for field in source_fields])

    return pd.DataFrame(rows, columns=column_names)

# Display the summary table for the search results as this cell's output.
get_table(summaries)

Unnamed: 0,UID,accession number,sequence length
0,1676355513,NM_001130006,7138
1,1676319656,NM_001130007,7166
2,1519312966,NM_002094,7141
3,568815582,NC_000016,90338345
4,74273666,CM000267,75226909
5,74230029,CH471112,14690834
6,33874733,BC009503,2523
7,39754980,AY398991,1562
8,307685420,AB590486,1919


### 3. Return the nucleotide sequences in FASTA format and write them to files


In [4]:
def get_fasta_and_write_to_file(id, folder):
    """Fetch one nucleotide record as FASTA text and save it to ``<folder>/<id>.fasta``.

    Args:
        id: UID of the record, passed to ``Entrez.efetch`` and used as the
            base name of the output file.
        folder: directory the file is written to (with or without a trailing
            path separator). It is created if it does not exist yet.

    Returns:
        The FASTA text that was fetched and written.
    """
    import os  # local import: keeps this cell self-contained

    handle = Entrez.efetch(db="nucleotide", id=id, rettype="fasta", retmode="text")
    try:
        fasta = handle.read()
    finally:
        # Release the network handle even if reading fails.
        handle.close()

    # An empty folder means "current directory"; os.makedirs('') would raise.
    if folder:
        os.makedirs(folder, exist_ok=True)

    # The original used the plain string '{id}.fasta' (missing f-prefix), so
    # every record overwrote one file literally named "{id}.fasta".
    path = os.path.join(folder, f'{id}.fasta')
    with open(path, 'w') as f:
        f.write(fasta)
    return fasta

# Download only the first three search hits.
# The loop variable is not called `id`, so the builtin id() is not shadowed
# for the rest of the kernel session.
for seq_id in ids[:3]:
    print(seq_id)
    get_fasta_and_write_to_file(seq_id, 'data_part1/task3/')

1676355513
1676319656
1519312966


### 4. Download all sequences linked to the paper with a given PMID

In [5]:
# Ask ELink for nucleotide records linked to the PubMed article with PMID 12890024.
elink_handle = Entrez.elink(fromdb='pubmed', db='nucleotide', id='12890024')
try:
    bs = Entrez.read(elink_handle)
finally:
    # Always release the network handle, even if parsing fails.
    elink_handle.close()

# Flatten link sets -> link-set databases -> links into one list of UIDs.
# NOTE: this rebinds `ids`, replacing the search-result UIDs assigned earlier.
ids = [id_element['Id'] for b in bs for link_set_db in b['LinkSetDb'] for id_element in link_set_db['Link']]

# Download only the first three linked records; the loop variable is not
# called `id`, so the builtin id() is not shadowed.
for seq_id in ids[:3]:
    print(seq_id)
    get_fasta_and_write_to_file(seq_id, 'data_part1/task4/')

19568061
19568059
19568057
