### Imports
- os - to create the output directory for sequences
- Bio.Entrez - Biopython module for communicating with NCBI's Entrez databases
- Bio.SeqIO - Biopython module for reading and writing sequence files (FASTA below)

In [24]:
import os

from Bio import Entrez
from Bio import SeqIO

### Configure output folder

In [25]:
# Relative folder that receives the sequence files written later in the notebook.
output_directory = "dbs"

# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(output_directory, exist_ok=True)
# Last expression of the cell: displays the working directory the relative path resolves against.
os.getcwd()

'C:\\Users\\Komputer\\source\\pw\\persistent\\bachelors\\research\\databases'

### Configure Entrez
- Entrez requires a contact e-mail address; the tool name additionally identifies this client application

In [26]:
# Contact address Biopython attaches to Entrez requests.
Entrez.email = "patryk.gryz.stud@pw.edu.pl"
# Name under which this client identifies itself to NCBI.
Entrez.tool = "sequences-databases-research"

### Search database for sequences
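- search the `nucleotide` database for sequences of length 1000 that have the `gene_in_genomic` property; `retstart=10` skips the first 10 hits, and only one page of identifiers (20 in the output below) is returned, not every match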

In [27]:
# Query the "nucleotide" database; retstart=10 makes the returned page begin after the first 10 hits.
with Entrez.esearch(
    db="nucleotide",
    term="(all[Filter]) AND (1000[Sequence Length]) AND (gene_in_genomic[PROP])",
    retstart=10
) as handle:
    # Parse the search response into a dict-like record.
    record = Entrez.read(handle)
    # Fall back to an empty list when the record has no "IdList" entry.
    identifiers = record.get("IdList", [])
    
# Last expression of the cell: displays the collected identifiers.
identifiers

['2687871986', '2687867249', '2687866140', '2687860224', '2687856402', '2687744291', '2687735969', '2687730299', '2687727002', '2687717978', '2687711865', '2687711319', '2687699374', '2687694164', '2687686868', '2687674807', '2687667051', '2687653162', '2687653128', '2687649762']

### Download first sequence

In [28]:
# Fetch only the first search hit, requested as plain-text FASTA.
# NOTE(review): identifiers[0] raises IndexError if the search above returned no hits.
with Entrez.efetch(
    db="nucleotide",
    id=identifiers[0],
    rettype="fasta",
    retmode="text"
) as handle:
    # SeqIO.read parses a handle that contains exactly one record.
    seq = SeqIO.read(handle, "fasta")

# Last expression of the cell: displays the parsed SeqRecord.
seq

SeqRecord(seq=Seq('CTACCTTTTGGCCAACTGCCTGTTGCTTTGAAAAAGACTGTAAGAGGCTCTATA...TTT'), id='XR_010080109.1', name='XR_010080109.1', description='XR_010080109.1 PREDICTED: Prinia subflava uncharacterized LOC134549442 (LOC134549442), transcript variant X7, ncRNA', dbxrefs=[])

### Write with SeqIO

In [29]:
# Save the record as "<first identifier>.fasta" inside output_directory.
# The value displayed below the cell is SeqIO.write's return value (the number of records written).
SeqIO.write(seq, os.path.join(output_directory, f"{identifiers[0]}.fasta"), "fasta")

1

## Simple Script

In [None]:
import os
import logging
from typing import List

from Bio import Entrez


def configure_entrez():
    """Set the identification fields Biopython's ``Entrez`` module sends to NCBI.

    Assigns the module-level ``Entrez.tool`` and ``Entrez.email`` attributes;
    call it once before issuing any Entrez request.
    """
    # Client name reported to NCBI.
    Entrez.tool = "Sequences-databases-research"
    # NOTE(review): "<email>" looks like a placeholder — confirm a real contact
    # address is supplied before running against NCBI.
    Entrez.email = "<email>"


def download_single_nucleotide(identifier: str, filename: str) -> None:
    """Download one nucleotide record from NCBI as FASTA text and save it.

    Args:
        identifier: Identifier of the record in the ``nucleotide`` database,
            as returned by an Entrez search.
        filename: Path of the file to (over)write with the FASTA text. The
            parent directory must already exist.

    Raises:
        OSError: If ``filename`` cannot be opened for writing.
        Any exception raised by ``Entrez.efetch`` or by reading its response
        propagates unchanged.
    """
    # Read the complete response BEFORE touching the destination file: opening
    # the file first would truncate it, so a failed or interrupted download
    # would leave an empty/partial .fasta (or destroy a previously good one).
    with Entrez.efetch(
            db="nucleotide", id=identifier, rettype="fasta", retmode="text"
    ) as handle:
        fasta_text = handle.read()

    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(fasta_text)


def search_database() -> List[str]:
    """Search the NCBI ``nucleotide`` database and return the matching record IDs.

    The query selects records of sequence length 1000 that carry the
    ``gene_in_genomic`` property. ``retstart=10`` makes the returned page start
    after the first 10 hits. The parsed search record is logged at INFO level.

    Returns:
        The record's ``"IdList"`` entry, or an empty list when the parsed
        response has no such key.
    """
    query = "(all[Filter]) AND (1000[Sequence Length]) AND (gene_in_genomic[PROP])"

    with Entrez.esearch(db="nucleotide", term=query, retstart=10) as response:
        record = Entrez.read(response)
        logging.info(f"Record: {record}")

    return record.get("IdList", [])


def test_download_first():
    """Smoke test: run the search and download only the first hit.

    The record is saved as ``databases/<identifier>.fasta``; the ``databases``
    directory must already exist. When the search returns no identifiers,
    nothing is downloaded and a warning is logged.
    """
    # Distinct names for the list and the single element: the original reused
    # `identifier` for both, rebinding the list inside its own loop.
    identifiers = search_database()

    logging.debug(f"Identifiers: {identifiers}")

    if not identifiers:
        logging.warning("Search returned no identifiers; nothing to download")
        return

    # Only the first hit is needed, so index it directly instead of looping
    # and breaking after one iteration.
    first_identifier = identifiers[0]
    logging.info(f"Downloading {first_identifier}")
    download_single_nucleotide(first_identifier, f"databases/{first_identifier}.fasta")


if __name__ == "__main__":
    # Lower the root logger's threshold so the DEBUG/INFO messages emitted by
    # the helper functions are not filtered out.
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)

    # The download target folder has to exist before test_download_first()
    # writes into it.
    os.makedirs("databases", exist_ok=True)
    configure_entrez()
    test_download_first()
