<a href="https://colab.research.google.com/github/mervekldrm/bioinformatics/blob/main/ralstonia_solanacearum_genomes_elink.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:

# If running in Colab, ensure Biopython is installed
!pip -q install biopython


In [8]:

from Bio import Entrez
from Bio import SeqIO
from pathlib import Path

# --- REQUIRED: Fill in your email (NCBI requires a valid email) ---
# The address is attached to Entrez requests made by the cells below;
# replace it with your own before re-running this notebook.
Entrez.email = "mervekaldirim@posta.mu.edu.tr"

# Optional: If you have an NCBI API key, uncomment and set it to speed up requests
# Entrez.api_key = "YOUR_NCBI_API_KEY"

# Destination of the combined FASTA file written by the fetch step.
# This is an absolute path under /content; the download cell at the end of the
# notebook points at this same location.
out_fasta = Path("/content/R_solanacearum_genomes.fasta")


## 1) Get the Taxonomy ID for *Ralstonia solanacearum*

In [10]:

from xml.etree import ElementTree as ET

# Search the NCBI Taxonomy database for the species by its exact scientific name.
term = "Ralstonia solanacearum[Scientific Name]"
with Entrez.esearch(db="taxonomy", term=term, retmode="xml") as handle:
    tx = Entrez.read(handle)

# The first returned UID is used as the taxon ID; no hit at all is a hard error
# because every later cell depends on `tax_id`.
tax_hits = tx["IdList"]
if tax_hits:
    tax_id = tax_hits[0]
else:
    raise RuntimeError("No taxonomy ID found for Ralstonia solanacearum.")

print("TaxID:", tax_id)


TaxID: 305


## 2) Use `Entrez.elink` from Taxonomy ➜ Nuccore

In [11]:

# Follow the Taxonomy -> Nuccore link to gather the nucleotide records associated
# with this taxon. "taxonomy_nuccore" is the link name for that relationship.
with Entrez.elink(
    dbfrom="taxonomy",
    db="nuccore",
    id=tax_id,
    linkname="taxonomy_nuccore",
    retmode="xml",
) as handle:
    linkset = Entrez.read(handle)

# Flatten the UIDs of every link set that targets nuccore into a single list.
linked_ids = [
    link["Id"]
    for link_db in linkset[0].get("LinkSetDb", [])
    if link_db.get("DbTo") == "nuccore"
    for link in link_db.get("Link", [])
]
print(f"Total linked nuccore IDs: {len(linked_ids)}")

# Sanity check: drop repeated UIDs while keeping first-seen order.
linked_ids = list(dict.fromkeys(linked_ids))
print(f"Unique nuccore IDs: {len(linked_ids)}")

# Switch for the optional title-based filter in the next cell.
# True  -> keep only records whose ESummary title contains "genome" (a rough heuristic).
# False -> deliver all linked nuccore records for the taxon.
FILTER_FOR_GENOME_TITLES = True


Total linked nuccore IDs: 63149
Unique nuccore IDs: 63149


### (Optional) Filter for records with 'genome' in the title

In [5]:

def filter_for_genome_titles(id_list, chunk_size: int = 200) -> list:
    """Return the UIDs from ``id_list`` whose nuccore summary title mentions "genome".

    The IDs are sent to ``Entrez.esummary`` in chunks of at most ``chunk_size``
    and each returned document's ``Title`` is checked, case-insensitively, for
    the substring "genome". This is a title heuristic only.

    Args:
        id_list: Sequence of nuccore UIDs (strings, or values convertible with ``str``).
        chunk_size: Maximum number of UIDs per ESummary request; must be >= 1.

    Returns:
        List with the ``Id`` of every matching summary document, in the order
        the summaries were returned.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. A negative step would make
            ``range`` empty and silently discard every ID.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    kept = []
    for start in range(0, len(id_list), chunk_size):
        chunk = id_list[start:start + chunk_size]
        # str() lets callers pass integer UIDs as well as strings.
        with Entrez.esummary(db="nuccore", id=",".join(map(str, chunk)), retmode="xml") as h:
            summ = Entrez.read(h)
        for doc in summ:
            # A missing or None title is treated as an empty string.
            title = doc.get("Title", "") or ""
            # Keep entries that look like genome-level sequences.
            if "genome" in title.lower():
                kept.append(doc["Id"])
    return kept

# Decide which UIDs are passed on to the download step.
if not FILTER_FOR_GENOME_TITLES:
    # Filtering disabled: every linked nuccore record is a target.
    target_ids = linked_ids
else:
    print("Filtering for 'genome' in titles via ESummary...")
    filtered_ids = filter_for_genome_titles(linked_ids)
    print(f"Kept {len(filtered_ids)} of {len(linked_ids)} linked records that look like genome entries.")
    # If the filter keeps nothing, fall back to the complete linked list.
    target_ids = filtered_ids or linked_ids

print(f"Target ID count: {len(target_ids)}")
if len(target_ids) == 0:
    raise RuntimeError("No target IDs to fetch. Try setting FILTER_FOR_GENOME_TITLES=False to include all linked nuccore records.")


Filtering for 'genome' in titles via ESummary...
Kept 46693 of 63149 linked records that look like genome entries.
Target ID count: 46693


## 3) Fetch sequences and write a single FASTA file

In [6]:

def batched(iterable, n):
    """Yield consecutive slices of ``iterable``, each holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing; the final slice may be
    shorter than ``n``.
    """
    yield from (iterable[start:start + n] for start in range(0, len(iterable), n))

batch_size = 200  # reasonable batch for efetch
count_written = 0

# Write/overwrite FASTA: open the output once in write mode. This truncates any
# previous file and avoids re-opening it in append mode for every batch.
with out_fasta.open("w") as f:
    for batch in batched(target_ids, batch_size):
        with Entrez.efetch(db="nuccore", id=",".join(batch), rettype="fasta", retmode="text") as handle:
            fasta_chunk = handle.read()
        # Skip responses that contain nothing but whitespace.
        if fasta_chunk.strip():
            # Make sure the chunk ends with a newline so the first header of the
            # next batch can never be glued onto this batch's last sequence line.
            if not fasta_chunk.endswith("\n"):
                fasta_chunk += "\n"
            f.write(fasta_chunk)
            # Count sequences by '>' lines
            count_written += fasta_chunk.count("\n>") + (1 if fasta_chunk.startswith(">") else 0)
        print(f"Fetched & appended batch with {len(batch)} IDs. Total sequences written so far: {count_written}")

print("\nDone.")
print("FASTA saved to:", out_fasta)
# The count is approximate: it is derived from header lines, not parsed records.
print("Approximate record count (headers):", count_written)


Fetched & appended batch with 200 IDs. Total sequences written so far: 198
Fetched & appended batch with 200 IDs. Total sequences written so far: 396
Fetched & appended batch with 200 IDs. Total sequences written so far: 594
Fetched & appended batch with 200 IDs. Total sequences written so far: 792
Fetched & appended batch with 200 IDs. Total sequences written so far: 990
Fetched & appended batch with 200 IDs. Total sequences written so far: 1189
Fetched & appended batch with 200 IDs. Total sequences written so far: 1387
Fetched & appended batch with 200 IDs. Total sequences written so far: 1585
Fetched & appended batch with 200 IDs. Total sequences written so far: 1783
Fetched & appended batch with 200 IDs. Total sequences written so far: 1981
Fetched & appended batch with 200 IDs. Total sequences written so far: 2179
Fetched & appended batch with 200 IDs. Total sequences written so far: 2377
Fetched & appended batch with 200 IDs. Total sequences written so far: 2576
Fetched & appende

## 4) Quick verification

In [7]:

# Show first 10 lines of the resulting FASTA file to verify.
# Only those lines are read from disk; the file can be very large, so it is
# not loaded into memory as a whole.
with out_fasta.open() as fasta_file:
    # zip() stops after 10 items without consuming an 11th line from the file.
    preview_lines = [line.rstrip("\n") for _, line in zip(range(10), fasta_file)]
print("\n".join(preview_lines))



>CBINGP010000001.1 MAG TPA_asm: Ralstonia solanacearum isolate SRR16817036_concoct_32 genome assembly, contig: ERZ27393611.2, whole genome shotgun sequence
CCAAACCAAACCAAACCAAACCAAACCACCATCAGCCAAACACCAACGCCCCCCACAAAGTTTGCACACA
CTCCCACAACAACCCTCATGCCTCATACATCAACGCACGAACCCCAAGCCTTCTAGAGCCCCCCCTCGAA
TTGCGACCTTGGCGTTTTAAAACATCCGTTCCTAACATGCCCGCACCACCCAATACATACGTTTGAATCG
TACGTTCGAGCGCTGCACCGCAGCAATGGAACGCGTGATCGGCCCCCAAAACAACGAGGCAGGAGACATG
GCCCGTCCTGGCGCAGGAGACACCAAGAACCGCATTCTGGAAGCCACCGAACTGCTGTTCATCGAGTTCG
GCTACGAGGCGATGTCGTTGCGGCAGATCACGGCGCGCGCCAAGGTCAACCTGGCCGCCGTCAACTATCA
CTTCGGCAGCAAGGAAGCGCTGATGCAGTCGGTGCTCGGCCGCCGGCTCGACCCGCTCAACACGCGCCGG
CTGGCGCTGCTGACCGCGTGCGAGGAGCGCTGGCCGCAGCGGCTGAGCTGCGAGCATGTGCTGGGCGCGC


In [12]:
from google.colab import files

# Colab-only: trigger a browser download of the FASTA written above.
# Reuse the configured output path instead of repeating the literal, so the
# download always matches the file that was actually written.
files.download(str(out_fasta))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>