In [61]:
import requests
import pandas as pd
from io import StringIO
import os
import urllib.request

In [62]:
base_url = "https://ftp.ncbi.nlm.nih.gov/genomes/refseq/"

domains = [
    'archaea', 'bacteria', 'fungi', 'invertebrate',
    'plant', 'protozoa', 'vertebrate_mammalian',
    'vertebrate_other', 'viral'
]

# no assembly_summary.txt files for plastid, plasmid, or mitochondrion


In [63]:
assembly_dir = "assembly_summaries"
genome_sequences_dir = "genome_sequences"
os.makedirs(genome_sequences_dir, exist_ok=True)

def download_assembly_summary(domain):
    url = f"{base_url}{domain}/assembly_summary.txt"
    local_path = os.path.join(assembly_dir, f"{domain}_assembly_summary.txt")
    try:
        response = requests.get(url)
        response.raise_for_status()
        lines = response.text.splitlines()
        if lines[0].startswith("##"):
            lines = lines[1:] 
        if lines[0].startswith("#"):
            lines[0] = lines[0][1:]  
        with open(local_path, 'w') as file:
            file.write("\n".join(lines))
        print(f"downloaded and cleaned the {domain} assembly summary.")
    except Exception as e:
        print(f"error downloading {domain} assembly summary: {e}")

for domain in domains:
    download_assembly_summary(domain)

Downloaded and cleaned archaea assembly summary.
Downloaded and cleaned bacteria assembly summary.
Downloaded and cleaned fungi assembly summary.
Downloaded and cleaned invertebrate assembly summary.
Downloaded and cleaned plant assembly summary.
Downloaded and cleaned protozoa assembly summary.
Downloaded and cleaned vertebrate_mammalian assembly summary.
Downloaded and cleaned vertebrate_other assembly summary.
Downloaded and cleaned viral assembly summary.


In [64]:
#this code reads the assembly summary file for each domain, then checnks for required columns and converts size cols to numeric vals. Then filters
# for complete genomes and finds the smallest


def process_assembly_summary(domain):
    local_path = os.path.join(assembly_dir, f"{domain}_assembly_summary.txt")
    try:
        df = pd.read_csv(local_path, sep='\t', dtype=str)
        
        required_columns = ['assembly_accession', 'assembly_level', 'genome_size', 'genome_size_ungapped', 'ftp_path', 'organism_name']
        if all(col in df.columns for col in required_columns):
            df['genome_size'] = pd.to_numeric(df['genome_size'], errors='coerce')
            df['genome_size_ungapped'] = pd.to_numeric(df['genome_size_ungapped'], errors='coerce')
            complete_genomes = df[df['assembly_level'] == 'Complete Genome']
            if not complete_genomes.empty:
                smallest_genome = complete_genomes.loc[complete_genomes['genome_size_ungapped'].idxmin()]
                return smallest_genome
        else:
            print(f"missing columns in {domain}: {set(required_columns) - set(df.columns)}")
    except Exception as e:
        print(f"error processing {domain}: {e}")
    return None

smallest_genomes_list = []
for domain in domains:
    smallest_genome = process_assembly_summary(domain)
    if smallest_genome is not None:
        smallest_genomes_list.append(smallest_genome)

smallest_genomes_df = pd.DataFrame(smallest_genomes_list)


output_excel_path = "/Users/tiananoll-walker/Documents/biotokens/smallest_genomes.xlsx"
if not smallest_genomes_df.empty:
    smallest_genomes_df.to_excel(output_excel_path, index=False)
    print(f"saved to {output_excel_path}")
else:
    print("no data found")

saved to /Users/tiananoll-walker/Documents/biotokens/smallest_genomes.xlsx


In [69]:
import os
import requests

assembly_reports_dir = "assembly_reports"
genome_sequences_dir = "genome_sequences"
os.makedirs(assembly_reports_dir, exist_ok=True)
os.makedirs(genome_sequences_dir, exist_ok=True)

organismnames = {
    "Nanobdella aerobiophila": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/023/169/545/GCF_023169545.1_ASM2316954v1/GCF_023169545.1_ASM2316954v1_assembly_report.txt",
    "Candidatus Karelsulcia muelleri": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/889/585/GCF_016889585.1_ASM1688958v1/GCF_016889585.1_ASM1688958v1_assembly_report.txt",
    "Malassezia restricta": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/290/485/GCF_003290485.1_ASM329048v1/GCF_003290485.1_ASM329048v1_assembly_report.txt",
    "Caenorhabditis elegans": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/985/GCF_000002985.6_WBcel235/GCF_000002985.6_WBcel235_assembly_report.txt",
    "Ostreococcus lucimarinus CCE9901": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/092/065/GCF_000092065.1_ASM9206v1/GCF_000092065.1_ASM9206v1_assembly_report.txt",
    "Theileria orientalis strain Shintoku": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/740/895/GCF_000740895.1_ASM74089v1/GCF_000740895.1_ASM74089v1_assembly_report.txt",
    "Bubalus bubalis": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/019/923/935/GCF_019923935.1_NDDB_SH_1/GCF_019923935.1_NDDB_SH_1_assembly_report.txt",
    "Mugil cephalus": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/022/458/985/GCF_022458985.1_CIBA_Mcephalus_1.1/GCF_022458985.1_CIBA_Mcephalus_1.1_assembly_report.txt",
    "Rice yellow mottle virus satellite": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/839/085/GCF_000839085.1_ViralProj14152/GCF_000839085.1_ViralProj14152_assembly_report.txt"
}

def download_assembly_report(organism_name, url):
    response = requests.get(url)
    if response.status_code == 200:
        file_path = os.path.join(assembly_reports_dir, f"{organism_name.replace(' ', '_')}_assembly_report.txt")
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded assembly report for {organism_name}")
    else:
        print(f"Failed to download assembly report for {organism_name}. Status code: {response.status_code}")
        print(f"URL: {url}")

for organism_name, url in organismnames.items():
    download_assembly_report(organism_name, url)

Downloaded assembly report for Nanobdella aerobiophila
Downloaded assembly report for Candidatus Karelsulcia muelleri
Downloaded assembly report for Malassezia restricta
Downloaded assembly report for Caenorhabditis elegans
Downloaded assembly report for Ostreococcus lucimarinus CCE9901
Downloaded assembly report for Theileria orientalis strain Shintoku
Downloaded assembly report for Mugil cephalus


In [70]:
import os
import re
import gzip
import requests


gcf_files_path = "GCF_files"
genome_sequences_dir = "genome_sequences"
os.makedirs(assembly_reports_dir, exist_ok=True)
os.makedirs(genome_sequences_dir, exist_ok=True)

organisms = {
    "Nanobdella aerobiophila": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/023/169/545/GCF_023169545.1_ASM2316954v1/GCF_023169545.1_ASM2316954v1_genomic.fna.gz",
    "Candidatus Karelsulcia muelleri": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/016/889/585/GCF_016889585.1_ASM1688958v1/GCF_016889585.1_ASM1688958v1_genomic.fna.gz",
    "Malassezia restricta": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/290/485/GCF_003290485.1_ASM329048v1/GCF_003290485.1_ASM329048v1_genomic.fna.gz",
    "Caenorhabditis elegans": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/985/GCF_000002985.6_WBcel235/GCF_000002985.6_WBcel235_genomic.fna.gz",
    "Ostreococcus lucimarinus CCE9901": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/092/065/GCF_000092065.1_ASM9206v1/GCF_000092065.1_ASM9206v1_genomic.fna.gz",
    "Theileria orientalis strain Shintoku": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/740/895/GCF_000740895.1_ASM74089v1/GCF_000740895.1_ASM74089v1_genomic.fna.gz",
    "Mugil cephalus": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/022/458/985/GCF_022458985.1_CIBA_Mcephalus_1.1/GCF_022458985.1_CIBA_Mcephalus_1.1_genomic.fna.gz",
    "Rice yellow mottle virus satellite": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/839/085/GCF_000839085.1_ViralProj14152/GCF_000839085.1_ViralProj14152_genomic.fna.gz",
    "Bubalus bubalis": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/019/923/935/GCF_019923935.1_NDDB_SH_1/GCF_019923935.1_NDDB_SH_1_genomic.fna.gz"

}

def download_genome_sequence(organism_name, url):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        local_path = os.path.join(gcf_files_path, url.split('/')[-1])
        with open(local_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"downloaded genome sequence for {organism_name}: {local_path}")
        return local_path
    else:
        print(f"failed to download genome sequence for {organism_name}. status code: {response.status_code}")
        return None

def extract_and_clean_sequence(file_path, organism_name):
    base_filename = f"{organism_name.replace(' ', '_')}"
    cleaned_filename = f"{genome_sequences_dir}/{base_filename}_cleaned.txt"
    
    if os.path.exists(cleaned_filename):
        print(f"cleaned sequence for {organism_name} already exists. Skipping extraction.")
        return cleaned_filename
    
    with gzip.open(file_path, 'rt') as infile:
        lines = infile.readlines()
    
    sequence = ''.join([line.strip() for line in lines if not line.startswith(">")])
    cleaned_sequence = re.sub(r'[^ACGTacgt]', '', sequence).upper()
    
    with open(cleaned_filename, 'w') as outfile:
        outfile.write(cleaned_sequence)
    
    print(f"cleaned sequence saved to {cleaned_filename}")
    return cleaned_filename

for organism_name, url in organisms.items():
    file_path = download_genome_sequence(organism_name, url)
    if file_path:
        extract_and_clean_sequence(file_path, organism_name)

downloaded genome sequence for Nanobdella aerobiophila: GCF_files/GCF_023169545.1_ASM2316954v1_genomic.fna.gz
cleaned sequence for Nanobdella aerobiophila already exists. Skipping extraction.
downloaded genome sequence for Candidatus Karelsulcia muelleri: GCF_files/GCF_016889585.1_ASM1688958v1_genomic.fna.gz
cleaned sequence for Candidatus Karelsulcia muelleri already exists. Skipping extraction.
downloaded genome sequence for Malassezia restricta: GCF_files/GCF_003290485.1_ASM329048v1_genomic.fna.gz
cleaned sequence for Malassezia restricta already exists. Skipping extraction.
downloaded genome sequence for Caenorhabditis elegans: GCF_files/GCF_000002985.6_WBcel235_genomic.fna.gz
cleaned sequence for Caenorhabditis elegans already exists. Skipping extraction.
downloaded genome sequence for Ostreococcus lucimarinus CCE9901: GCF_files/GCF_000092065.1_ASM9206v1_genomic.fna.gz
cleaned sequence for Ostreococcus lucimarinus CCE9901 already exists. Skipping extraction.
downloaded genome sequ

In [67]:
import random

genome_sequences_dir = "genome_sequences"

def generate_random_genome(length):
    return ''.join(random.choices('ACGT', k=length))

random_genome_length = 1000000
random_genome = generate_random_genome(random_genome_length)

random_genome_file = os.path.join(genome_sequences_dir, "random_genome.txt")
with open(random_genome_file, 'w') as f:
    f.write(random_genome)

print(f"random genome of length {random_genome_length} saved to {genome_sequences_dir}")

random genome of length 1000000 saved to genome_sequences


In [68]:
import os
import math
from collections import Counter

def shannon_entropy(s):
    frequency = Counter(s)
    probabilities = [freq / len(s) for freq in frequency.values()]
    entropy = -sum(p * math.log2(p) for p in probabilities)
    return entropy

lengths = []
entropies = []

# calculate length and entropy for cleaned sequences
for filename in os.listdir(genome_sequences_dir):
    if filename.endswith("_cleaned.txt") or filename == "random_genome.txt":
        filepath = os.path.join(genome_sequences_dir, filename)
        with open(filepath, 'r') as f:
            sequence = f.read()
        genome_length = len(sequence)
        genome_entropy = shannon_entropy(sequence)
        lengths.append(genome_length)
        entropies.append(genome_entropy)
        print(f'{filename}: Length = {genome_length}, Shannon Entropy = {genome_entropy}')

results_df = pd.DataFrame({
    "Filename": [filename.replace("_cleaned.txt", "").replace(".txt", "") for filename in os.listdir(genome_sequences_dir) if filename.endswith("_cleaned.txt") or filename == "random_genome.txt"],
    "Length": lengths,
    "Shannon Entropy": entropies
})

results_filename = "small_n_random_genome_sequences_lengths_and_entropy.xlsx"
results_df.to_excel(results_filename, index=False)
print(f"results saved to {results_filename}")


Ostreococcus_lucimarinus_CCE9901_cleaned.txt: Length = 13204888, Shannon Entropy = 1.968334374463055
Theileria_orientalis_strain_Shintoku_cleaned.txt: Length = 9006764, Shannon Entropy = 1.9792812782136475
Candidatus_Karelsulcia_muelleri_cleaned.txt: Length = 142117, Shannon Entropy = 1.8322368021416449
random_genome.txt: Length = 1000000, Shannon Entropy = 1.9999951310060542
Plasmodium_falciparum_cleaned.txt: Length = 2925236, Shannon Entropy = 1.700437913566233
Arabidopsis_thaliana_cleaned.txt: Length = 119482427, Shannon Entropy = 1.9431467139424183
Escherichia_coli_K-12_cleaned.txt: Length = 4641652, Shannon Entropy = 1.9998190013576105
Saccharomyces_cerevisiae_cleaned.txt: Length = 230218, Shannon Entropy = 1.9664766384439507
Rice_yellow_mottle_virus_satellite_cleaned.txt: Length = 220, Shannon Entropy = 1.9408982701656994
Malassezia_restricta_cleaned.txt: Length = 7369627, Shannon Entropy = 1.9907365513928101
Caenorhabditis_elegans_cleaned.txt: Length = 100286401, Shannon Entropy