In [45]:
import os
import re
import gzip
import pandas as pd
from collections import Counter
from Bio import SeqIO
import math
import requests

In [46]:
# Working locations, relative to the current working directory, for the
# downloaded assembly reports and the cleaned genome sequences, plus the
# file name reserved for a combined assembly summary
assembly_reports_dir, genome_sequences_dir = "assembly_reports", "genome_sequences"
combined_assembly_summary_path = "combined_assembly_summary.txt"

# Create both output folders up front; a folder that already exists is left as is
for _output_dir in (assembly_reports_dir, genome_sequences_dir):
    os.makedirs(_output_dir, exist_ok=True)

# Define organisms and their assembly report URLs
# Keys are display names; each value is the full HTTPS URL of that assembly's
# `*_assembly_report.txt` file on the NCBI genomes FTP mirror. The key is also
# turned into the local file name by download_assembly_report (spaces -> "_").
organisms = {
    "Escherichia coli K-12": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2/GCF_000005845.2_ASM584v2_assembly_report.txt",
    "Halobacterium salinarum NRC-1": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/805/GCF_000006805.1_ASM680v1/GCF_000006805.1_ASM680v1_assembly_report.txt",
    "Plasmodium falciparum": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/765/GCF_000002765.6_GCA_000002765/GCF_000002765.6_GCA_000002765_assembly_report.txt",
    "Arabidopsis thaliana": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1/GCF_000001735.4_TAIR10.1_assembly_report.txt",
    "Saccharomyces cerevisiae": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64/GCF_000146045.2_R64_assembly_report.txt"
}

def download_assembly_report(organism_name, url, timeout=60):
    """Download one assembly report and save it under ``assembly_reports_dir``.

    The report is written to
    ``<assembly_reports_dir>/<organism_name with spaces as underscores>_assembly_report.txt``.
    Failures are reported on stdout instead of being raised, so a loop over
    several organisms keeps going after one bad download.

    Parameters
    ----------
    organism_name : str
        Display name of the organism; used for the messages and the file name.
    url : str
        Address of the assembly report to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests.get`` can block indefinitely on a stalled connection.

    Returns
    -------
    str or None
        Path of the saved report, or ``None`` when the download failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP errors
        print(f"Failed to download assembly report for {organism_name}. Error: {exc}")
        print(f"URL: {url}")
        return None

    if response.status_code != 200:
        print(f"Failed to download assembly report for {organism_name}. Status code: {response.status_code}")
        print(f"URL: {url}")
        return None

    file_path = os.path.join(assembly_reports_dir, f"{organism_name.replace(' ', '_')}_assembly_report.txt")
    # Binary mode: the response body is stored exactly as received
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f"Downloaded assembly report for {organism_name}")
    return file_path

# Fetch the assembly report of every organism in `organisms`, one request at a
# time; each call prints its own success or failure message.
for organism_name, url in organisms.items():
    download_assembly_report(organism_name, url)



Downloaded assembly report for Escherichia coli K-12
Downloaded assembly report for Halobacterium salinarum NRC-1
Downloaded assembly report for Plasmodium falciparum
Downloaded assembly report for Arabidopsis thaliana
Downloaded assembly report for Saccharomyces cerevisiae


In [47]:
# define cols for the assembly summary
# The list fixes the column order of the summary table built from `data`.
# NOTE(review): the names up to 'pubmed_id' presumably mirror the fields of
# NCBI's assembly_summary.txt — confirm against the NCBI README.
# 'length' and 'Shannon' are extra columns holding a sequence length and a
# Shannon entropy value for each genome.
columns = [
    'assembly_accession', 'bioproject', 'biosample', 'wgs_master',
    'refseq_category', 'taxid', 'species_taxid', 'organism_name',
    'infraspecific_name', 'isolate', 'version_status', 'assembly_level',
    'release_type', 'genome_rep', 'seq_rel_date', 'asm_name', 'asm_submitter',
    'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path', 'excluded_from_refseq',
    'relation_to_type_material', 'asm_not_live_date', 'assembly_type', 'group',
    'genome_size', 'genome_size_ungapped', 'gc_percent', 'replicon_count',
    'scaffold_count', 'contig_count', 'annotation_provider', 'annotation_name',
    'annotation_date', 'total_gene_count', 'protein_coding_gene_count',
    'non_coding_gene_count', 'pubmed_id', 'length', 'Shannon'
]

# One hand-entered record per organism; every record uses the keys listed in
# `columns`. The string 'na' marks a field without a value, so a column such as
# 'non_coding_gene_count' mixes integers and strings.
# NOTE(review): for four of the five records 'length' and 'Shannon' equal the
# values in the saved output of the length/entropy cell; the Arabidopsis record
# does not (that output shows 119482427 / 1.9431467139424183) — confirm which
# values are intended.
data = [
    # Escherichia coli K-12: 'length' equals 'genome_size'
    {
        'assembly_accession': 'GCF_000005845.2', 'bioproject': 'PRJNA225',
        'biosample': 'SAMN02604091', 'wgs_master': 'na',
        'refseq_category': 'Reference Genome', 'taxid': 511145,
        'species_taxid': 511145, 'organism_name': 'Escherichia coli K-12',
        'infraspecific_name': 'strain=K-12 substr. MG1655', 'isolate': 'na',
        'version_status': 'latest', 'assembly_level': 'Complete Genome',
        'release_type': 'Major', 'genome_rep': 'Full',
        'seq_rel_date': '2013/09/26', 'asm_name': 'ASM584v2',
        'asm_submitter': 'Univ. Wisconsin', 'gbrs_paired_asm': 'GCA_000005845.2',
        'paired_asm_comp': 'identical',
        'ftp_path': 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2',
        'excluded_from_refseq': 'na', 'relation_to_type_material': 'na',
        'asm_not_live_date': 'na', 'assembly_type': 'haploid', 'group': 'bacteria',
        'genome_size': 4641652, 'genome_size_ungapped': 4641652,
        'gc_percent': 50.79, 'replicon_count': 1, 'scaffold_count': 1,
        'contig_count': 1, 'annotation_provider': 'NCBI RefSeq',
        'annotation_name': 'NCBI Prokaryotic Genome Annotation Pipeline (PGAP)',
        'annotation_date': '2023/04/26', 'total_gene_count': 4200,
        'protein_coding_gene_count': 4000, 'non_coding_gene_count': 200,
        'pubmed_id': 'na', 'length': 4641652, 'Shannon': 1.9998190013576105
    },
    # Arabidopsis thaliana
    # NOTE(review): 'length' (30263312) is far below 'genome_size' (135650600)
    # and 'non_coding_gene_count' is the string 'na' — confirm.
    {
        'assembly_accession': 'GCF_000001735.4', 'bioproject': 'PRJNA10719',
        'biosample': 'SAMN03081427', 'wgs_master': 'na',
        'refseq_category': 'Reference Genome', 'taxid': 3702,
        'species_taxid': 3702, 'organism_name': 'Arabidopsis thaliana',
        'infraspecific_name': 'ecotype=Columbia', 'isolate': 'na',
        'version_status': 'latest', 'assembly_level': 'Chromosome',
        'release_type': 'Minor', 'genome_rep': 'Full',
        'seq_rel_date': '2018/03/15', 'asm_name': 'TAIR10.1',
        'asm_submitter': 'The Arabidopsis Information Resource (TAIR)', 'gbrs_paired_asm': 'GCA_000001735.2',
        'paired_asm_comp': 'identical',
        'ftp_path': 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/735/GCF_000001735.4_TAIR10.1',
        'excluded_from_refseq': 'na', 'relation_to_type_material': 'na',
        'asm_not_live_date': 'na', 'assembly_type': 'haploid', 'group': 'plants',
        'genome_size': 135650600, 'genome_size_ungapped': 135650600,
        'gc_percent': 36.1, 'replicon_count': 5, 'scaffold_count': 5,
        'contig_count': 7, 'annotation_provider': 'TAIR',
        'annotation_name': 'TAIR10.1', 'annotation_date': '2018/03/15',
        'total_gene_count': 27415, 'protein_coding_gene_count': 27415,
        'non_coding_gene_count': 'na', 'pubmed_id': 'na', 'length': 30263312, 'Shannon': 1.9416279304861543
    },
    # Halobacterium salinarum NRC-1: 'length' equals 'genome_size'
    {
        'assembly_accession': 'GCF_000006805.1', 'bioproject': 'PRJNA412908',
        'biosample': 'SAMN02604216', 'wgs_master': 'na',
        'refseq_category': 'Reference Genome', 'taxid': 64091,
        'species_taxid': 64091, 'organism_name': 'Halobacterium salinarum NRC-1',
        'infraspecific_name': 'strain=NRC-1; ATCC 700922', 'isolate': 'na',
        'version_status': 'latest', 'assembly_level': 'Complete Genome',
        'release_type': 'Major', 'genome_rep': 'Full',
        'seq_rel_date': '2001/01/09', 'asm_name': 'ASM680v1',
        'asm_submitter': 'University of Massachusetts-Amherst, University of Washington', 'gbrs_paired_asm': 'GCA_000006805.1',
        'paired_asm_comp': 'identical',
        'ftp_path': 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/006/805/GCF_000006805.1_ASM680v1',
        'excluded_from_refseq': 'na', 'relation_to_type_material': 'na',
        'asm_not_live_date': 'na', 'assembly_type': 'haploid', 'group': 'archaea',
        'genome_size': 2014239, 'genome_size_ungapped': 2014239,
        'gc_percent': 68.0, 'replicon_count': 3, 'scaffold_count': 3,
        'contig_count': 3, 'annotation_provider': 'NCBI RefSeq',
        'annotation_name': 'NCBI Prokaryotic Genome Annotation Pipeline (PGAP)',
        'annotation_date': '2001/01/09', 'total_gene_count': 2700,
        'protein_coding_gene_count': 2600, 'non_coding_gene_count': 100,
        'pubmed_id': 'na', 'length': 2014239, 'Shannon': 1.905323387374254
    },
    # Plasmodium falciparum 3D7
    # NOTE(review): 'length' (2925236) is far below 'genome_size' (23313821) —
    # confirm that the measured sequence covers the whole assembly.
    {
    'assembly_accession': 'GCF_000002765.6', 'bioproject': 'PRJNA13173',
    'biosample': 'SAMN00102897', 'wgs_master': 'na',
    'refseq_category': 'Representative Genome', 'taxid': 36329,
    'species_taxid': 36329, 'organism_name': 'Plasmodium falciparum 3D7',
    'infraspecific_name': 'Isolate=3D7', 'isolate': '3D7',
    'version_status': 'latest', 'assembly_level': 'Complete Genome',
    'release_type': 'Minor', 'genome_rep': 'Full',
    'seq_rel_date': '2016/04/07', 'asm_name': 'GCA_000002765',
    'asm_submitter': 'Plasmodium falciparum Genome Sequencing Consortium',
    'gbrs_paired_asm': 'GCA_000002765.3', 'paired_asm_comp': 'identical',
    'ftp_path': 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/765/GCF_000002765.6_GCA_000002765',
    'excluded_from_refseq': 'na', 'relation_to_type_material': 'na',
    'asm_not_live_date': 'na', 'assembly_type': 'haploid', 'group': 'protozoa',
    'genome_size': 23313821, 'genome_size_ungapped': 23313821,
    'gc_percent': 19.4, 'replicon_count': 14, 'scaffold_count': 14,
    'contig_count': 14, 'annotation_provider': 'Plasmodium falciparum Genome Sequencing Consortium',
    'annotation_name': 'GCA_000002765', 'annotation_date': '2016/04/07',
    'total_gene_count': 5400, 'protein_coding_gene_count': 5200, 'non_coding_gene_count': 200,
    'pubmed_id': 'na', 'length': 2925236, 'Shannon': 1.700437913566233
},
# Saccharomyces cerevisiae S288C
# NOTE(review): 'length' (230218) is far below 'genome_size' (12157105), and
# 'annotation_name' names a prokaryotic pipeline although 'group' is 'fungi' —
# confirm both.
{
    'assembly_accession': 'GCF_000146045.2', 'bioproject': 'PRJNA43747',
    'biosample': 'na', 'wgs_master': 'na',
    'refseq_category': 'Reference Genome', 'taxid': 559292,
    'species_taxid': 559292, 'organism_name': 'Saccharomyces cerevisiae S288C',
    'infraspecific_name': 'strain=S288C', 'isolate': 'na',
    'version_status': 'latest', 'assembly_level': 'Complete Genome',
    'release_type': 'Major', 'genome_rep': 'Full',
    'seq_rel_date': '2014/12/17', 'asm_name': 'R64',
    'asm_submitter': 'Saccharomyces Genome Database', 'gbrs_paired_asm': 'GCA_000146045.2',
    'paired_asm_comp': 'identical',
    'ftp_path': 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/146/045/GCF_000146045.2_R64',
    'excluded_from_refseq': 'na', 'relation_to_type_material': 'na',
    'asm_not_live_date': 'na', 'assembly_type': 'haploid', 'group': 'fungi',
    'genome_size': 12157105, 'genome_size_ungapped': 12157105,
    'gc_percent': 38.3, 'replicon_count': 17, 'scaffold_count': 17,
    'contig_count': 17, 'annotation_provider': 'NCBI RefSeq',
    'annotation_name': 'NCBI Prokaryotic Genome Annotation Pipeline (PGAP)',
    'annotation_date': '2014/12/17', 'total_gene_count': 6700,
    'protein_coding_gene_count': 6600, 'non_coding_gene_count': 100,
    'pubmed_id': 'na', 'length': 230218, 'Shannon': 1.9664766384439507
}
]

# Turn the records into a table whose columns follow the order given in `columns`
df = pd.DataFrame(data=data, columns=columns)

# Save the table as an Excel workbook, leaving out the row index
output_file_path = 'updated_assembly_summary_w_lengths_and_entropy.xlsx'
df.to_excel(excel_writer=output_file_path, index=False)

print(f"excel file created: {output_file_path}")

excel file created: updated_assembly_summary_w_lengths_and_entropy.xlsx


In [48]:

# Folder holding the gzipped genomic FASTA files (*.fna.gz) named in `organisms`.
# Set the GCF_FILES_PATH environment variable to run on another machine; the
# original machine-specific location is kept as the fallback so existing
# setups keep working.
gcf_files_path = os.environ.get(
    "GCF_FILES_PATH",
    "/Users/tiananoll-walker/Documents/biotokens/GCF_files",
)

# Output folder for the cleaned sequences; exist_ok avoids the check-then-create race
genome_sequences_dir = "genome_sequences"
os.makedirs(genome_sequences_dir, exist_ok=True)

# define organisms and their corresponding GCF files
# NOTE: this rebinds `organisms`, which an earlier cell used for report URLs;
# from here on the values are file names inside `gcf_files_path`.
organisms = {
    "Escherichia coli K-12": "GCF_000005845.2_ASM584v2_genomic.fna.gz",
    "Halobacterium salinarum NRC-1": "GCF_000006805.1_ASM680v1_genomic.fna.gz",
    "Plasmodium falciparum": "GCF_000002765.6_GCA_000002765_genomic.fna.gz",
    "Arabidopsis thaliana": "GCF_000001735.4_TAIR10.1_genomic.fna.gz",
    "Saccharomyces cerevisiae": "GCF_000146045.2_R64_genomic.fna.gz"
}

def extract_and_clean_sequence(file_path, organism_name, output_dir=None):
    """Write the A/C/G/T content of a gzipped FASTA file to a plain text file.

    Header lines (starting with ``>``) are dropped, every character other than
    A, C, G or T (in either case) is removed, and the rest is upper-cased. The
    sequence lines of all records are concatenated into one string with no
    separators.

    The input is processed line by line, so memory use does not grow with
    genome size. The result is written to a temporary ``.part`` file and only
    renamed to its final name once complete; an interrupted run therefore
    never leaves a truncated file that the "already exists" check below would
    later mistake for a finished one.

    Parameters
    ----------
    file_path : str
        Path to the gzip-compressed FASTA file.
    organism_name : str
        Display name; spaces become underscores in the output file name.
    output_dir : str, optional
        Folder for the cleaned file. Defaults to the notebook-level
        ``genome_sequences_dir``.

    Returns
    -------
    str
        ``<output_dir>/<organism_name with underscores>_cleaned.txt``.

    Raises
    ------
    OSError
        If the input cannot be opened or is not valid gzip data, or the output
        cannot be written.
    """
    if output_dir is None:
        output_dir = genome_sequences_dir

    base_filename = organism_name.replace(' ', '_')
    cleaned_filename = f"{output_dir}/{base_filename}_cleaned.txt"

    # An existing cleaned file is trusted as-is; delete it to force a rebuild
    if os.path.exists(cleaned_filename):
        print(f"cleaned sequence for {organism_name} already exists. Skipping extraction.")
        return cleaned_filename

    partial_filename = f"{cleaned_filename}.part"
    try:
        with gzip.open(file_path, 'rt') as infile, open(partial_filename, 'w') as outfile:
            for line in infile:
                if line.startswith(">"):
                    continue
                # The pattern also removes the newline, so lines join seamlessly
                outfile.write(re.sub(r'[^ACGTacgt]', '', line).upper())
        os.replace(partial_filename, cleaned_filename)
    finally:
        # Only present here if something failed before the rename
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

    print(f"Cleaned sequence saved to {cleaned_filename}")
    return cleaned_filename

# Clean every genome listed in `organisms`; each file name is resolved against
# `gcf_files_path`. Organisms whose cleaned file already exists are skipped
# inside extract_and_clean_sequence.
for organism_name, gcf_filename in organisms.items():
    file_path = os.path.join(gcf_files_path, gcf_filename)
    extract_and_clean_sequence(file_path, organism_name)

Cleaned sequence for Escherichia coli K-12 already exists. Skipping extraction.
Cleaned sequence for Halobacterium salinarum NRC-1 already exists. Skipping extraction.
Cleaned sequence for Plasmodium falciparum already exists. Skipping extraction.
Cleaned sequence for Arabidopsis thaliana already exists. Skipping extraction.
Cleaned sequence for Saccharomyces cerevisiae already exists. Skipping extraction.


In [49]:

def shannon_entropy(s):
    """Return the Shannon entropy, in bits per symbol, of the items in ``s``.

    Each distinct symbol contributes ``-p * log2(p)``, where ``p`` is the
    fraction of positions in ``s`` holding that symbol. An empty input
    yields 0.
    """
    total = len(s)
    weighted_log_sum = 0
    for count in Counter(s).values():
        p = count / total
        weighted_log_sum += p * math.log2(p)
    return -weighted_log_sum

# Per-genome results, appended in the same pass so that position i of each
# list always describes the same file
sequence_names = []
lengths = []
entropies = []

# calculate length and entropy for cleaned sequences
# sorted(): os.listdir returns entries in arbitrary order, so sorting keeps the
# printed report and the saved table in a reproducible order
for filename in sorted(os.listdir(genome_sequences_dir)):
    if not filename.endswith("_cleaned.txt"):
        continue
    filepath = os.path.join(genome_sequences_dir, filename)
    with open(filepath, 'r') as f:
        sequence = f.read()
    genome_length = len(sequence)
    genome_entropy = shannon_entropy(sequence)
    # The name is recorded here rather than from a second directory listing,
    # which could differ in order or content from the one being iterated
    sequence_names.append(filename.replace("_cleaned.txt", ""))
    lengths.append(genome_length)
    entropies.append(genome_entropy)
    print(f'{filename}: Length = {genome_length}, Shannon Entropy = {genome_entropy}')

results_df = pd.DataFrame({
    "Filename": sequence_names,
    "Length": lengths,
    "Shannon Entropy": entropies
})

# Save one row per cleaned genome, without the row index
results_filename = "genome_sequences_lengths_and_entropy.xlsx"
results_df.to_excel(results_filename, index=False)
print(f"results saved to {results_filename}")

Plasmodium_falciparum_cleaned.txt: Length = 2925236, Shannon Entropy = 1.700437913566233
Arabidopsis_thaliana_cleaned.txt: Length = 119482427, Shannon Entropy = 1.9431467139424183
Escherichia_coli_K-12_cleaned.txt: Length = 4641652, Shannon Entropy = 1.9998190013576105
Saccharomyces_cerevisiae_cleaned.txt: Length = 230218, Shannon Entropy = 1.9664766384439507
Halobacterium_salinarum_NRC-1_cleaned.txt: Length = 2014239, Shannon Entropy = 1.905323387374254
Results saved to genome_sequences_lengths_and_entropy.xlsx
