In [None]:
import os
import gzip
import requests
import tempfile
import pandas as pd
from urllib.parse import urljoin

# Table of genomes to analyse, one dict per species.
#
# Keys:
#   species_name  - label used for the species in the output table.
#   ftp_directory - base URL of the species' FASTA directory. It must end with
#                   exactly one "/" so that urljoin() appends the relative file
#                   paths below instead of replacing the last path segment.
#   cds_file, ncrna_file, dna_file, pep_file
#                 - paths of the gzipped FASTA files relative to ftp_directory.
#                   An empty string means no file is listed for that type.
#
# NOTE(review): Nanoarchaeum_equitans points at ".../bacteria/current/..." and
# Chlorobaculum_tepidum at "release-60", while the other Ensembl Genomes
# entries use "release-59". "current" is not a pinned release, so its content
# may change between runs - confirm whether this is intentional.
species_info_list = [
    {
        "species_name": "Gallus_gallus",
        "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/gallus_gallus/",
        "cds_file": "cds/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cds.all.fa.gz",
        "ncrna_file": "ncrna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.ncrna.fa.gz",
        "dna_file": "dna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna.toplevel.fa.gz",
        "pep_file": "pep/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.pep.all.fa.gz",
    },
    {
        "species_name": "Mycobacterium_tuberculosis",
        # Fixed: the directory previously ended in "//", which produced a
        # non-canonical ".../mycobacterium_tuberculosis_gca_001318445//cds/..." URL.
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_4_collection/mycobacterium_tuberculosis_gca_001318445/",
        "cds_file": "cds/Mycobacterium_tuberculosis_gca_001318445.6505_5_10.cds.all.fa.gz",
        "ncrna_file": "",
        "dna_file": "dna/Mycobacterium_tuberculosis_gca_001318445.6505_5_10.dna.toplevel.fa.gz",
        "pep_file": "pep/Mycobacterium_tuberculosis_gca_001318445.6505_5_10.pep.all.fa.gz",
    },
    {
        "species_name": "Chlorobaculum_tepidum",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-60/fasta/bacteria_0_collection/chlorobaculum_tepidum_tls_gca_000006985/",
        "cds_file": "cds/Chlorobaculum_tepidum_tls_gca_000006985.ASM698v1.cds.all.fa.gz",
        "ncrna_file": "",
        "dna_file": "dna/Chlorobaculum_tepidum_tls_gca_000006985.ASM698v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Chlorobaculum_tepidum_tls_gca_000006985.ASM698v1.pep.all.fa.gz",
    },
    {
        "species_name": "Ignisphaera_aggregans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_79_collection/ignisphaera_aggregans_dsm_17230_gca_000145985/",
        "cds_file": "cds/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.ncrna.fa.gz",
        "dna_file": "dna/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.pep.all.fa.gz",
    },
    {
        "species_name": "Thermosphaera_aggregans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_4_collection/thermosphaera_aggregans_dsm_11486_gca_000092185/",
        "cds_file": "cds/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.ncrna.fa.gz",
        "dna_file": "dna/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.pep.all.fa.gz",
    },
    {
        "species_name": "Aeropyrum_pernix",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/aeropyrum_pernix_k1_gca_000011125/",
        "cds_file": "cds/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.ncrna.fa.gz",
        "dna_file": "dna/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.pep.all.fa.gz",
    },
    {
        "species_name": "Thermoproteus_tenax",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_100_collection/thermoproteus_tenax_kra_1_gca_000253055/",
        "cds_file": "cds/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.ncrna.fa.gz",
        "dna_file": "dna/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.pep.all.fa.gz",
    },
    {
        "species_name": "Nanoarchaeum_equitans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/current/fasta/bacteria_0_collection/nanoarchaeum_equitans_kin4_m_gca_000008085/",
        "cds_file": "cds/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.ncrna.fa.gz",
        "dna_file": "dna/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.pep.all.fa.gz",
    },
    {
        "species_name": "Methanosarcina_acetivorans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/methanosarcina_acetivorans_c2a_gca_000007345/",
        "cds_file": "cds/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.ncrna.fa.gz",
        "dna_file": "dna/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.pep.all.fa.gz",
    },
    {
        "species_name": "Halobacterium_salinarum",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_60_collection/halobacterium_salinarum_gca_004799605/",
        "cds_file": "cds/Halobacterium_salinarum_gca_004799605.ASM479960v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Halobacterium_salinarum_gca_004799605.ASM479960v1.ncrna.fa.gz",
        "dna_file": "dna/Halobacterium_salinarum_gca_004799605.ASM479960v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Halobacterium_salinarum_gca_004799605.ASM479960v1.pep.all.fa.gz",
    },
    {
        "species_name": "Ignicoccus_hospitalis",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_37_collection/ignicoccus_hospitalis_kin4_i_gca_000017945/",
        "cds_file": "cds/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.ncrna.fa.gz",
        "dna_file": "dna/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.pep.all.fa.gz",
    },
    {
        "species_name": "Candidatus_Nitrosopelagicus",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_125_collection/candidatus_nitrosopelagicus_brevis_gca_000812185/",
        "cds_file": "cds/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.ncrna.fa.gz",
        "dna_file": "dna/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.pep.all.fa.gz",
    },
    {
        "species_name": "Mus_musculus",
        "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/mus_musculus/",
        "cds_file": "cds/Mus_musculus.GRCm39.cds.all.fa.gz",
        "ncrna_file": "ncrna/Mus_musculus.GRCm39.ncrna.fa.gz",
        "dna_file": "dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz",
        "pep_file": "pep/Mus_musculus.GRCm39.pep.all.fa.gz",
    },
    {
        "species_name": "Plasmodium_falciparum",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/plasmodium_falciparum/",
        "cds_file": "cds/Plasmodium_falciparum.ASM276v2.cds.all.fa.gz",
        "ncrna_file": "ncrna/Plasmodium_falciparum.ASM276v2.ncrna.fa.gz",
        "dna_file": "dna/Plasmodium_falciparum.ASM276v2.dna.toplevel.fa.gz",
        "pep_file": "pep/Plasmodium_falciparum.ASM276v2.pep.all.fa.gz",
    },
    {
        "species_name": "Dictyostelium_discoideum",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/dictyostelium_discoideum/",
        "cds_file": "cds/Dictyostelium_discoideum.dicty_2.7.cds.all.fa.gz",
        "ncrna_file": "ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz",
        "dna_file": "dna/Dictyostelium_discoideum.dicty_2.7.dna.toplevel.fa.gz",
        "pep_file": "pep/Dictyostelium_discoideum.dicty_2.7.pep.all.fa.gz",
    },
    {
        "species_name": "Arabidopsis_thaliana",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/arabidopsis_thaliana/",
        "cds_file": "cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz",
        "ncrna_file": "ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz",
        "dna_file": "dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz",
        "pep_file": "pep/Arabidopsis_thaliana.TAIR10.pep.all.fa.gz",
    },
    {
        "species_name": "Oryza_sativa",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/oryza_sativa/",
        "cds_file": "cds/Oryza_sativa.IRGSP-1.0.cds.all.fa.gz",
        "ncrna_file": "ncrna/Oryza_sativa.IRGSP-1.0.ncrna.fa.gz",
        "dna_file": "dna/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa.gz",
        "pep_file": "pep/Oryza_sativa.IRGSP-1.0.pep.all.fa.gz",
    },
    {
        "species_name": "Zea_mays",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/zea_mays/",
        "cds_file": "cds/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cds.all.fa.gz",
        "ncrna_file": "ncrna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.ncrna.fa.gz",
        "dna_file": "dna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa.gz",
        "pep_file": "pep/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.pep.all.fa.gz",
    },
    {
        "species_name": "Aspergillus_nidulans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/aspergillus_nidulans/",
        "cds_file": "cds/Aspergillus_nidulans.ASM1142v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Aspergillus_nidulans.ASM1142v1.ncrna.fa.gz",
        "dna_file": "dna/Aspergillus_nidulans.ASM1142v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Aspergillus_nidulans.ASM1142v1.pep.all.fa.gz",
    },
    {
        "species_name": "Saccharomyces_cerevisiae",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/saccharomyces_cerevisiae/",
        "cds_file": "cds/Saccharomyces_cerevisiae.R64-1-1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz",
        "dna_file": "dna/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz",
        "pep_file": "pep/Saccharomyces_cerevisiae.R64-1-1.pep.all.fa.gz",
    },
    {
        "species_name": "Penicilliopsis_zonata",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/fungi_ascomycota3_collection/penicilliopsis_zonata_cbs_506_65_gca_001890105/",
        "cds_file": "cds/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.ncrna.fa.gz",
        "dna_file": "dna/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.dna.toplevel.fa.gz",
        "pep_file": "pep/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.pep.all.fa.gz",
    },
    {
        "species_name": "Apis_mellifera",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/apis_mellifera/",
        "cds_file": "cds/Apis_mellifera.Amel_HAv3.1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz",
        "dna_file": "dna/Apis_mellifera.Amel_HAv3.1.dna.toplevel.fa.gz",
        "pep_file": "pep/Apis_mellifera.Amel_HAv3.1.pep.all.fa.gz",
    },
    {
        "species_name": "Caenorhabditis_elegans",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/caenorhabditis_elegans/",
        "cds_file": "cds/Caenorhabditis_elegans.WBcel235.cds.all.fa.gz",
        "ncrna_file": "ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz",
        "dna_file": "dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz",
        "pep_file": "pep/Caenorhabditis_elegans.WBcel235.pep.all.fa.gz",
    },
    {
        "species_name": "Drosophila_melanogaster",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/drosophila_melanogaster/",
        "cds_file": "cds/Drosophila_melanogaster.BDGP6.46.cds.all.fa.gz",
        "ncrna_file": "ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz",
        "dna_file": "dna/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa.gz",
        "pep_file": "pep/Drosophila_melanogaster.BDGP6.46.pep.all.fa.gz",
    },
    {
        "species_name": "Streptococcus_pneumoniae",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/streptococcus_pneumoniae_tigr4_gca_000006885/",
        "cds_file": "cds/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.ncrna.fa.gz",
        "dna_file": "dna/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.pep.all.fa.gz",
    },
    {
        "species_name": "Escherichia_coli",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_12_collection/escherichia_coli_gca_001606525/",
        "cds_file": "cds/Escherichia_coli_gca_001606525.ASM160652v1.cds.all.fa.gz",
        "ncrna_file": "ncrna/Escherichia_coli_gca_001606525.ASM160652v1.ncrna.fa.gz",
        "dna_file": "dna/Escherichia_coli_gca_001606525.ASM160652v1.dna.toplevel.fa.gz",
        "pep_file": "pep/Escherichia_coli_gca_001606525.ASM160652v1.pep.all.fa.gz",
    },
    {
        "species_name": "Tetrahymena_thermophila",
        "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/tetrahymena_thermophila/",
        "cds_file": "cds/Tetrahymena_thermophila.JCVI-TTA1-2.2.cds.all.fa.gz",
        "ncrna_file": "ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz",
        "dna_file": "dna/Tetrahymena_thermophila.JCVI-TTA1-2.2.dna.toplevel.fa.gz",
        "pep_file": "pep/Tetrahymena_thermophila.JCVI-TTA1-2.2.pep.all.fa.gz",
    },
    {
        "species_name": "Danio_rerio",
        "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/danio_rerio/",
        "cds_file": "cds/Danio_rerio.GRCz11.cds.all.fa.gz",
        "ncrna_file": "ncrna/Danio_rerio.GRCz11.ncrna.fa.gz",
        "dna_file": "dna/Danio_rerio.GRCz11.dna.toplevel.fa.gz",
        "pep_file": "pep/Danio_rerio.GRCz11.pep.all.fa.gz",
    },
]



def _count_non_actg_bases(gz_path):
    """Return (total_bases, non_actg_count) for the gzipped FASTA file at gz_path."""
    total_bases = 0
    non_actg_count = 0

    with gzip.open(gz_path, "rt") as fasta:
        for line in fasta:
            # Header lines carry record names, not sequence.
            if line.startswith(">"):
                continue
            sequence = line.strip().upper()
            total_bases += len(sequence)
            # str.count runs in C; subtracting the A/C/T/G tallies from the
            # line length gives the number of all other characters.
            non_actg_count += len(sequence) - sum(
                sequence.count(base) for base in "ACTG"
            )

    return total_bases, non_actg_count


def count_non_actg(ftp_url, file_name):
    """Download a gzipped FASTA file and count its non-ACTG characters.

    The file at ``urljoin(ftp_url, file_name)`` is downloaded in chunks to a
    temporary ``.gz`` file, which is then decompressed and scanned line by
    line. Header lines (starting with ``>``) are ignored, and sequence lines
    are stripped and upper-cased before counting. The temporary file is
    always removed, even if the download or decompression fails.

    Args:
        ftp_url: Base URL of the directory holding the file.
        file_name: Path of the file relative to ``ftp_url``.

    Returns:
        A tuple ``(file_name, total_bases, non_actg_count, non_actg_percentage)``
        where the percentage is rounded to 6 decimal places (0 when the file
        has no sequence characters). If the file cannot be downloaded (non-200
        status or a network error), an error is printed and
        ``(None, None, None, None)`` is returned.
    """
    url = urljoin(ftp_url, file_name)
    temp_path = None

    try:
        # The context manager releases the connection on every exit path,
        # including the early return below. The timeout bounds the connect
        # and each individual read, not the total download time, so large
        # files still download while a stalled server no longer hangs the run.
        with requests.get(url, stream=True, timeout=60) as response:
            if response.status_code != 200:
                print(f"error: failed to access {url}")
                return None, None, None, None

            with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp_gz:
                temp_path = temp_gz.name
                for chunk in response.iter_content(chunk_size=8192):
                    temp_gz.write(chunk)

        total_bases, non_actg_count = _count_non_actg_bases(temp_path)
    except requests.RequestException as exc:
        # Report network failures the same way as a bad HTTP status so that
        # one unreachable file does not abort the whole run.
        print(f"error: failed to access {url} ({exc})")
        return None, None, None, None
    finally:
        # delete=False means the temp file must be removed explicitly.
        if temp_path is not None:
            os.remove(temp_path)

    if total_bases > 0:
        non_actg_percentage = round((non_actg_count / total_bases) * 100, 6)
    else:
        non_actg_percentage = 0

    return file_name, total_bases, non_actg_count, non_actg_percentage


# Build one output row per (species, file type) whose FASTA file could be
# downloaded and scanned; only the CDS and ncRNA files are analysed.
results = []
for entry in species_info_list:
    base_url = entry["ftp_directory"]

    for kind in ("cds_file", "ncrna_file"):
        relative_path = entry.get(kind, "")
        # An empty path means this species has no file of that type.
        if not relative_path:
            continue

        name, total, non_actg, percentage = count_non_actg(base_url, relative_path)
        # count_non_actg signals a failed download with a None file name.
        if not name:
            continue

        results.append([entry["species_name"], kind, name, total, non_actg, percentage])


# Tabulate the rows, show them in the notebook, and save them as CSV.
column_names = [
    "Species",
    "File Type",
    "File Name",
    "Total Bases",
    "Non-ACTG Count",
    "Non-ACTG %",
]
df = pd.DataFrame(results, columns=column_names)

from IPython.display import display
display(df)
df.to_csv("nonACGTs.csv", index=False)