# In [None]:
import os
import csv
import re
import gzip
import random
import shutil
import requests
import numpy as np
import sentencepiece as spm
from urllib.parse import urlparse
import tempfile
import time

# Output locations: trained SentencePiece model files and the tokenized text files.
# NOTE(review): these are absolute paths on one user's machine; adjust them
# before running anywhere else.
tokenizers_dir = "/Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenizers"
tokenized_sequences_dir = "/Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/tokenized_sequences"
# Create both output directories up front; exist_ok keeps re-runs from failing.
os.makedirs(tokenizers_dir, exist_ok=True)
os.makedirs(tokenized_sequences_dir, exist_ok=True)

# Processing log; a relative path, so it lands in the current working directory.
output_csv = "processing_log.csv"
# CSV table with one row per species: download directory plus cds/ncrna/dna file paths.
species_info_csv = "/Users/tiananoll-walker/Documents/biotokens/used_code/cds_ncrna_processing_and_tokenization/species_info/species_info.csv"

def initialize_csv_log():
    """Start a fresh processing log containing only the column header row.

    The file at ``output_csv`` is opened in write mode, so any previous log
    content is discarded.
    """
    header = ["Species", "File Type", "Subsample Size", "Time Taken (s)", "Status"]
    with open(output_csv, mode='w', newline='') as log_handle:
        csv.writer(log_handle).writerow(header)

def create_species_info_csv():
    """Write the species table to ``species_info_csv`` unless it already exists.

    Each row holds a species name, the download directory URL, and the paths
    of the cds / ncrna / dna FASTA archives relative to that directory. An
    empty ``ncrna_file`` means no ncRNA archive is listed for that species.

    An existing file is never overwritten, so edits to the table below only
    take effect after the CSV on disk has been deleted.
    """
    if os.path.exists(species_info_csv):
        print(f"species info file already exists at {species_info_csv}")
        return

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    parent_dir = os.path.dirname(species_info_csv)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    species_data = [
        {
            "species_name": "Ignisphaera_aggregans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_79_collection/ignisphaera_aggregans_dsm_17230_gca_000145985/",
            "cds_file": "cds/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.ncrna.fa.gz",
            "dna_file": "dna/Ignisphaera_aggregans_dsm_17230_gca_000145985.ASM14598v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Thermosphaera_aggregans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_4_collection/thermosphaera_aggregans_dsm_11486_gca_000092185/",
            "cds_file": "cds/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.ncrna.fa.gz",
            "dna_file": "dna/Thermosphaera_aggregans_dsm_11486_gca_000092185.ASM9218v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Aeropyrum_pernix",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/aeropyrum_pernix_k1_gca_000011125/",
            "cds_file": "cds/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.ncrna.fa.gz",
            "dna_file": "dna/Aeropyrum_pernix_k1_gca_000011125.ASM1112v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Thermoproteus_tenax",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_100_collection/thermoproteus_tenax_kra_1_gca_000253055/",
            "cds_file": "cds/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.ncrna.fa.gz",
            "dna_file": "dna/Thermoproteus_tenax_kra_1_gca_000253055.ASM25305v1.dna.toplevel.fa.gz",
        },
        {
            # NOTE(review): uses the moving "current" release while most other
            # entries pin release-59 -- confirm this is intended.
            "species_name": "Nanoarchaeum_equitans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/current/fasta/bacteria_0_collection/nanoarchaeum_equitans_kin4_m_gca_000008085/",
            "cds_file": "cds/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.ncrna.fa.gz",
            "dna_file": "dna/Nanoarchaeum_equitans_kin4_m_gca_000008085.ASM808v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Methanosarcina_acetivorans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/methanosarcina_acetivorans_c2a_gca_000007345/",
            "cds_file": "cds/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.ncrna.fa.gz",
            "dna_file": "dna/Methanosarcina_acetivorans_c2a_gca_000007345.ASM734v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Halobacterium_salinarum",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_60_collection/halobacterium_salinarum_gca_004799605/",
            "cds_file": "cds/Halobacterium_salinarum_gca_004799605.ASM479960v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Halobacterium_salinarum_gca_004799605.ASM479960v1.ncrna.fa.gz",
            "dna_file": "dna/Halobacterium_salinarum_gca_004799605.ASM479960v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Ignicoccus_hospitalis",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_37_collection/ignicoccus_hospitalis_kin4_i_gca_000017945/",
            "cds_file": "cds/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.ncrna.fa.gz",
            "dna_file": "dna/Ignicoccus_hospitalis_kin4_i_gca_000017945.ASM1794v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Candidatus_Nitrosopelagicus",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_125_collection/candidatus_nitrosopelagicus_brevis_gca_000812185/",
            "cds_file": "cds/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.ncrna.fa.gz",
            "dna_file": "dna/Candidatus_nitrosopelagicus_brevis_gca_000812185.ASM81218v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Mus_musculus",
            "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/mus_musculus/",
            "cds_file": "cds/Mus_musculus.GRCm39.cds.all.fa.gz",
            "ncrna_file": "ncrna/Mus_musculus.GRCm39.ncrna.fa.gz",
            "dna_file": "dna/Mus_musculus.GRCm39.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Gallus_gallus",
            "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/gallus_gallus/",
            "cds_file": "cds/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.cds.all.fa.gz",
            "ncrna_file": "ncrna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.ncrna.fa.gz",
            "dna_file": "dna/Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna.toplevel.fa.gz",
        },
        {
            # The directory used to end in "//", which put an empty path
            # segment into every URL built from it; a single "/" is correct.
            "species_name": "Mycobacterium_tuberculosis",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_4_collection/mycobacterium_tuberculosis_gca_001318445/",
            "cds_file": "cds/Mycobacterium_tuberculosis_gca_001318445.6505_5_10.cds.all.fa.gz",
            "ncrna_file": "",
            "dna_file": "dna/Mycobacterium_tuberculosis_gca_001318445.6505_5_10.dna.toplevel.fa.gz",
        },
        {
            # NOTE(review): pinned to release-60 while the other bacteria
            # entries use release-59 -- confirm this is intended.
            "species_name": "Chlorobaculum_tepidum",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-60/fasta/bacteria_0_collection/chlorobaculum_tepidum_tls_gca_000006985/",
            "cds_file": "cds/Chlorobaculum_tepidum_tls_gca_000006985.ASM698v1.cds.all.fa.gz",
            "ncrna_file": "",
            "dna_file": "dna/Chlorobaculum_tepidum_tls_gca_000006985.ASM698v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Plasmodium_falciparum",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/plasmodium_falciparum/",
            "cds_file": "cds/Plasmodium_falciparum.ASM276v2.cds.all.fa.gz",
            "ncrna_file": "ncrna/Plasmodium_falciparum.ASM276v2.ncrna.fa.gz",
            "dna_file": "dna/Plasmodium_falciparum.ASM276v2.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Dictyostelium_discoideum",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/dictyostelium_discoideum/",
            "cds_file": "cds/Dictyostelium_discoideum.dicty_2.7.cds.all.fa.gz",
            "ncrna_file": "ncrna/Dictyostelium_discoideum.dicty_2.7.ncrna.fa.gz",
            "dna_file": "dna/Dictyostelium_discoideum.dicty_2.7.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Arabidopsis_thaliana",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/arabidopsis_thaliana/",
            "cds_file": "cds/Arabidopsis_thaliana.TAIR10.cds.all.fa.gz",
            "ncrna_file": "ncrna/Arabidopsis_thaliana.TAIR10.ncrna.fa.gz",
            "dna_file": "dna/Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Oryza_sativa",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/oryza_sativa/",
            "cds_file": "cds/Oryza_sativa.IRGSP-1.0.cds.all.fa.gz",
            "ncrna_file": "ncrna/Oryza_sativa.IRGSP-1.0.ncrna.fa.gz",
            "dna_file": "dna/Oryza_sativa.IRGSP-1.0.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Zea_mays",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/fasta/zea_mays/",
            "cds_file": "cds/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.cds.all.fa.gz",
            "ncrna_file": "ncrna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.ncrna.fa.gz",
            "dna_file": "dna/Zea_mays.Zm-B73-REFERENCE-NAM-5.0.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Aspergillus_nidulans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/aspergillus_nidulans/",
            "cds_file": "cds/Aspergillus_nidulans.ASM1142v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Aspergillus_nidulans.ASM1142v1.ncrna.fa.gz",
            "dna_file": "dna/Aspergillus_nidulans.ASM1142v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Saccharomyces_cerevisiae",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/saccharomyces_cerevisiae/",
            "cds_file": "cds/Saccharomyces_cerevisiae.R64-1-1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Saccharomyces_cerevisiae.R64-1-1.ncrna.fa.gz",
            "dna_file": "dna/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Penicilliopsis_zonata",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/fungi/release-59/fasta/fungi_ascomycota3_collection/penicilliopsis_zonata_cbs_506_65_gca_001890105/",
            "cds_file": "cds/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.ncrna.fa.gz",
            "dna_file": "dna/Penicilliopsis_zonata_cbs_506_65_gca_001890105.Aspzo1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Apis_mellifera",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/apis_mellifera/",
            "cds_file": "cds/Apis_mellifera.Amel_HAv3.1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Apis_mellifera.Amel_HAv3.1.ncrna.fa.gz",
            "dna_file": "dna/Apis_mellifera.Amel_HAv3.1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Caenorhabditis_elegans",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/caenorhabditis_elegans/",
            "cds_file": "cds/Caenorhabditis_elegans.WBcel235.cds.all.fa.gz",
            "ncrna_file": "ncrna/Caenorhabditis_elegans.WBcel235.ncrna.fa.gz",
            "dna_file": "dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Drosophila_melanogaster",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/metazoa/release-59/fasta/drosophila_melanogaster/",
            "cds_file": "cds/Drosophila_melanogaster.BDGP6.46.cds.all.fa.gz",
            "ncrna_file": "ncrna/Drosophila_melanogaster.BDGP6.46.ncrna.fa.gz",
            "dna_file": "dna/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Streptococcus_pneumoniae",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_0_collection/streptococcus_pneumoniae_tigr4_gca_000006885/",
            "cds_file": "cds/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.ncrna.fa.gz",
            "dna_file": "dna/Streptococcus_pneumoniae_tigr4_gca_000006885.ASM688v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Escherichia_coli",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/bacteria/release-59/fasta/bacteria_12_collection/escherichia_coli_gca_001606525/",
            "cds_file": "cds/Escherichia_coli_gca_001606525.ASM160652v1.cds.all.fa.gz",
            "ncrna_file": "ncrna/Escherichia_coli_gca_001606525.ASM160652v1.ncrna.fa.gz",
            "dna_file": "dna/Escherichia_coli_gca_001606525.ASM160652v1.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Tetrahymena_thermophila",
            "ftp_directory": "https://ftp.ensemblgenomes.ebi.ac.uk/pub/protists/release-59/fasta/tetrahymena_thermophila/",
            "cds_file": "cds/Tetrahymena_thermophila.JCVI-TTA1-2.2.cds.all.fa.gz",
            "ncrna_file": "ncrna/Tetrahymena_thermophila.JCVI-TTA1-2.2.ncrna.fa.gz",
            "dna_file": "dna/Tetrahymena_thermophila.JCVI-TTA1-2.2.dna.toplevel.fa.gz",
        },
        {
            "species_name": "Danio_rerio",
            "ftp_directory": "https://ftp.ensembl.org/pub/release-112/fasta/danio_rerio/",
            "cds_file": "cds/Danio_rerio.GRCz11.cds.all.fa.gz",
            "ncrna_file": "ncrna/Danio_rerio.GRCz11.ncrna.fa.gz",
            "dna_file": "dna/Danio_rerio.GRCz11.dna.toplevel.fa.gz",
        },

        # Disabled viral entries, kept for reference. Two of them have no
        # download directory filled in.
        # {
        # "species_name": "Sars_Cov2",
        # "ftp_directory": "https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/858/895/GCF_009858895.2_ASM985889v3/",
        # "cds_file": "cds/Sars_Cov2.cds.all.fa.gz",
        # "ncrna_file": "ncrna/Sars_Cov2.ncrna.fa.gz",
        # "dna_file": "dna/Sars_Cov2.dna.toplevel.fa.gz"
        # },
        # {
        # "species_name": "Human_alphaherpesvirus_2",
        # "ftp_directory": "",
        # "cds_file": "cds/Human_alphaherpesvirus_2.cds.all.fa.gz",
        # "ncrna_file": "ncrna/Human_alphaherpesvirus_2.ncrna.fa.gz",
        # "dna_file": "dna/Human_alphaherpesvirus_2.dna.toplevel.fa.gz"
        # },
        # {
        # "species_name": "Human_papillomavirus_5",
        # "ftp_directory": "",
        # "cds_file": "cds/Human_papillomavirus_5.cds.all.fa.gz",
        # "ncrna_file": "ncrna/Human_papillomavirus_5.ncrna.fa.gz",
        # "dna_file": "dna/Human_papillomavirus_5.dna.toplevel.fa.gz"
        # }
    ]

    fieldnames = ["species_name", "ftp_directory", "cds_file", "ncrna_file", "dna_file"]
    with open(species_info_csv, mode='w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(species_data)
    print(f"created species info file at {species_info_csv}")

# Make sure the species table exists on disk before the pipeline reads it.
create_species_info_csv()

def download_and_decompress(base_url, file_name, timeout=60):
    """Download a gzip archive and return the path of its decompressed copy.

    The archive is streamed to a temporary ``.gz`` file, decompressed into a
    temporary ``.fa`` file, and the ``.gz`` file is removed again -- also when
    the download or the decompression fails.

    Args:
        base_url (str): directory URL; ``file_name`` is appended to it
            verbatim, so it needs to end with "/".
        file_name (str): path of the archive relative to ``base_url``.
        timeout (float | tuple): forwarded to ``requests.get`` so a stalled
            connection cannot block forever.

    Returns:
        str: path of the decompressed temporary file. The caller owns it and
        is responsible for deleting it.

    Raises:
        FileNotFoundError: if the server answers with a status other than 200.
        Exception: errors raised by ``requests`` or ``gzip`` propagate
            unchanged, after the temporary files have been cleaned up.
    """
    url = f"{base_url}{file_name}"
    gz_path = None
    fasta_path = None
    try:
        # Using the response as a context manager releases the connection
        # even when the body is only partly consumed.
        with requests.get(url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                raise FileNotFoundError(f"file '{file_name}' not found at {url}")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".gz") as temp_gz:
                gz_path = temp_gz.name
                for chunk in response.iter_content(chunk_size=8192):
                    temp_gz.write(chunk)

        # Write through the handle we already hold instead of reopening the
        # file by name, so no descriptor is left open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".fa") as f_out:
            fasta_path = f_out.name
            with gzip.open(gz_path, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # A failed run must not leave a partial .fa file behind.
        if fasta_path is not None and os.path.exists(fasta_path):
            os.remove(fasta_path)
        raise
    finally:
        if gz_path is not None and os.path.exists(gz_path):
            os.remove(gz_path)
    return fasta_path

def clean_sequence_file(filepath):
    """Return the non-header lines of a FASTA file joined by single spaces.

    Lines starting with '>' are dropped; every other line is stripped of
    surrounding whitespace and kept, including blank lines (as empty strings).

    NOTE(review): the join is per *line*, not per FASTA record, so a record
    wrapped over several lines ends up as several space-separated pieces.
    """
    with open(filepath, 'r') as fasta:
        body_lines = [raw.strip() for raw in fasta if not raw.startswith('>')]
    return ' '.join(body_lines)

def subsample_cds_ncrna(sequence, N=10**7, delimiter="."):
    """Subsample cds or ncrna sequences, appending a delimiter to each one.

    Args:
        sequence (str): the full sequence string to subsample. It is split on
            whitespace and every resulting piece is treated as one sequence.
        N (int): target length for subsampling, counted in characters of the
            pieces themselves (delimiters are not counted).
        delimiter (str): string appended to each selected piece to mark its end.

    Returns:
        str: the selected pieces, each followed by ``delimiter``, one per line.
        Empty string when ``sequence`` contains no pieces.

    Pieces are visited in a fixed pseudo-random order (seed 1872) and collected
    until their combined length reaches N. The piece that crosses the threshold
    is included, so at least one piece is always returned for non-empty input.
    """
    cdss = sequence.split()
    if not cdss:
        # choice() on an empty population is an error in some numpy versions.
        return ""

    # A private RandomState yields the same permutation as seeding the global
    # numpy RNG with 1872, without resetting that global state for callers.
    rng = np.random.RandomState(1872)
    randomized_indices = rng.choice(len(cdss), len(cdss), replace=False)

    subsampled_cdss = []
    subsample_length = 0
    for k in randomized_indices:
        c = cdss[k]
        subsample_length += len(c)
        subsampled_cdss.append(c + delimiter)
        if subsample_length >= N:
            break
    return "\n".join(subsampled_cdss)

def subsample_full_genome(sequence, N=10**7, chunk_size=4096):
    """Subsample a full genome sequence as fixed-size chunks.

    Args:
        sequence (str): full genome sequence as a single string.
        N (int): target length for the subsample, in characters.
        chunk_size (int): size of each sequence chunk; the last chunk of the
            genome may be shorter. The default of 4096 is an arbitrary value
            chosen to prevent sentencepiece errors.

    Returns:
        str: the selected chunks, one per line. Empty string for empty input.

    Raises:
        ValueError: if ``chunk_size`` is not positive.

    Chunks are visited in a fixed pseudo-random order (seed 1872) and collected
    until their combined length reaches N; the chunk that crosses the threshold
    is included.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Shuffle chunk start offsets rather than the chunks themselves: the
    # permutation depends only on the list length, so the result is the same,
    # but only the chunks that are actually selected get sliced out.
    offsets = list(range(0, len(sequence), chunk_size))
    # A private Random(1872) produces the same shuffle as seeding the global
    # random module, without resetting that global state for callers.
    random.Random(1872).shuffle(offsets)

    sampled_chunks = []
    sampled_length = 0
    for start in offsets:
        chunk = sequence[start:start + chunk_size]
        sampled_chunks.append(chunk)
        sampled_length += len(chunk)
        if sampled_length >= N:
            break
    return "\n".join(sampled_chunks)

def train_and_tokenize(input_text, model_prefix, tokenized_output_file, vocab_size=10000):
    """Train a SentencePiece BPE tokenizer on ``input_text`` and tokenize that text.

    Args:
        input_text (str): training corpus; the same text is tokenized afterwards.
        model_prefix (str): prefix handed to the trainer; the trained model is
            then loaded from ``f"{model_prefix}.model"``.
        tokenized_output_file (str): file that receives the pieces as a single
            space-separated string.
        vocab_size (int): vocabulary size requested from the trainer.

    The trainer reads its corpus from a file, so the text is first written to
    a temporary file and flushed to make the content visible to the trainer.
    The temporary file is deleted automatically when its ``with`` block ends.
    """
    trainer_options = dict(
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type='bpe',
        character_coverage=0.9995,
        max_sentence_length=5000,
    )

    with tempfile.NamedTemporaryFile(mode="w+", delete=True) as corpus:
        corpus.write(input_text)
        corpus.flush()
        spm.SentencePieceTrainer.train(input=corpus.name, **trainer_options)

    processor = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
    pieces = processor.encode_as_pieces(input_text)

    with open(tokenized_output_file, "w") as out_handle:
        out_handle.write(" ".join(pieces))

def load_species_info(csv_file):
    """Read a CSV file and return its rows as a list of dicts keyed by the header row."""
    with open(csv_file, 'r') as handle:
        return list(csv.DictReader(handle))

def _append_log_row(species_name, file_type, subsample_size, seconds, status):
    """Append one result row to the processing log at ``output_csv``."""
    with open(output_csv, mode='a', newline='') as file:
        csv.writer(file).writerow(
            [species_name, file_type, subsample_size, f"{seconds:.2f}", status]
        )


def process_all_species():
    """Download, subsample and tokenize every file listed in the species table.

    For each species row and each of its cds / ncrna / dna files:
      * skip the file when both the tokenizer model and the tokenized output
        already exist;
      * skip the file when the table lists no file name for it (blank cell),
        instead of requesting the bare directory URL;
      * otherwise download and decompress it, clean it, subsample it (genome
        chunking for 'dna', per-sequence sampling for the others) and train
        and apply a tokenizer.

    A failure in one file is printed and does not stop the remaining files.
    The decompressed temporary file is removed whether or not processing
    succeeded. Every outcome is appended to the processing log; "Subsample
    Size" is the character count of the subsampled text and is left blank
    when no subsample was produced.
    """
    initialize_csv_log()
    for species in load_species_info(species_info_csv):
        species_name = species['species_name']
        ftp_url = species['ftp_directory']

        file_info = {
            "cds": species['cds_file'],
            "ncrna": species['ncrna_file'],
            "dna": species['dna_file']
        }

        for file_type, file_name in file_info.items():
            model_prefix = os.path.join(tokenizers_dir, f"{species_name}_{file_type}_tokenizer")
            tokenized_output_file = os.path.join(tokenized_sequences_dir, f"{species_name}_{file_type}_tokenized.txt")

            if os.path.exists(model_prefix + ".model") and os.path.exists(tokenized_output_file):
                print(f"tokenizer files already exist for {species_name} - {file_type}. skipping ")
                _append_log_row(species_name, file_type, "", 0.0, "skipped (already tokenized)")
                continue

            if not file_name:
                # A blank cell would otherwise turn into a request for the
                # directory listing, which is not a gzip archive.
                print(f"no {file_type} file listed for {species_name}. skipping ")
                _append_log_row(species_name, file_type, "", 0.0, "skipped (no file listed)")
                continue

            decompressed_file = None
            subsample_size = ""
            status = "success"
            started = time.perf_counter()
            try:
                decompressed_file = download_and_decompress(ftp_url, file_name)
                cleaned_seq = clean_sequence_file(decompressed_file)

                if file_type == 'dna':
                    subsample = subsample_full_genome(cleaned_seq)
                else:
                    subsample = subsample_cds_ncrna(cleaned_seq)
                subsample_size = len(subsample)

                train_and_tokenize(subsample, model_prefix, tokenized_output_file)
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"failed processing {species_name} - {file_type}: {e}")
                status = f"failed: {e}"
            finally:
                # Remove the decompressed FASTA on success and on failure alike.
                if decompressed_file is not None and os.path.exists(decompressed_file):
                    os.remove(decompressed_file)

            if status == "success":
                print(f"processed and deleted {decompressed_file} for {species_name} - {file_type}")
            _append_log_row(
                species_name, file_type, subsample_size,
                time.perf_counter() - started, status,
            )

# Run the whole pipeline when this cell/script is executed.
process_all_species()