# Compressing biotypes from Ensembl
# You don't need to run this notebook! It takes a while and always produces the same output, which is already saved in the CSV files.

In [1]:
import os
import concurrent.futures
import threading
import csv
import queue
import gzip
import shutil
import ftplib
from ftplib import FTP
from Bio import SeqIO
import subprocess
from typing import List, Dict, Optional


In [2]:
def find_fasta_file(ftp_server, ftp_directory, file_suffix):
    """
    Looks up the name of a file on an FTP server by its suffix, without downloading it.

    Logs in anonymously to `ftp_server`, changes into `ftp_directory` and returns
    the first listed name that ends with `file_suffix`.

    Raises:
    - FileNotFoundError: if no listed name ends with `file_suffix`.
    """
    with FTP(ftp_server) as connection:
        connection.login()  # anonymous login
        connection.cwd(ftp_directory)
        listing = connection.nlst()

        # First entry whose name carries the requested suffix, if any
        match = next((name for name in listing if name.endswith(file_suffix)), None)
        if match is None:
            raise FileNotFoundError(f"No file ending with '{file_suffix}' found in the directory.")
        return match

def find_download_and_decompress_from_ftp(
        ftp_server: str,
        ftp_directory: str,
        file_suffix: str):
    """
    Finds a gzipped file on an FTP server by suffix, downloads it into the
    current working directory and decompresses it.

    Parameters:
    - ftp_server: The FTP server address (anonymous login is used).
    - ftp_directory: The directory on the server to search.
    - file_suffix: The suffix the wanted file name must end with.

    Returns:
    - The local path of the decompressed file. The downloaded gzipped file is
      removed after decompression.

    Raises:
    - FileNotFoundError: if no listed name ends with `file_suffix`.
    """
    with FTP(ftp_server) as ftp:
        ftp.login()  # Login anonymously
        ftp.cwd(ftp_directory)
        files = ftp.nlst()  # List all files in the directory

        # Find the first file that ends with the specified suffix
        for file in files:
            if not file.endswith(file_suffix):
                continue

            local_filename = file
            with open(local_filename, 'wb') as local_file:
                ftp.retrbinary(f"RETR {file}", local_file.write)

            # str.rstrip('.gz') strips any trailing '.', 'g' or 'z' characters,
            # not the literal suffix, so cut the extension off explicitly.
            if local_filename.endswith('.gz'):
                decompressed_file = local_filename[:-len('.gz')]
            else:
                # Never decompress onto the file that is being read.
                decompressed_file = local_filename + '.decompressed'

            with gzip.open(local_filename, 'rb') as f_in:
                with open(decompressed_file, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(local_filename)  # The gzipped download is no longer needed
            return decompressed_file
        raise FileNotFoundError(f"No file ending with '{file_suffix}' found in the directory.")

def decompress_gz_file(gz_file):
    """
    Decompresses a .gz file next to the original and deletes the original.

    Parameters:
    - gz_file: Path of the gzipped file.

    Returns:
    - The path of the decompressed file: `gz_file` without its '.gz' extension,
      or `gz_file + '.decompressed'` when the name does not end in '.gz'.
    """
    # str.rstrip('.gz') strips any trailing '.', 'g' or 'z' characters rather
    # than the literal suffix (e.g. 'seqg.gz' -> 'seq'), so slice it off instead.
    if gz_file.endswith('.gz'):
        decompressed_file = gz_file[:-len('.gz')]
    else:
        # Never decompress onto the file that is being read.
        decompressed_file = gz_file + '.decompressed'

    with gzip.open(gz_file, 'rb') as f_in, open(decompressed_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(gz_file)  # The gzipped file is no longer needed
    return decompressed_file

def calculate_compression_ratio(fasta_file, geco_bin_dir='/Users/celia/VSCode/dnacomp/geco3/src'):
    """
    Compresses a file with GeCo3 and with gzip and reports both compression ratios.

    The file is compressed with GeCo3 (producing `fasta_file + '.co'`), then
    decompressed again with GeDe3 (producing `fasta_file + '.de'`). Both ratios
    are computed against the size of that '.de' file, which is also the file
    that gets gzipped. All three intermediate files are removed before returning.

    Parameters:
    - fasta_file: Path of the file to compress.
    - geco_bin_dir: Directory containing the `GeCo3` and `GeDe3` executables.

    Returns:
    - A tuple `(ratio, ratio_gz, tot_len)`: the GeCo3 and gzip compressed sizes
      as a percentage of the '.de' file size, and the number of characters in
      the '.de' file.

    Raises:
    - FileNotFoundError: if an executable or an expected output file is missing.
    - ZeroDivisionError: if the '.de' file is empty.
    """
    compressed_file_name = fasta_file + '.co'
    uncompressed_file_name = fasta_file + '.de'
    gz_file_name = fasta_file + '.gz'

    # Files this call may have created; removed even when a step fails.
    produced = []
    try:
        # Pass the arguments as a list (no shell) so file names containing
        # spaces or shell metacharacters are handed over verbatim.
        produced.append(compressed_file_name)
        subprocess.run(
            [os.path.join(geco_bin_dir, 'GeCo3'), fasta_file],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        compressed_size = os.path.getsize(compressed_file_name)

        # Decompress the GeCo3 output again
        produced.append(uncompressed_file_name)
        subprocess.run(
            [os.path.join(geco_bin_dir, 'GeDe3'), compressed_file_name],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        original_size = os.path.getsize(uncompressed_file_name)

        # Count characters in chunks rather than loading the whole file at once
        tot_len = 0
        with open(uncompressed_file_name, 'r') as file:
            for chunk in iter(lambda: file.read(1 << 20), ''):
                tot_len += len(chunk)

        ratio = (compressed_size / original_size) * 100

        # gzip the same decompressed file so both ratios share one baseline
        produced.append(gz_file_name)
        with open(uncompressed_file_name, 'rb') as f_in:
            with gzip.open(gz_file_name, 'wb') as f_out:
                f_out.writelines(f_in)
        compressed_size_gz = os.path.getsize(gz_file_name)
        ratio_gz = (compressed_size_gz / original_size) * 100
    finally:
        for path in produced:
            if os.path.exists(path):
                os.remove(path)

    return ratio, ratio_gz, tot_len

def list_organisms(ftp_server: str, ftp_directory: str, starting_organism: Optional[str] = None) -> List[str]:
    """
    Lists the entries (organisms) of a directory on an FTP server in sorted order.

    Parameters:
    - ftp_server: The FTP server address (anonymous login is used).
    - ftp_directory: The directory on the FTP server whose entries are listed.
    - starting_organism: An optional organism name; when it is present in the
      listing, the result begins at that name.

    Returns:
    - The sorted list of names, truncated to start at `starting_organism` when
      that name is given and found; otherwise the full sorted list.
    """
    with FTP(ftp_server) as connection:
        connection.login()  # anonymous login
        connection.cwd(ftp_directory)
        names = connection.nlst()
    names.sort()

    if not starting_organism:
        return names

    try:
        first = names.index(starting_organism)
    except ValueError:
        # Unknown starting point: fall back to the complete listing
        return names
    return names[first:]

def q_write_result_to_csv(q: queue.Queue[Optional[Dict[str, int]]], fieldnames: List, output_csv_file: str) -> None:
    """
    Drains result dictionaries from a queue into a CSV file until a None arrives.

    Parameters:
    - q: The queue to read from; each item is a row dictionary, and None tells
      this function to stop.
    - fieldnames: The CSV column names, in output order.
    - output_csv_file: The path of the CSV file, opened in append mode.

    Returns:
    - None
    """
    with open(output_csv_file, 'a', newline='') as out:
        rows = csv.DictWriter(out, fieldnames=fieldnames)

        # A position of 0 right after opening means nothing was written yet,
        # so this is the only time the header row is emitted.
        nothing_written_yet = out.tell() == 0
        if nothing_written_yet:
            rows.writeheader()

        # iter() with a sentinel keeps calling q.get() until it yields None
        for record in iter(q.get, None):
            rows.writerow(record)
            out.flush()  # push each row to disk as soon as it is written
            q.task_done()

def worker(organism: str, ftp_server: str, base_ftp_directory: str, q: queue.Queue[Optional[Dict[str, int]]]) -> None:
    """
    Processes one organism and places its result dictionary on the queue.
    """
    q.put(process_species(organism, ftp_server, base_ftp_directory))

def process_all_organisms(
        ftp_server: str,
        base_ftp_directory: str,
        output_csv_file: str,
        starting_organism: Optional[str] = None) -> None:
    """
    Processes all organisms in the specified base FTP directory.

    Organisms are processed by a pool of 8 worker threads; each result is put
    on a queue that a single writer thread appends to the CSV file.

    Parameters:
    - ftp_server: The FTP server address.
    - base_ftp_directory: The directory whose entries are the organisms.
    - output_csv_file: The path of the CSV file that results are appended to.
    - starting_organism: An optional organism name to start the list from.

    Returns:
    - None
    """
    organisms = list_organisms(ftp_server, base_ftp_directory, starting_organism)
    fieldnames = [
        "species_name", "cds_num_sequences", "cds_tot_len", "cds_compression_ratio_geco3",
        "cds_compression_ratio_gz", "ncrna_num_sequences", "ncrna_tot_len",
        "ncrna_compression_ratio_geco3", "ncrna_compression_ratio_gz"
    ]
    q = queue.Queue()

    # Start the CSV writer thread
    writer_thread = threading.Thread(target=q_write_result_to_csv, args=(q, fieldnames, output_csv_file))
    writer_thread.start()

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
            futures = {executor.submit(worker, organism, ftp_server, base_ftp_directory, q): organism for organism in organisms}

            # Wait for all the workers to complete
            for future in concurrent.futures.as_completed(futures):
                organism_name = futures[future]  # Get the organism name from the dictionary
                try:
                    future.result()  # This raises any exception encountered during execution
                    print(f'{organism_name} completed successfully!')
                except Exception as exc:
                    print(f'{organism_name} generated an exception: {exc}')
    finally:
        # Always send the stop signal, even when the run is interrupted;
        # otherwise the writer thread would stay blocked on q.get() forever.
        q.put(None)
        writer_thread.join()
    
def process_species(species_name, ftp_server, base_ftp_directory):
    """
    Processes a species: downloads and decompresses its cds and ncrna FASTA files,
    then counts the sequences and calculates the compression ratios of each.

    Parameters:
    - species_name: The organism's directory name under `base_ftp_directory`.
    - ftp_server: The FTP server address.
    - base_ftp_directory: The directory that contains the organism directories.

    Returns:
    - A dictionary with "species_name" plus, for each of "cds" and "ncrna", the
      number of sequences, total length and GeCo3/gzip compression ratios.
      Fields of a data type whose directory or file is missing stay at 0.
    """
    results = {
        "species_name": species_name,
        "cds_num_sequences": 0,
        "cds_tot_len": 0,
        "cds_compression_ratio_geco3": 0,
        "cds_compression_ratio_gz": 0,
        "ncrna_num_sequences": 0,
        "ncrna_tot_len": 0,
        "ncrna_compression_ratio_geco3": 0,
        "ncrna_compression_ratio_gz": 0,
    }

    # Directories and file suffixes for cds, ncrna
    data_types = [("cds", "cds.all.fa.gz"), ("ncrna", "ncrna.fa.gz")]

    for data_type, file_suffix in data_types:
        ftp_directory = f"{base_ftp_directory}/{species_name}/{data_type}/"

        try:
            # Attempt to find, download, and decompress the FASTA file
            decompressed_fasta = find_download_and_decompress_from_ftp(ftp_server, ftp_directory, file_suffix)
        except (FileNotFoundError, ftplib.error_perm) as e:
            print(f"Directory or file not found for {species_name} in {data_type} folder: {e}")
            # Skip processing if the directory or file is not found, fields remain at 0
            continue

        try:
            # Calculate the compression ratios
            cr, crg, tl = calculate_compression_ratio(decompressed_fasta)

            num_seqs = sum(1 for _ in SeqIO.parse(decompressed_fasta, "fasta"))
            # Store results in the appropriate fields
            results[f"{data_type}_num_sequences"] = num_seqs
            results[f"{data_type}_compression_ratio_geco3"] = cr
            results[f"{data_type}_compression_ratio_gz"] = crg
            results[f"{data_type}_tot_len"] = tl

            # Print the sequence count, total length and compression ratios
            print(data_type, num_seqs, ", total = ", tl)
            print(f"The GECO3 compression ratio for {decompressed_fasta} is {cr:.2f}%")
            print(f"The GZIP compression ratio for {decompressed_fasta} is {crg:.2f}%")
        finally:
            # Remove the downloaded FASTA even when compression or parsing fails,
            # so failed organisms do not leave files behind.
            os.remove(decompressed_fasta)
    return results


In [7]:
# Usage: bacteria
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ensemblgenomes.ebi.ac.uk"
base_ftp_directory = "/pub/bacteria/current/fasta/bacteria_0_collection"
output_csv_file = "compression_ratios_bact.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

Processing acinetobacter_baumannii_aye_gca_000069245...
cds 3712 , total =  3499542
The GECO3 compression ratio for Acinetobacter_baumannii_aye_gca_000069245.ASM6924v1.cds.all.fa is 23.53%
The GZIP compression ratio for Acinetobacter_baumannii_aye_gca_000069245.ASM6924v1.cds.all.fa is 27.82%
ncrna 91 , total =  32793
The GECO3 compression ratio for Acinetobacter_baumannii_aye_gca_000069245.ASM6924v1.ncrna.fa is 6.65%
The GZIP compression ratio for Acinetobacter_baumannii_aye_gca_000069245.ASM6924v1.ncrna.fa is 8.54%
Results for acinetobacter_baumannii_aye_gca_000069245 written to species_compression_ratios_bact_looptest.csv
Processing actinobacillus_pleuropneumoniae_serovar_5b_str_l20_gca_000015885...
cds 2012 , total =  1964559
The GECO3 compression ratio for Actinobacillus_pleuropneumoniae_serovar_5b_str_l20_gca_000015885.ASM1588v1.cds.all.fa is 23.31%
The GZIP compression ratio for Actinobacillus_pleuropneumoniae_serovar_5b_str_l20_gca_000015885.ASM1588v1.cds.all.fa is 27.55%
ncrna 

In [11]:
# Usage: vertebrates
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ensembl.org" 
base_ftp_directory = "/pub/current_fasta"
output_csv_file = "compression_ratios_vert.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

cds 18698 , total =  30229734
The GECO3 compression ratio for Cavia_aperea.CavAp1.0.cds.all.fa is 18.17%
The GZIP compression ratio for Cavia_aperea.CavAp1.0.cds.all.fa is 20.32%
ncrna 3614 , total =  468177
The GECO3 compression ratio for Cavia_aperea.CavAp1.0.ncrna.fa is 14.55%
The GZIP compression ratio for Cavia_aperea.CavAp1.0.ncrna.fa is 19.30%
cavia_aperea completed successfully!
cds 25582 , total =  43333286
The GECO3 compression ratio for Cavia_porcellus.Cavpor3.0.cds.all.fa is 18.22%
The GZIP compression ratio for Cavia_porcellus.Cavpor3.0.cds.all.fa is 20.03%
ncrna 9022 , total =  3529522
The GECO3 compression ratio for Cavia_porcellus.Cavpor3.0.ncrna.fa is 18.70%
The GZIP compression ratio for Cavia_porcellus.Cavpor3.0.ncrna.fa is 23.04%
cavia_porcellus completed successfully!
cds 31797 , total =  51770540
The GECO3 compression ratio for Chelonoidis_abingdonii.ASM359739v1.cds.all.fa is 15.82%
The GZIP compression ratio for Chelonoidis_abingdonii.ASM359739v1.cds.all.fa is 16

In [4]:
# Usage: plants
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ebi.ac.uk" 
base_ftp_directory = "/ensemblgenomes/pub/current/plants/fasta/"
output_csv_file = "compression_ratios_plant.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

cds 27313 , total =  25807032
The GECO3 compression ratio for Amborella_trichopoda.AMTR1.0.cds.all.fa is 23.36%
The GZIP compression ratio for Amborella_trichopoda.AMTR1.0.cds.all.fa is 27.81%
ncrna 1040 , total =  149397
The GECO3 compression ratio for Amborella_trichopoda.AMTR1.0.ncrna.fa is 16.20%
The GZIP compression ratio for Amborella_trichopoda.AMTR1.0.ncrna.fa is 21.41%
amborella_trichopoda completed successfully!
cds 25783 , total =  29489560
The GECO3 compression ratio for Ananas_comosus.F153.cds.all.fa is 22.27%
The GZIP compression ratio for Ananas_comosus.F153.cds.all.fa is 27.94%
Directory or file not found for ananas_comosus in ncrna folder: 550 Failed to change directory.
ananas_comosus completed successfully!
cds 32667 , total =  35412914
The GECO3 compression ratio for Arabidopsis_lyrata.v.1.0.cds.all.fa is 23.01%
The GZIP compression ratio for Arabidopsis_lyrata.v.1.0.cds.all.fa is 27.73%
ncrna 795 , total =  113407
The GECO3 compression ratio for Arabidopsis_lyrata.

In [7]:
# Usage: fungi
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ensemblgenomes.org" 
base_ftp_directory = "/pub/fungi/current/fasta/"
output_csv_file = "compression_ratios_fungi.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

cds 4776 , total =  7020033
The GECO3 compression ratio for Ashbya_gossypii.ASM9102v1.cds.all.fa is 24.18%
The GZIP compression ratio for Ashbya_gossypii.ASM9102v1.cds.all.fa is 28.25%
ncrna 725 , total =  275179
The GECO3 compression ratio for Ashbya_gossypii.ASM9102v1.ncrna.fa is 2.80%
The GZIP compression ratio for Ashbya_gossypii.ASM9102v1.ncrna.fa is 4.14%
ashbya_gossypii completed successfully!
cds 9121 , total =  13531458
The GECO3 compression ratio for Aspergillus_clavatus.ASM271v1.cds.all.fa is 24.28%
The GZIP compression ratio for Aspergillus_clavatus.ASM271v1.cds.all.fa is 28.17%
cds 9623 , total =  14234529
The GECO3 compression ratio for Aspergillus_fumigatus.ASM265v1.cds.all.fa is 24.39%
The GZIP compression ratio for Aspergillus_fumigatus.ASM265v1.cds.all.fa is 28.24%
cds 9929 , total =  14433496
The GECO3 compression ratio for Aspergillus_fumigatusa1163.ASM15014v1.cds.all.fa is 24.39%
The GZIP compression ratio for Aspergillus_fumigatusa1163.ASM15014v1.cds.all.fa is 28.

In [4]:
# Usage: metazoa
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ensemblgenomes.org" 
base_ftp_directory = "/pub/metazoa/current/fasta/"
output_csv_file = "compression_ratios_metazoa.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

cds 20241 , total =  37975243
The GECO3 compression ratio for Acromyrmex_echinatior_gca000204515v1rs.Aech_3.9.cds.all.fa is 13.54%
The GZIP compression ratio for Acromyrmex_echinatior_gca000204515v1rs.Aech_3.9.cds.all.fa is 15.30%
ncrna 1369 , total =  1175392
The GECO3 compression ratio for Acromyrmex_echinatior_gca000204515v1rs.Aech_3.9.ncrna.fa is 17.45%
The GZIP compression ratio for Acromyrmex_echinatior_gca000204515v1rs.Aech_3.9.ncrna.fa is 21.20%
acromyrmex_echinatior_gca000204515v1rs completed successfully!
cds 24967 , total =  44401530
The GECO3 compression ratio for Adelges_cooleyi_gca023614345v1rs.UGA_ACOO_1.1.cds.all.fa is 12.35%
The GZIP compression ratio for Adelges_cooleyi_gca023614345v1rs.UGA_ACOO_1.1.cds.all.fa is 14.22%
ncrna 1983 , total =  1743407
The GECO3 compression ratio for Adelges_cooleyi_gca023614345v1rs.UGA_ACOO_1.1.ncrna.fa is 10.64%
The GZIP compression ratio for Adelges_cooleyi_gca023614345v1rs.UGA_ACOO_1.1.ncrna.fa is 16.38%
adelges_cooleyi_gca023614345v

In [3]:
# Usage: protists
# Processes every organism directory under base_ftp_directory on ftp_server
# and appends one result row per organism to output_csv_file.

ftp_server = "ftp.ensemblgenomes.org" 
base_ftp_directory = "/pub/protists/current/fasta/"
output_csv_file = "compression_ratios_protist.csv"

process_all_organisms(ftp_server, base_ftp_directory, output_csv_file)

print(f"Results saved to {output_csv_file}")

cds 7364 , total =  9778297
The GECO3 compression ratio for Giardia_lamblia.GL2.cds.all.fa is 22.81%
The GZIP compression ratio for Giardia_lamblia.GL2.cds.all.fa is 28.16%
ncrna 90 , total =  19645
The GECO3 compression ratio for Giardia_lamblia.GL2.ncrna.fa is 13.05%
The GZIP compression ratio for Giardia_lamblia.GL2.ncrna.fa is 16.13%
giardia_lamblia completed successfully!
cds 8113 , total =  10237239
The GECO3 compression ratio for Entamoeba_histolytica.JCVI-ESG2-1.0.cds.all.fa is 20.55%
The GZIP compression ratio for Entamoeba_histolytica.JCVI-ESG2-1.0.cds.all.fa is 26.22%
ncrna 97 , total =  11011
The GECO3 compression ratio for Entamoeba_histolytica.JCVI-ESG2-1.0.ncrna.fa is 17.86%
The GZIP compression ratio for Entamoeba_histolytica.JCVI-ESG2-1.0.ncrna.fa is 22.12%
entamoeba_histolytica completed successfully!
cds 14321 , total =  13918876
The GECO3 compression ratio for Hyaloperonospora_arabidopsidis.HyaAraEmoy2_2.0.cds.all.fa is 22.23%
The GZIP compression ratio for Hyaloper