# In [None]:
import os
import gzip
import bz2
import lzma
import csv
import subprocess
from Bio import SeqIO

def decompress_gz(file_path, output_path):
    """Decompress a gzip file into a plain text file.

    The content is copied in chunks, so the decompressed data is never held
    in memory all at once.

    Args:
        file_path: Path of the gzip-compressed input file.
        output_path: Path of the text file to create (overwritten if present).

    Returns:
        True when the file was decompressed, False (after printing a notice)
        when ``file_path`` does not exist.
    """
    # Local import keeps this function self-contained; the module header
    # does not import shutil.
    import shutil

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False
    # Both handles stay in text mode, matching the original read()/write()
    # round trip (same decoding and newline handling).
    with gzip.open(file_path, 'rt') as f_in, open(output_path, 'w') as f_out:
        shutil.copyfileobj(f_in, f_out)
    return True

def parse_fasta(file_path):
    """Read a FASTA file and reduce it to its A/C/G/T content.

    Each record's sequence is upper-cased and every character other than
    A, C, G or T is dropped; the cleaned records are then concatenated.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A tuple ``(record_count, cleaned_length, cleaned_sequence)``.
        When the file does not exist a notice is printed and
        ``(0, 0, '')`` is returned.
    """
    if not os.path.exists(file_path):
        print(f"File not found for parsing: {file_path}")
        return 0, 0, ''

    keep_base = 'ACGT'.__contains__
    cleaned_records = [
        ''.join(filter(keep_base, str(entry.seq).upper()))
        for entry in SeqIO.parse(file_path, "fasta")
    ]
    merged = ''.join(cleaned_records)
    # The merged length equals the sum of the per-record cleaned lengths.
    return len(cleaned_records), len(merged), merged

def compute_gc_content(sequence):
    """Return the percentage of G and C bases in ``sequence``.

    Args:
        sequence: The bases to inspect; each element is tested for
            membership in ``'GC'``.

    Returns:
        The GC share as a float in the range 0-100, or the integer 0
        (after printing a notice) when ``sequence`` is empty.
    """
    if sequence:
        gc_total = len([base for base in sequence if base in 'GC'])
        gc_fraction = gc_total / len(sequence)
        return gc_fraction * 100

    print("No sequence data available for GC content calculation.")
    return 0

def compress_and_ratio(original_data, method):
    """Compress ``original_data`` in memory and report the size ratio.

    Args:
        original_data: Text to compress; it is encoded as UTF-8 first.
        method: One of ``'lzma'``, ``'gzip'`` or ``'bzip2'``.

    Returns:
        Compressed size as a percentage of the encoded size, 0 when
        ``original_data`` is empty, or None when ``method`` is not one of
        the supported names.
    """
    if not original_data:
        return 0

    raw = original_data.encode('utf-8')
    compressors = {
        'lzma': lzma.compress,
        'gzip': gzip.compress,
        'bzip2': bz2.compress,
    }
    compressor = compressors.get(method)
    if compressor is None:
        return None

    packed_len = len(compressor(raw))
    raw_len = len(raw)
    if raw_len > 0:
        return 100 * packed_len / raw_len
    return 0

def geco3_compress_and_ratio(original_data, original_file_path):
    """Compress ``original_data`` with the external GeCo3 binary and report the ratio.

    The data is written to ``original_file_path + ".fasta"``, the program at
    ``geco3/src/GeCo3`` (relative to the working directory) is run on that
    file, and the result is read from the same path with ``.co`` appended.
    Both files are removed afterwards, whether or not the run succeeded.

    Args:
        original_data: Sequence text to compress.
        original_file_path: Base path used to derive the temporary file names.

    Returns:
        Compressed size as a percentage of the temporary input file size.
        Returns 0 when ``original_data`` is empty or when any step fails
        (the error is printed, not raised).
    """
    if not original_data:
        return 0

    temp_fasta = original_file_path + ".fasta"
    geco3_compressed_file = temp_fasta + ".co"
    try:
        with open(temp_fasta, 'w') as f:
            f.write(original_data)
        # Pass an argument list without a shell so paths containing spaces
        # or shell metacharacters reach GeCo3 unmodified.
        subprocess.run(
            ['geco3/src/GeCo3', temp_fasta],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # A failed run is detected here: the expected output file is missing.
        compressed_size = os.path.getsize(geco3_compressed_file)
        original_size = os.path.getsize(temp_fasta)
        return (compressed_size / original_size) * 100
    except Exception as e:
        # Deliberately best-effort: one failing file must not abort a batch.
        print(f"Error during GECO3 compression: {e}")
        return 0
    finally:
        # Clean up on every path so failures do not leave temp files behind.
        for leftover in (temp_fasta, geco3_compressed_file):
            if os.path.exists(leftover):
                os.remove(leftover)

# Root folder that is scanned; each sub-directory is treated as a kingdom.
dropbox_dir = "dropbox"
# Path of the summary table written at the end of the run.
output_csv = "output_for_googlespreadsheet.csv"

# CSV columns: two identifying columns, then the same seven metrics for
# each sequence class (cds_number ... cds_gc_content, ncrna_number ...
# ncrna_gc_content).
headers = ["kingdom", "organism"] + [
    f"{prefix}_{metric}"
    for prefix in ("cds", "ncrna")
    for metric in (
        "number", "length", "lzma_ratio", "gzip_ratio",
        "bzip2_ratio", "geco3_ratio", "gc_content",
    )
]

# Maps organism name -> dict holding one value per entry of `headers`.
organism_data = {}

# Scan dropbox_dir/<kingdom>/ for gzip-compressed FASTA files, compute the
# statistics of each file and merge them into organism_data: one record per
# organism, with separate cds_* and ncrna_* columns.
# Listings are sorted so the resulting row order is reproducible.
for kingdom in sorted(os.listdir(dropbox_dir)):
    kingdom_path = os.path.join(dropbox_dir, kingdom)
    if not os.path.isdir(kingdom_path):
        continue

    for file_name in sorted(os.listdir(kingdom_path)):
        # Require the real ".gz" extension. A bare 'gz' suffix would also
        # accept names whose path is not shortened below, and the source
        # file would then be overwritten and deleted.
        if not file_name.endswith('.gz'):
            continue

        print(kingdom, ':', file_name)
        # The organism name is everything before the first dot.
        organism_name = file_name.split('.')[0]
        file_type = 'ncrna' if 'ncrna' in file_name else 'cds'
        file_path = os.path.join(kingdom_path, file_name)
        # Strip only the trailing extension; str.replace would also rewrite
        # any ".gz" that occurs earlier in the path.
        decompressed_path = file_path[:-len('.gz')]

        try:
            if not decompress_gz(file_path, decompressed_path):
                continue

            num_sequences, cleaned_length, cleaned_seq = parse_fasta(decompressed_path)

            if organism_name not in organism_data:
                # Derive the keys from `headers` so every column written
                # to the CSV is guaranteed to exist in the record.
                record = dict.fromkeys(headers, 0)
                record['kingdom'] = kingdom
                record['organism'] = organism_name
                organism_data[organism_name] = record

            # Only the columns of this file's sequence class are updated.
            organism_data[organism_name].update({
                f'{file_type}_number': num_sequences,
                f'{file_type}_length': cleaned_length,
                f'{file_type}_lzma_ratio': compress_and_ratio(cleaned_seq, 'lzma'),
                f'{file_type}_gzip_ratio': compress_and_ratio(cleaned_seq, 'gzip'),
                f'{file_type}_bzip2_ratio': compress_and_ratio(cleaned_seq, 'bzip2'),
                f'{file_type}_geco3_ratio': geco3_compress_and_ratio(cleaned_seq, decompressed_path),
                f'{file_type}_gc_content': compute_gc_content(cleaned_seq),
            })
        finally:
            # Remove the decompressed copy even when a step above fails.
            if os.path.exists(decompressed_path):
                os.remove(decompressed_path)

# Write the header line followed by one row per organism, with the values
# ordered exactly as the column names in `headers`.
with open(output_csv, 'w', newline='') as out_handle:
    table = csv.writer(out_handle)
    table.writerow(headers)
    table.writerows(
        [record[column] for column in headers]
        for record in organism_data.values()
    )