# Generating random and repetitive sequences

In [10]:
import os
import csv
import gzip
import subprocess

In [18]:
import random

def generate_random_dna_sequence(length):
    """Generate a completely random DNA sequence of a given length."""
    return ''.join(random.choice('ACGT') for _ in range(length))

def generate_motif_based_sequence(length, motif, repeat_probability=0.5):
    """Generate a DNA sequence with a certain degree of predictability by repeating a motif."""
    sequence = []
    motif_length = len(motif)
    while len(sequence) < length:
        if len(sequence) + motif_length <= length and random.random() < repeat_probability:
            sequence.append(motif)
        else:
            sequence.append(random.choice('ACGT'))
    return ''.join(sequence)

def generate_biased_sequence(length, nucleotide_bias):
    """Generate a DNA sequence with a certain nucleotide bias."""
    nucleotides = ['A', 'C', 'G', 'T']
    sequence = ''.join(random.choices(nucleotides, weights=nucleotide_bias, k=length))
    return sequence

def generate_markov_sequence(length, transition_matrix, start_nucleotide='A'):
    """Generate a DNA sequence using a simple Markov model."""
    sequence = [start_nucleotide]
    current_nucleotide = start_nucleotide
    
    for _ in range(length - 1):
        current_nucleotide = random.choices(['A', 'C', 'G', 'T'], weights=transition_matrix[current_nucleotide])[0]
        sequence.append(current_nucleotide)
    
    return ''.join(sequence)

def generate_repetitive_sequence(length, pattern='ATCG'):
    """Generate a DNA sequence by repeating a simple pattern until the desired length is reached."""
    repeat_sequence = (pattern * (length // len(pattern) + 1))[:length]
    return repeat_sequence

def generate_oneletter_sequence(length):
    """Generate a DNA sequence with a certain nucleotide bias."""
    sequence = 'A'*length
    return sequence

In [19]:
def create_fasta_file(sequence, sequence_length, sequence_type):
    """Create a mock FASTA file with the given DNA sequence, including the sequence length and type in the filename and header."""
    filename = f"mock_sequence_{sequence_type}_length_{sequence_length}.fasta"  # Filename with the sequence length and type
    header = f">MockSequence_{sequence_type}_Length_{sequence_length}"  # Header with the sequence length and type
    
    with open(filename, 'w') as f:
        f.write(header + "\n")
        f.write(sequence + "\n")
    
    return filename

def calculate_compression_ratio(fasta_file):
    """
    Compresses the text file and calculates the compression ratio.
    """
    compressed_file_name = fasta_file+'.co'
    decompressed_file_name = fasta_file+'.de'
    gz_file_name = fasta_file+'.gz'

    # Compress the fasta file using GECO3
    command = f'/Users/celia/VSCode/dnacomp/geco3/src/GeCo3 {fasta_file}'
    subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    compressed_size = os.path.getsize(compressed_file_name)

    # Decompress the compressed file
    command = f'/Users/celia/VSCode/dnacomp/geco3/src/GeDe3 {compressed_file_name}'
    subprocess.run(command, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    original_size = os.path.getsize(decompressed_file_name)

    # Calculate compression ratio
    ratio = (compressed_size / original_size) * 100

    # Calculate gz compression of decompressed file
    with open(decompressed_file_name, 'rb') as f_in:
        with gzip.open(gz_file_name, 'wb') as f_out:
            f_out.writelines(f_in)
    compressed_size_gz = os.path.getsize(gz_file_name)
    ratio_gz = (compressed_size_gz / original_size) * 100

    # Clean up temporary files
    os.remove(compressed_file_name)
    os.remove(decompressed_file_name)
    os.remove(gz_file_name)
    
    return ratio, ratio_gz, original_size

In [20]:
def process_fasta_files(csv_filename="compression_ratios_random.csv"):
    """Process different DNA sequences, calculate compression ratios, and save results to a single CSV file."""
    sequence_lengths = [100, 1000, 10000, 100000, int(1E6), int(1E7)]

    # Open the CSV file for writing
    with open(csv_filename, 'w', newline='') as csvfile:
        fieldnames = ["sequence_type", "sequence_length", "filename", "compression_ratio_geco3", "compression_ratio_gz", "original_size"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # Write the header
        writer.writeheader()

        # Dictionary of sequence generation functions and their corresponding names
        generation_methods = {
            "random": generate_random_dna_sequence,
            "motif_based": generate_motif_based_sequence,
            "biased": generate_biased_sequence,
            "markov": generate_markov_sequence,
            "repetitive": generate_repetitive_sequence,
            "oneletter": generate_oneletter_sequence
        }

        # Parameters for specific generation methods
        motif = "ACGT"
        repeat_probability = 0.5
        nucleotide_bias = [0.1, 0.2, 0.6, 0.1]
        transition_matrix = {
            'A': [0.1, 0.4, 0.4, 0.1],
            'C': [0.1, 0.1, 0.7, 0.1],
            'G': [0.7, 0.1, 0.1, 0.1],
            'T': [0.25, 0.25, 0.25, 0.25]
        }

        # Iterate over each sequence generation method
        for sequence_type, generation_function in generation_methods.items():
            for length in sequence_lengths:
                # Generate the sequence
                if sequence_type == "motif_based":
                    sequence = generation_function(length, motif, repeat_probability)
                elif sequence_type == "biased":
                    sequence = generation_function(length, nucleotide_bias)
                elif sequence_type == "markov":
                    sequence = generation_function(length, transition_matrix)
                elif sequence_type == "repetitive":
                    sequence = generation_function(length, pattern="ATCG")
                elif sequence_type == "oneletter":
                    sequence = generation_function(length)
                else:
                    sequence = generation_function(length)
                
                # Create the FASTA file
                fasta_filename = create_fasta_file(sequence, sequence_length=length, sequence_type=sequence_type)
                print(f"Processing file: {fasta_filename} with length {length}")
                
                # Calculate compression ratios
                ratio, ratio_gz, original_size = calculate_compression_ratio(fasta_filename)
                print(f"GECO3 compression ratio: {ratio:.2f}%")
                print(f"GZIP compression ratio: {ratio_gz:.2f}%")
                
                # Write the result to the CSV file
                writer.writerow({
                    "sequence_type": sequence_type,
                    "sequence_length": length,
                    "filename": fasta_filename,
                    "compression_ratio_geco3": ratio,
                    "compression_ratio_gz": ratio_gz,
                    "original_size": original_size
                })
    
    print(f"Results saved to {csv_filename}")

# Run the processing function and save results to a CSV file
process_fasta_files(csv_filename="compression_ratios_random.csv")

Processing file: mock_sequence_random_length_100.fasta with length 100
GECO3 compression ratio: 96.00%
GZIP compression ratio: 107.00%
Processing file: mock_sequence_random_length_1000.fasta with length 1000
GECO3 compression ratio: 32.80%
GZIP compression ratio: 41.20%
Processing file: mock_sequence_random_length_10000.fasta with length 10000
GECO3 compression ratio: 26.05%
GZIP compression ratio: 32.59%
Processing file: mock_sequence_random_length_100000.fasta with length 100000
GECO3 compression ratio: 25.16%
GZIP compression ratio: 29.70%
Processing file: mock_sequence_random_length_1000000.fasta with length 1000000
GECO3 compression ratio: 25.06%
GZIP compression ratio: 28.69%
Processing file: mock_sequence_random_length_10000000.fasta with length 10000000
GECO3 compression ratio: 25.05%
GZIP compression ratio: 28.59%
Processing file: mock_sequence_motif_based_length_100.fasta with length 100
GECO3 compression ratio: 45.31%
GZIP compression ratio: 49.61%
Processing file: mock_sequ