<a href="https://colab.research.google.com/github/mbjallow6/Algorithms-python/blob/main/Boinformatic_Problems_Rosalind.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import data from the computer
from google.colab import files
uploaded = files.upload()

Saving rosalind_lexf.txt to rosalind_lexf.txt


## Enumerating k-mers Lexicographically

In [2]:
"""
Rosalind Enumerating k-mers Lexicographically Problem Solution

This module generates all possible strings of length n from a given ordered alphabet
in lexicographic order. This is useful for cataloguing genetic strings and creating
systematic orderings of sequence data.

The problem generates the Cartesian product of an alphabet with itself n times,
producing alphabet^n total strings in lexicographic order.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Iterator, Tuple
import itertools
import math


class LexicographicStringGenerator:
    """
    A class to generate strings of specified length from an ordered alphabet
    in lexicographic order.
    """

    def __init__(self):
        """Initialize the lexicographic string generator."""
        pass

    @staticmethod
    def validate_alphabet(alphabet: List[str]) -> bool:
        """
        Validate the input alphabet.

        Args:
            alphabet (List[str]): List of alphabet symbols

        Returns:
            bool: True if alphabet is valid, False otherwise
        """
        if not alphabet:
            return False

        # Check for duplicates
        if len(alphabet) != len(set(alphabet)):
            return False

        # Check constraint: at most 10 symbols
        if len(alphabet) > 10:
            return False

        # Check that all symbols are single characters (optional but good practice)
        return all(len(symbol) == 1 for symbol in alphabet)

    @staticmethod
    def validate_n(n: int, max_alphabet_size: int) -> bool:
        """
        Validate the string length parameter.

        Args:
            n (int): Desired string length
            max_alphabet_size (int): Size of alphabet for complexity estimation

        Returns:
            bool: True if n is valid, False otherwise
        """
        if not isinstance(n, int) or n <= 0:
            return False

        # Check constraint: n ≤ 10
        if n > 10:
            return False

        # Optional: Check if the total number of strings would be manageable
        total_strings = max_alphabet_size ** n
        if total_strings > 10**6:  # Reasonable limit for memory/time
            return False

        return True

    def calculate_total_strings(self, alphabet_size: int, n: int) -> int:
        """
        Calculate the total number of strings that will be generated.

        Args:
            alphabet_size (int): Size of the alphabet
            n (int): Length of strings

        Returns:
            int: Total number of strings (alphabet_size^n)
        """
        return alphabet_size ** n

    def generate_strings_iterator(self, alphabet: List[str], n: int) -> Iterator[str]:
        """
        Generate all strings of length n from alphabet in lexicographic order.
        Uses iterator for memory efficiency.

        Args:
            alphabet (List[str]): Ordered alphabet symbols
            n (int): Length of strings to generate

        Yields:
            str: Each generated string in lexicographic order

        Raises:
            ValueError: If alphabet or n is invalid
        """
        # Validate inputs
        if not self.validate_alphabet(alphabet):
            raise ValueError("Invalid alphabet: must be non-empty, unique symbols, ≤10 characters")

        if not self.validate_n(n, len(alphabet)):
            raise ValueError("Invalid n: must be positive integer ≤10")

        # Generate all combinations using Cartesian product
        for combination in itertools.product(alphabet, repeat=n):
            yield ''.join(combination)

    def generate_all_strings(self, alphabet: List[str], n: int) -> List[str]:
        """
        Generate all strings and return as a list.

        Args:
            alphabet (List[str]): Ordered alphabet symbols
            n (int): Length of strings to generate

        Returns:
            List[str]: All generated strings in lexicographic order

        Raises:
            ValueError: If alphabet or n is invalid
        """
        return list(self.generate_strings_iterator(alphabet, n))

    def get_generation_info(self, alphabet: List[str], n: int) -> dict:
        """
        Get information about the string generation process.

        Args:
            alphabet (List[str]): Ordered alphabet symbols
            n (int): Length of strings to generate

        Returns:
            dict: Information about the generation process
        """
        alphabet_size = len(alphabet)
        total_strings = self.calculate_total_strings(alphabet_size, n)

        return {
            'alphabet': alphabet,
            'alphabet_size': alphabet_size,
            'string_length': n,
            'total_strings': total_strings,
            'memory_estimate_mb': (total_strings * n * 4) / (1024 * 1024),  # Rough estimate
            'is_manageable': total_strings <= 10**5
        }


def parse_input_file(file_path: str) -> Tuple[List[str], int]:
    """
    Parse input file to extract alphabet and n.

    Args:
        file_path (str): Path to input file

    Returns:
        Tuple[List[str], int]: (alphabet, n)

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.read().strip().split('\n')

        if len(lines) < 2:
            raise ValueError("Input file must contain at least 2 lines")

        # Parse alphabet (first line)
        alphabet = lines[0].strip().split()
        if not alphabet:
            raise ValueError("First line must contain alphabet symbols")

        # Parse n (second line)
        try:
            n = int(lines[1].strip())
        except ValueError:
            raise ValueError(f"Second line must be an integer, got: '{lines[1]}'")

        return alphabet, n

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def write_output_file(output_path: str, strings: Iterator[str]) -> None:
    """
    Write generated strings to output file.

    Args:
        output_path (str): Path to output file
        strings (Iterator[str]): Iterator of strings to write
    """
    try:
        with open(output_path, 'w') as file:
            for string in strings:
                file.write(string + '\n')
    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_lexicographic_strings_problem(input_file_path: str) -> Iterator[str]:
    """
    Solve the Lexicographic Strings problem for a given input file.

    Args:
        input_file_path (str): Path to input file

    Returns:
        Iterator[str]: Iterator of generated strings

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse input
        alphabet, n = parse_input_file(input_file_path)

        # Initialize generator
        generator = LexicographicStringGenerator()

        # Generate strings
        return generator.generate_strings_iterator(alphabet, n)

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the Lexicographic Strings problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "rosalind_lexf.txt"  # Change this to your input file name
    output_file = "output_lexf.txt"

    try:
        print("Solving Lexicographic Strings Problem...")

        # Parse input first to get info
        alphabet, n = parse_input_file(input_file)

        # Get generation info
        generator = LexicographicStringGenerator()
        info = generator.get_generation_info(alphabet, n)

        print(f"\nInput Information:")
        print(f"Alphabet: {info['alphabet']}")
        print(f"Alphabet size: {info['alphabet_size']}")
        print(f"String length: {info['string_length']}")
        print(f"Total strings to generate: {info['total_strings']:,}")
        print(f"Estimated memory usage: {info['memory_estimate_mb']:.2f} MB")

        if not info['is_manageable']:
            print("\nWarning: Large number of strings. Consider using iterator approach.")

        # Generate strings using iterator for memory efficiency
        strings_iterator = generator.generate_strings_iterator(alphabet, n)

        # Display first few results
        print(f"\nFirst few generated strings:")
        preview_count = min(10, info['total_strings'])
        preview_strings = []

        for i, string in enumerate(strings_iterator):
            if i < preview_count:
                preview_strings.append(string)
                print(string)
            else:
                break

        if info['total_strings'] > preview_count:
            print("...")
            print(f"(and {info['total_strings'] - preview_count:,} more)")

        # Write all strings to output file (regenerate iterator)
        strings_iterator = generator.generate_strings_iterator(alphabet, n)
        write_output_file(output_file, strings_iterator)
        print(f"\nAll strings written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data.
    """
    print("=== Demo with Sample Data ===")

    # Sample input
    sample_alphabet = ['A', 'C', 'G', 'T']
    sample_n = 2

    print(f"Input:")
    print(f"Alphabet: {sample_alphabet}")
    print(f"String length: {sample_n}")

    generator = LexicographicStringGenerator()

    # Get info
    info = generator.get_generation_info(sample_alphabet, sample_n)
    print(f"\nGeneration Info:")
    print(f"Total strings: {info['total_strings']}")
    print(f"Expected: {len(sample_alphabet)**sample_n}")

    # Generate and display all strings
    print(f"\nGenerated strings:")
    strings = generator.generate_all_strings(sample_alphabet, sample_n)

    for string in strings:
        print(string)

    # Verify count
    print(f"\nVerification:")
    print(f"Generated {len(strings)} strings")
    print(f"Expected {info['total_strings']} strings")
    print(f"Match: {'✓' if len(strings) == info['total_strings'] else '✗'}")

    # Check lexicographic ordering
    is_sorted = strings == sorted(strings)
    print(f"Lexicographically ordered: {'✓' if is_sorted else '✗'}")


def test_edge_cases():
    """
    Test various edge cases and constraints.
    """
    print("=== Testing Edge Cases ===")

    generator = LexicographicStringGenerator()

    test_cases = [
        (['A'], 1, "Single symbol, length 1"),
        (['A'], 3, "Single symbol, length 3"),
        (['A', 'B'], 1, "Two symbols, length 1"),
        (['X', 'Y', 'Z'], 2, "Three symbols, length 2"),
        (['A', 'C', 'G', 'T'], 1, "DNA alphabet, length 1"),
    ]

    for alphabet, n, description in test_cases:
        try:
            info = generator.get_generation_info(alphabet, n)
            strings = generator.generate_all_strings(alphabet, n)
            print(f"\n{description}:")
            print(f"  Input: alphabet={alphabet}, n={n}")
            print(f"  Output: {len(strings)} strings")
            print(f"  First few: {strings[:min(5, len(strings))]}")
            if len(strings) > 5:
                print(f"  Last few: {strings[-min(3, len(strings)):]}")
        except Exception as e:
            print(f"\n{description}: Error - {e}")


# Example usage and testing
if __name__ == "__main__":
    # Run demo with sample data
    demo_with_sample()

    print("\n" + "="*60)

    # Test edge cases
    test_edge_cases()

    print("\n" + "="*60)

    # Test algorithm efficiency
    print("=== Algorithm Efficiency Test ===")
    generator = LexicographicStringGenerator()

    # Test with larger alphabets
    efficiency_tests = [
        (['A', 'B', 'C'], 4),
        (['A', 'C', 'G', 'T'], 3),
        (['A', 'B', 'C', 'D', 'E'], 3),
    ]

    for alphabet, n in efficiency_tests:
        info = generator.get_generation_info(alphabet, n)
        print(f"Alphabet size {len(alphabet)}, length {n}: {info['total_strings']:,} strings")

    print("\n" + "="*60)

    # Uncomment to run with actual file input
    # main()


=== Demo with Sample Data ===
Input:
Alphabet: ['A', 'C', 'G', 'T']
String length: 2

Generation Info:
Total strings: 16
Expected: 16

Generated strings:
AA
AC
AG
AT
CA
CC
CG
CT
GA
GC
GG
GT
TA
TC
TG
TT

Verification:
Generated 16 strings
Expected 16 strings
Match: ✓
Lexicographically ordered: ✓

=== Testing Edge Cases ===

Single symbol, length 1:
  Input: alphabet=['A'], n=1
  Output: 1 strings
  First few: ['A']

Single symbol, length 3:
  Input: alphabet=['A'], n=3
  Output: 1 strings
  First few: ['AAA']

Two symbols, length 1:
  Input: alphabet=['A', 'B'], n=1
  Output: 2 strings
  First few: ['A', 'B']

Three symbols, length 2:
  Input: alphabet=['X', 'Y', 'Z'], n=2
  Output: 9 strings
  First few: ['XX', 'XY', 'XZ', 'YX', 'YY']
  Last few: ['ZX', 'ZY', 'ZZ']

DNA alphabet, length 1:
  Input: alphabet=['A', 'C', 'G', 'T'], n=1
  Output: 4 strings
  First few: ['A', 'C', 'G', 'T']

=== Algorithm Efficiency Test ===
Alphabet size 3, length 4: 81 strings
Alphabet size 4, length 3: 6

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## RNA Splicing Problem Solution

In [None]:
"""
Rosalind RNA Splicing Problem Solution

This module removes introns from DNA sequences, then transcribes and translates
the remaining exons to produce a protein string. This simulates the process of
RNA splicing where introns are removed and exons are concatenated before translation.

The problem involves:
1. Removing intron sequences from the main DNA string
2. Transcribing the resulting DNA to RNA (T -> U)
3. Translating the RNA to protein using the genetic code

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Tuple, Optional
import re


class RNASplicer:
    """
    A class to handle RNA splicing operations including intron removal,
    transcription, and translation.
    """

    # Standard RNA codon table
    RNA_CODON_TABLE = {
        'UUU': 'F', 'UUC': 'F', 'UUA': 'L', 'UUG': 'L',
        'UCU': 'S', 'UCC': 'S', 'UCA': 'S', 'UCG': 'S',
        'UAU': 'Y', 'UAC': 'Y', 'UAA': '*', 'UAG': '*',
        'UGU': 'C', 'UGC': 'C', 'UGA': '*', 'UGG': 'W',
        'CUU': 'L', 'CUC': 'L', 'CUA': 'L', 'CUG': 'L',
        'CCU': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAU': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGU': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'AUU': 'I', 'AUC': 'I', 'AUA': 'I', 'AUG': 'M',
        'ACU': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAU': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGU': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GUU': 'V', 'GUC': 'V', 'GUA': 'V', 'GUG': 'V',
        'GCU': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAU': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGU': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
    }

    def __init__(self):
        """Initialize the RNA splicer."""
        pass

    @staticmethod
    def validate_dna_sequence(sequence: str) -> bool:
        """
        Validate that a sequence contains only valid DNA bases.

        Args:
            sequence (str): DNA sequence to validate

        Returns:
            bool: True if valid, False otherwise
        """
        return bool(re.match(r'^[ATGC]*$', sequence.upper()))

    def remove_introns(self, dna_sequence: str, introns: List[str]) -> str:
        """
        Remove all intron sequences from the main DNA sequence.

        Args:
            dna_sequence (str): Main DNA sequence
            introns (List[str]): List of intron sequences to remove

        Returns:
            str: DNA sequence with introns removed (exons only)

        Raises:
            ValueError: If DNA sequence or introns contain invalid bases
        """
        # Validate input sequences
        if not self.validate_dna_sequence(dna_sequence):
            raise ValueError("Main DNA sequence contains invalid bases")

        for intron in introns:
            if not self.validate_dna_sequence(intron):
                raise ValueError(f"Intron sequence contains invalid bases: {intron}")

        # Start with the original sequence
        result = dna_sequence.upper()

        # Remove each intron from the sequence
        for intron in introns:
            intron_upper = intron.upper()
            # Remove all occurrences of this intron
            while intron_upper in result:
                result = result.replace(intron_upper, '', 1)  # Remove one occurrence at a time

        return result

    @staticmethod
    def transcribe_dna_to_rna(dna_sequence: str) -> str:
        """
        Transcribe DNA sequence to RNA by replacing T with U.

        Args:
            dna_sequence (str): DNA sequence

        Returns:
            str: RNA sequence
        """
        return dna_sequence.upper().replace('T', 'U')

    def translate_rna_to_protein(self, rna_sequence: str) -> str:
        """
        Translate RNA sequence to protein using the genetic code.

        Args:
            rna_sequence (str): RNA sequence

        Returns:
            str: Protein sequence
        """
        protein = []

        # Translate codons one by one
        for i in range(0, len(rna_sequence) - 2, 3):
            codon = rna_sequence[i:i+3]

            # Skip incomplete codons
            if len(codon) != 3:
                break

            # Get amino acid for this codon
            amino_acid = self.RNA_CODON_TABLE.get(codon.upper())

            if amino_acid is None:
                # Invalid codon - skip it
                continue
            elif amino_acid == '*':
                # Stop codon - end translation
                break
            else:
                protein.append(amino_acid)

        return ''.join(protein)

    def process_rna_splicing(self, dna_sequence: str, introns: List[str]) -> str:
        """
        Complete RNA splicing process: remove introns, transcribe, and translate.

        Args:
            dna_sequence (str): Main DNA sequence
            introns (List[str]): List of intron sequences

        Returns:
            str: Final protein sequence
        """
        # Step 1: Remove introns to get exons
        exons = self.remove_introns(dna_sequence, introns)

        # Step 2: Transcribe DNA to RNA
        rna = self.transcribe_dna_to_rna(exons)

        # Step 3: Translate RNA to protein
        protein = self.translate_rna_to_protein(rna)

        return protein

    def get_detailed_analysis(self, dna_sequence: str, introns: List[str]) -> dict:
        """
        Get detailed step-by-step analysis of the RNA splicing process.

        Args:
            dna_sequence (str): Main DNA sequence
            introns (List[str]): List of intron sequences

        Returns:
            dict: Detailed analysis of each step
        """
        analysis = {}

        # Original DNA
        analysis['original_dna'] = dna_sequence.upper()
        analysis['original_length'] = len(dna_sequence)

        # Introns
        analysis['introns'] = [intron.upper() for intron in introns]
        analysis['num_introns'] = len(introns)

        # After intron removal
        exons = self.remove_introns(dna_sequence, introns)
        analysis['exons'] = exons
        analysis['exons_length'] = len(exons)
        analysis['removed_bases'] = len(dna_sequence) - len(exons)

        # RNA transcription
        rna = self.transcribe_dna_to_rna(exons)
        analysis['rna'] = rna

        # Protein translation
        protein = self.translate_rna_to_protein(rna)
        analysis['protein'] = protein
        analysis['protein_length'] = len(protein)

        return analysis


def parse_fasta_file(file_path: str) -> List[Tuple[str, str]]:
    """
    Parse FASTA file to extract all sequences.

    Args:
        file_path (str): Path to FASTA file

    Returns:
        List[Tuple[str, str]]: List of (header, sequence) tuples

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read().strip()

        return parse_fasta_string(content)

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def parse_fasta_string(fasta_content: str) -> List[Tuple[str, str]]:
    """
    Parse FASTA format string to extract sequences.

    Args:
        fasta_content (str): FASTA format content

    Returns:
        List[Tuple[str, str]]: List of (header, sequence) tuples
    """
    sequences = []
    header = None
    seq_lines = []

    for line in fasta_content.strip().split('\n'):
        line = line.strip()
        if line.startswith('>'):
            # Save previous sequence if exists
            if header is not None:
                sequences.append((header, ''.join(seq_lines)))
            # Start new sequence
            header = line[1:]  # Remove '>' character
            seq_lines = []
        else:
            seq_lines.append(line)

    # Add the last sequence
    if header is not None:
        sequences.append((header, ''.join(seq_lines)))

    if not sequences:
        raise ValueError("No valid FASTA sequences found")

    return sequences


def write_output_file(output_path: str, protein: str) -> None:
    """
    Write protein sequence to output file.

    Args:
        output_path (str): Path to output file
        protein (str): Protein sequence
    """
    try:
        with open(output_path, 'w') as file:
            file.write(protein + '\n')
    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_rna_splicing_problem(input_file_path: str) -> str:
    """
    Solve the RNA Splicing problem for a given input file.

    Args:
        input_file_path (str): Path to input FASTA file

    Returns:
        str: Final protein sequence

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse FASTA file
        sequences = parse_fasta_file(input_file_path)

        if len(sequences) < 1:
            raise ValueError("At least one DNA sequence is required")

        # First sequence is the main DNA string
        main_dna = sequences[0][1]

        # Remaining sequences are introns
        introns = [seq for _, seq in sequences[1:]]

        # Validate length constraint
        if len(main_dna) > 1000:
            raise ValueError(f"DNA sequence length {len(main_dna)} exceeds maximum of 1000 bp")

        # Initialize splicer
        splicer = RNASplicer()

        # Process RNA splicing
        protein = splicer.process_rna_splicing(main_dna, introns)

        return protein

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the RNA Splicing problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "rosalind_splc.txt"  # Change this to your input file name
    output_file = "output_splc.txt"

    try:
        print("Solving RNA Splicing Problem...")

        # Solve the problem
        protein = solve_rna_splicing_problem(input_file)

        # Display results
        print(f"\nResult:")
        print(f"Final protein sequence: {protein}")
        print(f"Protein length: {len(protein)} amino acids")

        # Write to output file
        write_output_file(output_file, protein)
        print(f"\nResult written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data from the search results.
    """
    print("=== Demo with Sample Data ===")

    # Sample FASTA content from the problem
    sample_fasta = """>Rosalind_10
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
>Rosalind_12
ATCGGTCGAA
>Rosalind_15
ATCGGTCGAGCGTGT"""

    # Parse sequences
    sequences = parse_fasta_string(sample_fasta)
    print(f"Parsed {len(sequences)} sequences:")
    for i, (header, seq) in enumerate(sequences):
        print(f"  {i+1}. {header}: {len(seq)} bp")

    # Extract main DNA and introns
    main_dna = sequences[0][1]
    introns = [seq for _, seq in sequences[1:]]

    print(f"\nMain DNA sequence ({len(main_dna)} bp):")
    print(f"{main_dna}")
    print(f"\nIntrons ({len(introns)} total):")
    for i, intron in enumerate(introns, 1):
        print(f"  {i}. {intron} ({len(intron)} bp)")

    # Process with detailed analysis
    splicer = RNASplicer()
    analysis = splicer.get_detailed_analysis(main_dna, introns)

    print(f"\n=== Step-by-Step Analysis ===")
    print(f"Original DNA: {analysis['original_dna']}")
    print(f"After removing introns: {analysis['exons']}")
    print(f"RNA: {analysis['rna']}")
    print(f"Protein: {analysis['protein']}")

    print(f"\n=== Summary ===")
    print(f"Original length: {analysis['original_length']} bp")
    print(f"Removed bases: {analysis['removed_bases']} bp")
    print(f"Final exons: {analysis['exons_length']} bp")
    print(f"Final protein: {analysis['protein_length']} amino acids")

    # Expected output
    print(f"\nExpected output: MVYIADKQHVASREAYGHMFKVCA")
    print(f"Our result:      {analysis['protein']}")
    print(f"Match: {'✓' if analysis['protein'] == 'MVYIADKQHVASREAYGHMFKVCA' else '✗'}")


# Example usage and testing
if __name__ == "__main__":
    # Run demo with sample data
    demo_with_sample()

    print("\n" + "="*60)

    # Test edge cases
    print("=== Testing Edge Cases ===")
    splicer = RNASplicer()

    # Test with no introns
    test_dna = "ATGAAATTCTAG"  # Simple gene: ATG AAA TTC TAG
    test_protein = splicer.process_rna_splicing(test_dna, [])
    print(f"No introns: {test_dna} -> {test_protein}")

    # Test with overlapping removal
    test_dna2 = "ATGAAACCCTTTGGG"
    test_introns = ["AAA", "TTT"]
    test_protein2 = splicer.process_rna_splicing(test_dna2, test_introns)
    print(f"With introns: {test_dna2} -> {test_protein2}")

    print("\n" + "="*60)

    # Uncomment to run with actual file input
    main()


=== Demo with Sample Data ===
Parsed 3 sequences:
  1. Rosalind_10: 100 bp
  2. Rosalind_12: 10 bp
  3. Rosalind_15: 15 bp

Main DNA sequence (100 bp):
ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG

Introns (2 total):
  1. ATCGGTCGAA (10 bp)
  2. ATCGGTCGAGCGTGT (15 bp)

=== Step-by-Step Analysis ===
Original DNA: ATGGTCTACATAGCTGACAAACAGCACGTAGCAATCGGTCGAATCTCGAGAGGCATATGGTCACATGATCGGTCGAGCGTGTTTCAAAGTTTGCGCCTAG
After removing introns: ATGGTCTACATAGCTGACAAACAGCACGTAGCATCTCGAGAGGCATATGGTCACATGTTCAAAGTTTGCGCCTAG
RNA: AUGGUCUACAUAGCUGACAAACAGCACGUAGCAUCUCGAGAGGCAUAUGGUCACAUGUUCAAAGUUUGCGCCUAG
Protein: MVYIADKQHVASREAYGHMFKVCA

=== Summary ===
Original length: 100 bp
Removed bases: 25 bp
Final exons: 75 bp
Final protein: 24 amino acids

Expected output: MVYIADKQHVASREAYGHMFKVCA
Our result:      MVYIADKQHVASREAYGHMFKVCA
Match: ✓

=== Testing Edge Cases ===
No introns: ATGAAATTCTAG -> MKF
With introns: ATGAAACCCTTTGGG -> MPG

Solving RN

## Locating Restriction Sites Problem Solution

In [None]:
"""
Rosalind Locating Restriction Sites Problem Solution

This module finds reverse palindromes in DNA sequences. A reverse palindrome
is a DNA string that equals its reverse complement, which are recognition sites
for restriction enzymes used by bacteria to defend against phages.

The problem asks for all reverse palindromes of length 4-12 with their positions.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Tuple, Optional
import re


class RestrictionSiteFinder:
    """
    A class to find restriction sites (reverse palindromes) in DNA sequences.
    """

    # DNA complement mapping
    COMPLEMENT_MAP = {
        'A': 'T',
        'T': 'A',
        'G': 'C',
        'C': 'G'
    }

    def __init__(self):
        """Initialize the restriction site finder."""
        pass

    @staticmethod
    def get_complement(dna_sequence: str) -> str:
        """
        Get the complement of a DNA sequence.

        Args:
            dna_sequence (str): Input DNA sequence

        Returns:
            str: Complement sequence

        Raises:
            ValueError: If sequence contains invalid bases
        """
        try:
            return ''.join(RestrictionSiteFinder.COMPLEMENT_MAP[base.upper()]
                          for base in dna_sequence)
        except KeyError as e:
            raise ValueError(f"Invalid DNA base found: {e}")

    @staticmethod
    def get_reverse_complement(dna_sequence: str) -> str:
        """
        Get the reverse complement of a DNA sequence.

        Args:
            dna_sequence (str): Input DNA sequence

        Returns:
            str: Reverse complement sequence
        """
        complement = RestrictionSiteFinder.get_complement(dna_sequence)
        return complement[::-1]  # Reverse the complement

    @staticmethod
    def is_reverse_palindrome(dna_sequence: str) -> bool:
        """
        Check if a DNA sequence is a reverse palindrome.

        Args:
            dna_sequence (str): DNA sequence to check

        Returns:
            bool: True if sequence equals its reverse complement
        """
        try:
            reverse_comp = RestrictionSiteFinder.get_reverse_complement(dna_sequence)
            return dna_sequence.upper() == reverse_comp.upper()
        except ValueError:
            return False

    def find_restriction_sites(self, dna_sequence: str,
                             min_length: int = 4,
                             max_length: int = 12) -> List[Tuple[int, int]]:
        """
        Find all restriction sites (reverse palindromes) in a DNA sequence.

        Args:
            dna_sequence (str): Input DNA sequence
            min_length (int): Minimum palindrome length (default: 4)
            max_length (int): Maximum palindrome length (default: 12)

        Returns:
            List[Tuple[int, int]]: List of (position, length) tuples (1-indexed positions)
        """
        # Clean and validate sequence
        sequence = dna_sequence.strip().upper()
        if not all(base in 'ATGC' for base in sequence):
            raise ValueError("DNA sequence contains invalid bases")

        restriction_sites = []
        seq_length = len(sequence)

        # Check all possible positions and lengths
        for position in range(seq_length):
            for length in range(min_length, min(max_length + 1, seq_length - position + 1)):
                # Extract substring
                substring = sequence[position:position + length]

                # Check if it's a reverse palindrome
                if self.is_reverse_palindrome(substring):
                    # Add to results (convert to 1-indexed position)
                    restriction_sites.append((position + 1, length))

        return restriction_sites

    def analyze_palindrome(self, dna_sequence: str) -> dict:
        """
        Provide detailed analysis of a palindromic sequence.

        Args:
            dna_sequence (str): DNA sequence to analyze

        Returns:
            dict: Analysis results including complement, reverse complement, etc.
        """
        sequence = dna_sequence.strip().upper()

        analysis = {
            'sequence': sequence,
            'length': len(sequence),
            'complement': self.get_complement(sequence),
            'reverse': sequence[::-1],
            'reverse_complement': self.get_reverse_complement(sequence),
            'is_reverse_palindrome': self.is_reverse_palindrome(sequence)
        }

        return analysis


def parse_fasta_file(file_path: str) -> Tuple[str, str]:
    """
    Parse FASTA file to extract header and DNA sequence.

    Args:
        file_path (str): Path to FASTA file

    Returns:
        Tuple[str, str]: (header, dna_sequence)

    Raises:
        FileNotFoundError: If file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read().strip()

        lines = content.split('\n')
        if not lines or not lines[0].startswith('>'):
            raise ValueError("Invalid FASTA format: missing header")

        header = lines[0][1:]  # Remove '>' character
        sequence = ''.join(line.strip() for line in lines[1:] if not line.startswith('>'))

        if not sequence:
            raise ValueError("Invalid FASTA format: no sequence found")

        # Validate sequence length constraint
        if len(sequence) > 1000:
            raise ValueError(f"Sequence length {len(sequence)} exceeds maximum of 1000 bp")

        return header, sequence

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def write_output_file(output_path: str, restriction_sites: List[Tuple[int, int]]) -> None:
    """
    Write restriction sites to output file.

    Args:
        output_path (str): Path to output file
        restriction_sites (List[Tuple[int, int]]): List of (position, length) pairs
    """
    try:
        with open(output_path, 'w') as file:
            for position, length in restriction_sites:
                file.write(f"{position} {length}\n")
    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_restriction_sites_problem(input_file_path: str) -> List[Tuple[int, int]]:
    """
    Solve the Restriction Sites problem for a given input file.

    Args:
        input_file_path (str): Path to input FASTA file

    Returns:
        List[Tuple[int, int]]: List of (position, length) for each restriction site

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse FASTA file
        header, dna_sequence = parse_fasta_file(input_file_path)

        # Initialize finder
        finder = RestrictionSiteFinder()

        # Find all restriction sites
        restriction_sites = finder.find_restriction_sites(dna_sequence)

        return restriction_sites

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the Restriction Sites problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "/content/rosalind_revp.txt"  # Change this to your input file name
    output_file = "output_revp.txt"

    try:
        print("Solving Restriction Sites Problem...")

        # Solve the problem
        restriction_sites = solve_restriction_sites_problem(input_file)

        # Display results
        print(f"\nFound {len(restriction_sites)} restriction sites:")
        print("Position Length")
        print("-" * 15)
        for position, length in restriction_sites:
            print(f"{position:>8} {length:>6}")

        # Write to output file
        write_output_file(output_file, restriction_sites)
        print(f"\nResults written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data.
    """
    print("=== Demo with Sample Data ===")

    # Sample data from problem
    sample_header = "Rosalind_24"
    sample_dna = "TCAATGCATGCGGGTCTATATGCAT"

    print(f"Header: {sample_header}")
    print(f"DNA Sequence: {sample_dna}")
    print(f"Length: {len(sample_dna)} bp")

    # Find restriction sites
    finder = RestrictionSiteFinder()
    sites = finder.find_restriction_sites(sample_dna)

    print(f"\nFound restriction sites:")
    print("Position Length Sequence")
    print("-" * 25)

    for position, length in sites:
        # Extract the palindromic sequence (convert to 0-indexed for slicing)
        sequence = sample_dna[position-1:position-1+length]
        print(f"{position:>8} {length:>6} {sequence}")

    # Verify a few examples
    print(f"\n=== Verification ===")
    verification_cases = [
        (4, 6),  # Position 4, length 6
        (5, 4),  # Position 5, length 4
    ]

    for pos, length in verification_cases:
        sequence = sample_dna[pos-1:pos-1+length]
        analysis = finder.analyze_palindrome(sequence)

        print(f"\nPosition {pos}, Length {length}:")
        print(f"  Sequence: {analysis['sequence']}")
        print(f"  Reverse complement: {analysis['reverse_complement']}")
        print(f"  Is palindrome: {analysis['is_reverse_palindrome']}")


def test_individual_cases():
    """
    Test individual palindrome cases for educational purposes.
    """
    print("=== Testing Individual Cases ===")

    finder = RestrictionSiteFinder()

    test_sequences = [
        "GCATGC",    # Classic example from problem description
        "ATAT",      # Simple palindrome
        "GAATTC",    # EcoRI recognition site
        "GGATCC",    # BamHI recognition site
        "ATCGAT",    # Another common palindrome
        "ABCD",      # Invalid sequence (should fail)
    ]

    for seq in test_sequences:
        try:
            analysis = finder.analyze_palindrome(seq)
            print(f"\nSequence: {seq}")
            print(f"  Complement: {analysis['complement']}")
            print(f"  Reverse complement: {analysis['reverse_complement']}")
            print(f"  Is reverse palindrome: {analysis['is_reverse_palindrome']}")
        except Exception as e:
            print(f"\nSequence: {seq} -> Error: {e}")


# Example usage and testing
if __name__ == "__main__":
    # Run demo with sample data
    demo_with_sample()

    print("\n" + "="*60)

    # Test individual cases
    test_individual_cases()

    print("\n" + "="*60)

    # Uncomment to run with actual file input
    main()


=== Demo with Sample Data ===
Header: Rosalind_24
DNA Sequence: TCAATGCATGCGGGTCTATATGCAT
Length: 25 bp

Found restriction sites:
Position Length Sequence
-------------------------
       4      6 ATGCAT
       5      4 TGCA
       6      6 GCATGC
       7      4 CATG
      17      4 TATA
      18      4 ATAT
      20      6 ATGCAT
      21      4 TGCA

=== Verification ===

Position 4, Length 6:
  Sequence: ATGCAT
  Reverse complement: ATGCAT
  Is palindrome: True

Position 5, Length 4:
  Sequence: TGCA
  Reverse complement: TGCA
  Is palindrome: True

=== Testing Individual Cases ===

Sequence: GCATGC
  Complement: CGTACG
  Reverse complement: GCATGC
  Is reverse palindrome: True

Sequence: ATAT
  Complement: TATA
  Reverse complement: ATAT
  Is reverse palindrome: True

Sequence: GAATTC
  Complement: CTTAAG
  Reverse complement: GAATTC
  Is reverse palindrome: True

Sequence: GGATCC
  Complement: CCTAGG
  Reverse complement: GGATCC
  Is reverse palindrome: True

Sequence: ATCGAT
  C

## Rosalind Calculating Protein Mass Problem Solution

In [None]:
"""
Rosalind Calculating Protein Mass Problem Solution

This module calculates the total monoisotopic mass of a protein string by summing
the monoisotopic masses of its constituent amino acids.

In mass spectrometry, the monoisotopic mass uses the principal (most abundant)
isotope of each atom. For peptides excised from the middle of proteins, we sum
the residue masses without adding water molecule mass.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import Dict, Optional
import re


class ProteinMassCalculator:
    """
    A class to handle protein mass calculations using monoisotopic masses.
    """

    # Monoisotopic mass table for amino acids (in Daltons)
    # Source: Standard biochemistry references
    MONOISOTOPIC_MASSES = {
        'A': 71.03711,   # Alanine
        'C': 103.00919,  # Cysteine
        'D': 115.02694,  # Aspartic acid
        'E': 129.04259,  # Glutamic acid
        'F': 147.06841,  # Phenylalanine
        'G': 57.02146,   # Glycine
        'H': 137.05891,  # Histidine
        'I': 113.08406,  # Isoleucine
        'K': 128.09496,  # Lysine
        'L': 113.08406,  # Leucine
        'M': 131.04049,  # Methionine
        'N': 114.04293,  # Asparagine
        'P': 97.05276,   # Proline
        'Q': 128.05858,  # Glutamine
        'R': 156.10111,  # Arginine
        'S': 87.03203,   # Serine
        'T': 101.04768,  # Threonine
        'V': 99.06841,   # Valine
        'W': 186.07931,  # Tryptophan
        'Y': 163.06333   # Tyrosine
    }

    # Water molecule monoisotopic mass (for reference, not used in this problem)
    WATER_MASS = 18.01056

    def __init__(self):
        """Initialize the protein mass calculator."""
        pass

    @staticmethod
    def validate_protein_string(protein: str) -> bool:
        """
        Validate that the protein string contains only valid amino acid codes.

        Args:
            protein (str): Protein sequence string

        Returns:
            bool: True if valid, False otherwise
        """
        if not protein:
            return False

        # Check if all characters are valid amino acids
        valid_chars = set(ProteinMassCalculator.MONOISOTOPIC_MASSES.keys())
        protein_chars = set(protein.upper())

        return protein_chars.issubset(valid_chars)

    @staticmethod
    def get_amino_acid_mass(amino_acid: str) -> Optional[float]:
        """
        Get the monoisotopic mass of a single amino acid.

        Args:
            amino_acid (str): Single amino acid code

        Returns:
            float or None: Monoisotopic mass in Daltons, or None if invalid
        """
        return ProteinMassCalculator.MONOISOTOPIC_MASSES.get(amino_acid.upper())

    def calculate_protein_mass(self, protein_string: str) -> float:
        """
        Calculate the total monoisotopic mass of a protein string.

        Args:
            protein_string (str): Protein sequence

        Returns:
            float: Total monoisotopic mass in Daltons

        Raises:
            ValueError: If protein string contains invalid amino acids
        """
        # Clean the protein string
        protein = protein_string.strip().upper()

        # Validate the protein string
        if not self.validate_protein_string(protein):
            invalid_chars = set(protein) - set(self.MONOISOTOPIC_MASSES.keys())
            raise ValueError(f"Invalid amino acid codes found: {invalid_chars}")

        # Calculate total mass
        total_mass = 0.0

        for amino_acid in protein:
            mass = self.get_amino_acid_mass(amino_acid)
            if mass is not None:
                total_mass += mass
            else:
                raise ValueError(f"Unknown amino acid: {amino_acid}")

        return total_mass

    def get_mass_breakdown(self, protein_string: str) -> Dict[str, float]:
        """
        Get a detailed breakdown of mass contributions by amino acid.

        Args:
            protein_string (str): Protein sequence

        Returns:
            Dict[str, float]: Dictionary with amino acid counts and masses
        """
        protein = protein_string.strip().upper()

        if not self.validate_protein_string(protein):
            raise ValueError("Invalid protein string")

        breakdown = {}
        amino_acid_counts = {}

        # Count amino acids
        for aa in protein:
            amino_acid_counts[aa] = amino_acid_counts.get(aa, 0) + 1

        # Calculate mass contributions
        for aa, count in amino_acid_counts.items():
            mass_per_aa = self.get_amino_acid_mass(aa)
            breakdown[aa] = {
                'count': count,
                'mass_per_residue': mass_per_aa,
                'total_mass': mass_per_aa * count
            }

        return breakdown


def parse_input_file(file_path: str) -> str:
    """
    Parse input file to extract the protein string.

    Args:
        file_path (str): Path to input file

    Returns:
        str: Protein string

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read().strip()

        # Remove any whitespace and newlines
        protein_string = re.sub(r'\s+', '', content)

        if not protein_string:
            raise ValueError("Input file is empty or contains no valid protein sequence")

        # Validate length constraint
        if len(protein_string) > 1000:
            raise ValueError(f"Protein string length {len(protein_string)} exceeds maximum of 1000")

        return protein_string

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def write_output_file(output_path: str, mass: float) -> None:
    """
    Write result to output file.

    Args:
        output_path (str): Path to output file
        mass (float): Calculated protein mass
    """
    try:
        with open(output_path, 'w') as file:
            # Format to 3 decimal places to match expected output format
            file.write(f"{mass:.3f}\n")
    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_protein_mass_problem(input_file_path: str) -> float:
    """
    Solve the Protein Mass problem for a given input file.

    Args:
        input_file_path (str): Path to input file containing protein string

    Returns:
        float: Total protein mass in Daltons

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse input
        protein_string = parse_input_file(input_file_path)

        # Initialize calculator
        calculator = ProteinMassCalculator()

        # Calculate mass
        total_mass = calculator.calculate_protein_mass(protein_string)

        return total_mass

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the Protein Mass problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "rosalind_prtm.txt"  # Change this to your input file name
    output_file = "output_prtm.txt"

    try:
        print("Solving Protein Mass Problem...")

        # Solve the problem
        total_mass = solve_protein_mass_problem(input_file)

        # Display results
        print(f"\nResult:")
        print(f"Total protein mass: {total_mass:.3f} Da")

        # Write to output file
        write_output_file(output_file, total_mass)
        print(f"\nResult written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data.
    """
    print("=== Demo with Sample Data ===")

    # Sample input
    sample_protein = "SKADYEK"

    print(f"Input protein string: {sample_protein}")

    calculator = ProteinMassCalculator()

    # Calculate mass
    total_mass = calculator.calculate_protein_mass(sample_protein)
    print(f"Total mass: {total_mass:.3f} Da")

    # Expected output
    print(f"Expected output: 821.392 Da")

    # Show detailed breakdown
    print(f"\n=== Detailed Breakdown ===")
    breakdown = calculator.get_mass_breakdown(sample_protein)

    running_total = 0.0
    for aa in sample_protein:
        mass = calculator.get_amino_acid_mass(aa)
        running_total += mass
        print(f"{aa}: {mass:.5f} Da (running total: {running_total:.5f})")

    print(f"\nFinal total: {total_mass:.5f} Da")

    # Show amino acid frequency analysis
    print(f"\n=== Amino Acid Composition ===")
    for aa, info in sorted(breakdown.items()):
        print(f"{aa}: {info['count']} × {info['mass_per_residue']:.5f} = {info['total_mass']:.5f} Da")


def show_mass_table():
    """
    Display the complete monoisotopic mass table.
    """
    print("=== Monoisotopic Mass Table ===")
    print("Amino Acid | Code | Mass (Da)")
    print("-" * 35)

    # Full names for amino acids
    aa_names = {
        'A': 'Alanine', 'C': 'Cysteine', 'D': 'Aspartic acid',
        'E': 'Glutamic acid', 'F': 'Phenylalanine', 'G': 'Glycine',
        'H': 'Histidine', 'I': 'Isoleucine', 'K': 'Lysine',
        'L': 'Leucine', 'M': 'Methionine', 'N': 'Asparagine',
        'P': 'Proline', 'Q': 'Glutamine', 'R': 'Arginine',
        'S': 'Serine', 'T': 'Threonine', 'V': 'Valine',
        'W': 'Tryptophan', 'Y': 'Tyrosine'
    }

    for code, mass in sorted(ProteinMassCalculator.MONOISOTOPIC_MASSES.items()):
        name = aa_names.get(code, 'Unknown')
        print(f"{name:<12} | {code:>4} | {mass:>9.5f}")


# Example usage and testing
if __name__ == "__main__":
    # Show mass table
    show_mass_table()

    print("\n" + "="*60)

    # Run demo with sample
    demo_with_sample()

    print("\n" + "="*60)

    # Test with additional cases
    print("=== Additional Test Cases ===")
    calculator = ProteinMassCalculator()

    test_cases = [
        "A",           # Single amino acid
        "AA",          # Repeated amino acid
        "GAVL",        # Small peptide
        "MSKADYEK",    # Extended sample
    ]

    for test_protein in test_cases:
        try:
            mass = calculator.calculate_protein_mass(test_protein)
            print(f"{test_protein}: {mass:.3f} Da")
        except Exception as e:
            print(f"{test_protein}: Error - {e}")

    print("\n" + "="*60)

    # Uncomment to run with actual file input
    main()


=== Monoisotopic Mass Table ===
Amino Acid | Code | Mass (Da)
-----------------------------------
Alanine      |    A |  71.03711
Cysteine     |    C | 103.00919
Aspartic acid |    D | 115.02694
Glutamic acid |    E | 129.04259
Phenylalanine |    F | 147.06841
Glycine      |    G |  57.02146
Histidine    |    H | 137.05891
Isoleucine   |    I | 113.08406
Lysine       |    K | 128.09496
Leucine      |    L | 113.08406
Methionine   |    M | 131.04049
Asparagine   |    N | 114.04293
Proline      |    P |  97.05276
Glutamine    |    Q | 128.05858
Arginine     |    R | 156.10111
Serine       |    S |  87.03203
Threonine    |    T | 101.04768
Valine       |    V |  99.06841
Tryptophan   |    W | 186.07931
Tyrosine     |    Y | 163.06333

=== Demo with Sample Data ===
Input protein string: SKADYEK
Total mass: 821.392 Da
Expected output: 821.392 Da

=== Detailed Breakdown ===
S: 87.03203 Da (running total: 87.03203)
K: 128.09496 Da (running total: 215.12699)
A: 71.03711 Da (running total: 286.

## Rosalind Open Reading Frames (ORF) Problem Solution

In [None]:
"""
Rosalind Open Reading Frames (ORF) Problem Solution

This module finds all distinct candidate protein strings that can be translated
from Open Reading Frames (ORFs) in a DNA sequence. It considers all 6 reading
frames: 3 from the original sequence and 3 from the reverse complement.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Set, Optional
import re


class DNATranslator:
    """
    A class to handle DNA sequence translation and ORF finding operations.
    """

    # Standard genetic code table (DNA codons to amino acids)
    CODON_TABLE = {
        'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
        'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
        'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*',
        'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W',
        'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
        'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
        'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
        'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
        'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
        'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
        'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
        'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
        'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
        'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
        'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
        'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'
    }

    # Complement mapping for DNA bases
    COMPLEMENT_MAP = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}

    START_CODON = 'ATG'
    STOP_CODONS = {'TAA', 'TAG', 'TGA'}

    def __init__(self):
        """Initialize the DNA translator."""
        pass

    @staticmethod
    def reverse_complement(dna_sequence: str) -> str:
        """
        Generate the reverse complement of a DNA sequence.

        Args:
            dna_sequence (str): Input DNA sequence

        Returns:
            str: Reverse complement of the input sequence

        Raises:
            ValueError: If sequence contains invalid DNA bases
        """
        try:
            complement = ''.join(DNATranslator.COMPLEMENT_MAP[base]
                               for base in dna_sequence.upper())
            return complement[::-1]
        except KeyError as e:
            raise ValueError(f"Invalid DNA base found: {e}")

    @staticmethod
    def translate_codon(codon: str) -> Optional[str]:
        """
        Translate a single codon to its corresponding amino acid.

        Args:
            codon (str): 3-letter DNA codon

        Returns:
            str or None: Corresponding amino acid or None if invalid codon
        """
        if len(codon) != 3:
            return None
        return DNATranslator.CODON_TABLE.get(codon.upper())

    def find_orfs_in_frame(self, sequence: str, frame: int = 0) -> List[str]:
        """
        Find all ORFs in a specific reading frame.

        Args:
            sequence (str): DNA sequence
            frame (int): Reading frame offset (0, 1, or 2)

        Returns:
            List[str]: List of protein sequences found in this frame
        """
        proteins = []
        sequence = sequence.upper()

        # Start from the specified frame
        for i in range(frame, len(sequence) - 2, 3):
            codon = sequence[i:i+3]

            # Check if this is a start codon
            if codon == self.START_CODON:
                protein = 'M'  # Start with Methionine

                # Continue translating until stop codon or end of sequence
                for j in range(i + 3, len(sequence) - 2, 3):
                    next_codon = sequence[j:j+3]
                    amino_acid = self.translate_codon(next_codon)

                    if amino_acid is None:  # Invalid codon
                        break
                    elif amino_acid == '*':  # Stop codon
                        proteins.append(protein)
                        break
                    else:
                        protein += amino_acid

        return proteins

    def find_all_orfs(self, dna_sequence: str) -> Set[str]:
        """
        Find all possible ORFs in a DNA sequence across all 6 reading frames.

        Args:
            dna_sequence (str): Input DNA sequence

        Returns:
            Set[str]: Set of unique protein sequences
        """
        all_proteins = set()

        # Clean the sequence (remove whitespace and convert to uppercase)
        clean_sequence = re.sub(r'\s+', '', dna_sequence.upper())

        # Get reverse complement
        rev_comp = self.reverse_complement(clean_sequence)

        # Find ORFs in all 6 reading frames
        sequences = [clean_sequence, rev_comp]

        for seq in sequences:
            for frame in range(3):
                proteins = self.find_orfs_in_frame(seq, frame)
                all_proteins.update(proteins)

        return all_proteins


def parse_fasta(file_content: str) -> List[tuple]:
    """
    Parse FASTA format content and return sequences.

    Args:
        file_content (str): Content of FASTA file

    Returns:
        List[tuple]: List of (header, sequence) tuples
    """
    sequences = []
    current_header = ""
    current_sequence = ""

    for line in file_content.strip().split('\n'):
        line = line.strip()
        if line.startswith('>'):
            if current_header and current_sequence:
                sequences.append((current_header, current_sequence))
            current_header = line[1:]  # Remove '>' character
            current_sequence = ""
        else:
            current_sequence += line

    # Add the last sequence
    if current_header and current_sequence:
        sequences.append((current_header, current_sequence))

    return sequences


def solve_orf_problem(input_file_path: str) -> List[str]:
    """
    Solve the Open Reading Frame problem for a given input file.

    Args:
        input_file_path (str): Path to input file containing DNA sequence in FASTA format

    Returns:
        List[str]: List of unique protein sequences

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        # Read input file
        with open(input_file_path, 'r') as file:
            file_content = file.read()

        # Parse FASTA format
        sequences = parse_fasta(file_content)

        if not sequences:
            raise ValueError("No valid FASTA sequences found in input file")

        # Initialize translator
        translator = DNATranslator()

        # Process all sequences (usually just one for this problem)
        all_proteins = set()
        for header, sequence in sequences:
            proteins = translator.find_all_orfs(sequence)
            all_proteins.update(proteins)

        return sorted(list(all_proteins))

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{input_file_path}' not found")
    except Exception as e:
        raise ValueError(f"Error processing file: {str(e)}")


def main():
    """
    Main function to run the ORF problem solver.
    Designed to work in Google Colab environment.
    """
    # For Google Colab usage
    input_file = "rosalind_orf.txt"  # Change this to your input file name

    try:
        # Solve the problem
        protein_sequences = solve_orf_problem(input_file)

        # Print results
        print("Found protein sequences:")
        for protein in protein_sequences:
            print(protein)

        # Optionally write to output file
        with open("/content/rosalind_orf.txt", "w") as f:
            for protein in protein_sequences:
                f.write(protein + "\n")

        print(f"\nTotal unique protein sequences found: {len(protein_sequences)}")

    except Exception as e:
        print(f"Error: {e}")
        print("Make sure your input file is in the correct FASTA format")


# Example usage and testing
if __name__ == "__main__":
    # Test with sample data
    sample_data = """>Rosalind_99
AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"""

    # Test the parsing
    sequences = parse_fasta(sample_data)
    print("Parsed sequences:")
    for header, seq in sequences:
        print(f"Header: {header}")
        print(f"Sequence length: {len(seq)}")

    # Test the translator
    translator = DNATranslator()
    if sequences:
        proteins = translator.find_all_orfs(sequences[0][1])
        print(f"\nFound proteins: {sorted(proteins)}")

    # Run main function
    main()


Parsed sequences:
Header: Rosalind_99
Sequence length: 96

Found proteins: ['M', 'MGMTPRLGLESLLE', 'MLLGSFRLIPKETLIQVAGSSPCNLS', 'MTPRLGLESLLE']
Found protein sequences:
M
MAGNSYTESRRSSPIPKLK
MAGPS
MCPARSDISEI
MDLRKCVFSCSGHW
MGMAGNSYTESRRSSPIPKLK
MGRPC
MHCSQFLGLRTQGDLVHSNCVSRISCSK
MHFCFTGVCVTSGCNGDNSSDDIVV
MISG
MISWCRDTSLI
MKLSCLRGLLLMDLRKCVFSCSGHW
MKYLYTTISSDELSPLHPEVTHTPVKQKCIVVSS
MLEVARLLERC
MLISRAR
MPIRY
MPRTAENTLTQVH
MQRLTV
MRDLWV
MRKCENRQLPHKGFAPITRCS
MSERAGHIKNKIYRVI
MSLPSPEVQ
MSVRCHGAGIGSHNTRPDDFLGLDFTYVRASWAH
MTSLG

Total unique protein sequences found: 24


## Solution for Enumerating Gene Orders (Permutations) Problem

In [None]:
"""
Rosalind Enumerating Gene Orders Problem Solution

This module generates all possible permutations of a given length n, representing
different arrangements of synteny blocks in genomic rearrangements.

The problem asks for:
1. Total number of permutations of length n
2. All permutations listed in any order

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from itertools import permutations
from typing import List, Tuple
import math


class PermutationGenerator:
    """
    A class to handle permutation generation and counting operations.
    """

    def __init__(self):
        """Initialize the permutation generator."""
        pass

    @staticmethod
    def calculate_factorial(n: int) -> int:
        """
        Calculate factorial of n (n!).

        Args:
            n (int): Non-negative integer

        Returns:
            int: Factorial of n

        Raises:
            ValueError: If n is negative
        """
        if n < 0:
            raise ValueError("Factorial is not defined for negative numbers")
        return math.factorial(n)

    @staticmethod
    def count_permutations(n: int) -> int:
        """
        Count the total number of permutations of length n.

        Args:
            n (int): Length of permutation

        Returns:
            int: Total number of permutations (n!)
        """
        return PermutationGenerator.calculate_factorial(n)

    @staticmethod
    def generate_all_permutations(n: int) -> List[Tuple[int, ...]]:
        """
        Generate all permutations of integers from 1 to n.

        Args:
            n (int): Length of permutation

        Returns:
            List[Tuple[int, ...]]: List of all permutations

        Raises:
            ValueError: If n is not a positive integer or exceeds reasonable limits
        """
        if not isinstance(n, int) or n <= 0:
            raise ValueError("n must be a positive integer")

        if n > 10:  # Safety check to prevent memory issues
            raise ValueError("n is too large. Maximum supported value is 10")

        # Generate all permutations of {1, 2, ..., n}
        numbers = list(range(1, n + 1))
        all_perms = list(permutations(numbers))

        return all_perms

    def solve_permutation_problem(self, n: int) -> Tuple[int, List[str]]:
        """
        Solve the complete permutation problem.

        Args:
            n (int): Length of permutation

        Returns:
            Tuple[int, List[str]]: (count_of_permutations, formatted_permutations)
        """
        # Validate input
        if not isinstance(n, int) or n <= 0:
            raise ValueError("Input must be a positive integer")

        if n > 7:  # Based on problem constraint
            raise ValueError("n must be ≤ 7 according to problem constraints")

        # Generate all permutations
        all_perms = self.generate_all_permutations(n)

        # Count permutations
        count = len(all_perms)

        # Format permutations as strings
        formatted_perms = [' '.join(map(str, perm)) for perm in all_perms]

        return count, formatted_perms


def parse_input_file(file_path: str) -> int:
    """
    Parse input file to extract the integer n.

    Args:
        file_path (str): Path to input file

    Returns:
        int: The integer n from the file

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If file content is invalid
    """
    try:
        with open(file_path, 'r') as file:
            content = file.read().strip()

        # Parse the integer
        try:
            n = int(content)
            return n
        except ValueError:
            raise ValueError(f"Invalid input: '{content}' is not a valid integer")

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def write_output_file(output_path: str, count: int, permutations: List[str]) -> None:
    """
    Write results to output file.

    Args:
        output_path (str): Path to output file
        count (int): Number of permutations
        permutations (List[str]): List of formatted permutations
    """
    try:
        with open(output_path, 'w') as file:
            # Write count first
            file.write(f"{count}\n")

            # Write all permutations
            for perm in permutations:
                file.write(f"{perm}\n")

    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_gene_orders_problem(input_file_path: str) -> Tuple[int, List[str]]:
    """
    Solve the Gene Orders problem for a given input file.

    Args:
        input_file_path (str): Path to input file containing integer n

    Returns:
        Tuple[int, List[str]]: (count, list_of_permutations)

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse input
        n = parse_input_file(input_file_path)

        # Validate constraints
        if n > 7:
            raise ValueError("n must be ≤ 7 according to problem constraints")

        # Initialize solver
        generator = PermutationGenerator()

        # Solve the problem
        count, permutations = generator.solve_permutation_problem(n)

        return count, permutations

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the Gene Orders problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "/content/rosalind_perm.txt"  # Change this to your input file name
    output_file = "output_perm.txt"

    try:
        print("Solving Gene Orders (Permutations) Problem...")

        # Solve the problem
        count, permutations = solve_gene_orders_problem(input_file)

        # Display results
        print(f"\nResults:")
        print(f"Total number of permutations: {count}")
        print(f"All permutations:")

        for perm in permutations:
            print(perm)

        # Write to output file
        write_output_file(output_file, count, permutations)
        print(f"\nResults written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data.
    """
    print("=== Demo with Sample Data ===")

    # Sample input
    n = 3

    generator = PermutationGenerator()
    count, permutations = generator.solve_permutation_problem(n)

    print(f"Input: {n}")
    print(f"Output:")
    print(count)
    for perm in permutations:
        print(perm)

    # Verify the math
    expected_count = math.factorial(n)
    print(f"\nVerification: {n}! = {expected_count} ✓" if count == expected_count else "✗")


# Example usage and testing
if __name__ == "__main__":
    # Run demo first
    demo_with_sample()

    print("\n" + "="*50)

    # Test with different values
    print("=== Testing with different values ===")
    generator = PermutationGenerator()

    for test_n in [1, 2, 3, 4]:
        try:
            count, perms = generator.solve_permutation_problem(test_n)
            print(f"n={test_n}: {count} permutations")
        except Exception as e:
            print(f"n={test_n}: Error - {e}")

    print("\n" + "="*50)

    # Uncomment to run with actual file input
    main()


=== Demo with Sample Data ===
Input: 3
Output:
6
1 2 3
1 3 2
2 1 3
2 3 1
3 1 2
3 2 1

Verification: 3! = 6 ✓

=== Testing with different values ===
n=1: 1 permutations
n=2: 2 permutations
n=3: 6 permutations
n=4: 24 permutations

Solving Gene Orders (Permutations) Problem...

Results:
Total number of permutations: 720
All permutations:
1 2 3 4 5 6
1 2 3 4 6 5
1 2 3 5 4 6
1 2 3 5 6 4
1 2 3 6 4 5
1 2 3 6 5 4
1 2 4 3 5 6
1 2 4 3 6 5
1 2 4 5 3 6
1 2 4 5 6 3
1 2 4 6 3 5
1 2 4 6 5 3
1 2 5 3 4 6
1 2 5 3 6 4
1 2 5 4 3 6
1 2 5 4 6 3
1 2 5 6 3 4
1 2 5 6 4 3
1 2 6 3 4 5
1 2 6 3 5 4
1 2 6 4 3 5
1 2 6 4 5 3
1 2 6 5 3 4
1 2 6 5 4 3
1 3 2 4 5 6
1 3 2 4 6 5
1 3 2 5 4 6
1 3 2 5 6 4
1 3 2 6 4 5
1 3 2 6 5 4
1 3 4 2 5 6
1 3 4 2 6 5
1 3 4 5 2 6
1 3 4 5 6 2
1 3 4 6 2 5
1 3 4 6 5 2
1 3 5 2 4 6
1 3 5 2 6 4
1 3 5 4 2 6
1 3 5 4 6 2
1 3 5 6 2 4
1 3 5 6 4 2
1 3 6 2 4 5
1 3 6 2 5 4
1 3 6 4 2 5
1 3 6 4 5 2
1 3 6 5 2 4
1 3 6 5 4 2
1 4 2 3 5 6
1 4 2 3 6 5
1 4 2 5 3 6
1 4 2 5 6 3
1 4 2 6 3 5
1 4 2 6 5 3
1 4 3 2 5 6
1 

## Insertion Sort Swaps Problem

In [None]:
"""
Rosalind Insertion Sort Swaps Problem Solution

This module counts the number of swaps (shifts) performed by the insertion sort
algorithm when sorting an array of integers.

The problem asks for the total number of swaps needed to sort an array using
insertion sort, which is equivalent to counting the number of inversions in the array.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Tuple
import copy


class InsertionSortAnalyzer:
    """
    A class to analyze insertion sort performance and count swaps/inversions.
    """

    def __init__(self):
        """Initialize the insertion sort analyzer."""
        self.swap_count = 0

    def insertion_sort_with_count(self, arr: List[int]) -> int:
        """
        Perform insertion sort while counting the number of swaps.

        Args:
            arr (List[int]): Array to sort (will be modified)

        Returns:
            int: Number of swaps performed
        """
        swap_count = 0
        n = len(arr)

        # Start from second element (index 1)
        for i in range(1, n):
            current_value = arr[i]
            position = i

            # Shift elements to the right while they are greater than current_value
            while position > 0 and arr[position - 1] > current_value:
                arr[position] = arr[position - 1]  # This is a swap/shift
                position -= 1
                swap_count += 1

            # Place current_value at its correct position
            arr[position] = current_value

        return swap_count

    def count_swaps_without_sorting(self, arr: List[int]) -> int:
        """
        Count swaps needed for insertion sort without actually sorting.
        This counts inversions in the array.

        Args:
            arr (List[int]): Original array (won't be modified)

        Returns:
            int: Number of swaps needed
        """
        swap_count = 0
        n = len(arr)

        # For each element starting from second
        for i in range(1, n):
            current_value = arr[i]

            # Count how many elements to the left are greater
            for j in range(i - 1, -1, -1):
                if arr[j] > current_value:
                    swap_count += 1
                else:
                    break  # Elements to the left are sorted, so we can break

        return swap_count

    def count_inversions_merge_sort(self, arr: List[int]) -> int:
        """
        Count inversions using modified merge sort (O(n log n) approach).

        Args:
            arr (List[int]): Array to analyze

        Returns:
            int: Number of inversions (swaps needed)
        """
        def merge_and_count(arr: List[int], temp: List[int], left: int, mid: int, right: int) -> int:
            """Helper function to merge and count inversions."""
            i, j, k = left, mid + 1, left
            inversion_count = 0

            # Merge the two halves while counting inversions
            while i <= mid and j <= right:
                if arr[i] <= arr[j]:
                    temp[k] = arr[i]
                    i += 1
                else:
                    temp[k] = arr[j]
                    # All elements from i to mid are greater than arr[j]
                    inversion_count += (mid - i + 1)
                    j += 1
                k += 1

            # Copy remaining elements
            while i <= mid:
                temp[k] = arr[i]
                i += 1
                k += 1

            while j <= right:
                temp[k] = arr[j]
                j += 1
                k += 1

            # Copy back to original array
            for i in range(left, right + 1):
                arr[i] = temp[i]

            return inversion_count

        def merge_sort_and_count(arr: List[int], temp: List[int], left: int, right: int) -> int:
            """Recursive function to perform merge sort and count inversions."""
            inversion_count = 0
            if left < right:
                mid = (left + right) // 2

                inversion_count += merge_sort_and_count(arr, temp, left, mid)
                inversion_count += merge_sort_and_count(arr, temp, mid + 1, right)
                inversion_count += merge_and_count(arr, temp, left, mid, right)

            return inversion_count

        # Create a copy to avoid modifying original array
        arr_copy = arr.copy()
        temp = [0] * len(arr_copy)
        return merge_sort_and_count(arr_copy, temp, 0, len(arr_copy) - 1)

    def solve_insertion_sort_problem(self, arr: List[int], method: str = "direct") -> int:
        """
        Solve the insertion sort swap counting problem.

        Args:
            arr (List[int]): Input array
            method (str): Method to use - "direct", "count_only", or "merge_sort"

        Returns:
            int: Number of swaps needed
        """
        if not arr:
            return 0

        if method == "direct":
            # Actually perform insertion sort and count swaps
            arr_copy = copy.deepcopy(arr)
            return self.insertion_sort_with_count(arr_copy)

        elif method == "count_only":
            # Count swaps without sorting (O(n²) but doesn't modify array)
            return self.count_swaps_without_sorting(arr)

        elif method == "merge_sort":
            # Use merge sort approach (O(n log n))
            return self.count_inversions_merge_sort(arr)

        else:
            raise ValueError("Method must be 'direct', 'count_only', or 'merge_sort'")


def parse_input_file(file_path: str) -> Tuple[int, List[int]]:
    """
    Parse input file to extract n and the array.

    Args:
        file_path (str): Path to input file

    Returns:
        Tuple[int, List[int]]: (n, array)

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If file format is invalid
    """
    try:
        with open(file_path, 'r') as file:
            lines = file.read().strip().split('\n')

        if len(lines) < 2:
            raise ValueError("Input file must contain at least 2 lines")

        # Parse n
        try:
            n = int(lines[0].strip())
        except ValueError:
            raise ValueError(f"First line must be an integer, got: '{lines[0]}'")

        # Parse array
        try:
            array = list(map(int, lines[1].strip().split()))
        except ValueError:
            raise ValueError(f"Second line must contain integers, got: '{lines[1]}'")

        # Validate
        if len(array) != n:
            raise ValueError(f"Array length {len(array)} doesn't match specified n={n}")

        if n > 1000:
            raise ValueError(f"n must be ≤ 1000, got {n}")

        return n, array

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")


def write_output_file(output_path: str, swap_count: int) -> None:
    """
    Write result to output file.

    Args:
        output_path (str): Path to output file
        swap_count (int): Number of swaps
    """
    try:
        with open(output_path, 'w') as file:
            file.write(f"{swap_count}\n")
    except Exception as e:
        raise IOError(f"Error writing to output file: {e}")


def solve_insertion_sort_swaps_problem(input_file_path: str) -> int:
    """
    Solve the Insertion Sort Swaps problem for a given input file.

    Args:
        input_file_path (str): Path to input file

    Returns:
        int: Number of swaps needed

    Raises:
        FileNotFoundError: If input file doesn't exist
        ValueError: If input is invalid
    """
    try:
        # Parse input
        n, array = parse_input_file(input_file_path)

        # Initialize analyzer
        analyzer = InsertionSortAnalyzer()

        # Solve using the most appropriate method
        # For n ≤ 1000, direct method is fine
        if n <= 1000:
            swap_count = analyzer.solve_insertion_sort_problem(array, method="direct")
        else:
            # Use merge sort method for larger arrays
            swap_count = analyzer.solve_insertion_sort_problem(array, method="merge_sort")

        return swap_count

    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}")


def main():
    """
    Main function to run the Insertion Sort Swaps problem solver.
    Designed to work in Google Colab environment.
    """
    # Configuration
    input_file = "rosalind_ins.txt"  # Change this to your input file name
    output_file = "output_ins.txt"

    try:
        print("Solving Insertion Sort Swaps Problem...")

        # Solve the problem
        swap_count = solve_insertion_sort_swaps_problem(input_file)

        # Display results
        print(f"\nResult:")
        print(f"Number of swaps needed: {swap_count}")

        # Write to output file
        write_output_file(output_file, swap_count)
        print(f"\nResult written to: {output_file}")

    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as e:
        print(f"Error: {e}")

    except Exception as e:
        print(f"Unexpected error: {e}")


def demo_with_sample():
    """
    Demonstrate the solution with the sample data.
    """
    print("=== Demo with Sample Data ===")

    # Sample input
    n = 6
    array = [6, 10, 4, 5, 1, 2]

    print(f"Input:")
    print(f"n = {n}")
    print(f"Array = {array}")

    analyzer = InsertionSortAnalyzer()

    # Test all methods
    methods = ["direct", "count_only", "merge_sort"]

    for method in methods:
        swap_count = analyzer.solve_insertion_sort_problem(array.copy(), method)
        print(f"\nMethod '{method}': {swap_count} swaps")

    # Expected output is 12
    print(f"\nExpected output: 12")

    # Show step-by-step insertion sort
    print(f"\n=== Step-by-step insertion sort ===")
    arr_copy = array.copy()
    print(f"Initial: {arr_copy}")

    total_swaps = 0
    for i in range(1, len(arr_copy)):
        current = arr_copy[i]
        pos = i
        swaps_this_round = 0

        while pos > 0 and arr_copy[pos - 1] > current:
            arr_copy[pos] = arr_copy[pos - 1]
            pos -= 1
            swaps_this_round += 1

        arr_copy[pos] = current
        total_swaps += swaps_this_round
        print(f"After inserting {current}: {arr_copy} (swaps: {swaps_this_round}, total: {total_swaps})")


# Example usage and testing
if __name__ == "__main__":
    # Run demo first
    demo_with_sample()

    print("\n" + "="*60)

    # Test with additional cases
    print("=== Additional Test Cases ===")
    analyzer = InsertionSortAnalyzer()

    test_cases = [
        [1, 2, 3, 4, 5],  # Already sorted - 0 swaps
        [5, 4, 3, 2, 1],  # Reverse sorted - maximum swaps
        [3, 1, 4, 1, 5],  # Random case
    ]

    for i, test_array in enumerate(test_cases):
        swaps = analyzer.solve_insertion_sort_problem(test_array.copy(), "direct")
        print(f"Test {i+1}: {test_array} -> {swaps} swaps")

    print("\n" + "="*60)

    # Uncomment to run with actual file input
    main()


=== Demo with Sample Data ===
Input:
n = 6
Array = [6, 10, 4, 5, 1, 2]

Method 'direct': 12 swaps

Method 'count_only': 6 swaps

Method 'merge_sort': 12 swaps

Expected output: 12

=== Step-by-step insertion sort ===
Initial: [6, 10, 4, 5, 1, 2]
After inserting 10: [6, 10, 4, 5, 1, 2] (swaps: 0, total: 0)
After inserting 4: [4, 6, 10, 5, 1, 2] (swaps: 2, total: 2)
After inserting 5: [4, 5, 6, 10, 1, 2] (swaps: 2, total: 4)
After inserting 1: [1, 4, 5, 6, 10, 2] (swaps: 4, total: 8)
After inserting 2: [1, 2, 4, 5, 6, 10] (swaps: 4, total: 12)

=== Additional Test Cases ===
Test 1: [1, 2, 3, 4, 5] -> 0 swaps
Test 2: [5, 4, 3, 2, 1] -> 10 swaps
Test 3: [3, 1, 4, 1, 5] -> 3 swaps

Solving Insertion Sort Swaps Problem...
Error: Error solving problem: Input file 'rosalind_ins.txt' not found
