<a href="https://colab.research.google.com/github/mbjallow6/Algorithms-python/blob/main/Rossalind_Problems_Part_Two.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [41]:
# Upload input data from the local machine into the Colab session.
# `uploaded` holds whatever `files.upload()` returns.
from google.colab import files
uploaded = files.upload()

Saving rosalind_pdpl.txt to rosalind_pdpl.txt


## Rosalind Cyclic Superstring from k-mers Solution

In [None]:
"""
Rosalind Cyclic Superstring from k-mers Solution (Corrected)

This module reconstructs a minimal cyclic superstring from a set of k-mers
forming a de Bruijn graph with exactly one simple cycle.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from collections import defaultdict, deque
from typing import Dict, List, Set, Tuple

class CyclicSuperstringReconstructor:
    """
    Reconstruct a minimal cyclic superstring from a collection of k-mers.

    The k-mers are turned into a de Bruijn graph ((k-1)-mer prefix ->
    (k-1)-mer suffix), an Eulerian cycle is traced through it, and the
    cycle is spelled out and trimmed to one character per k-mer.
    """

    def __init__(self):
        """Initialize the reconstructor (it keeps no state)."""
        pass

    def build_de_bruijn_graph(self, kmers: List[str]) -> Dict[str, List[str]]:
        """
        Build the de Bruijn graph from k-mers.

        Args:
            kmers (List[str]): List of k-mers

        Returns:
            Dict[str, List[str]]: Adjacency list mapping each (k-1)-mer prefix
            to the suffixes of the k-mers that start with it (one entry per
            k-mer, so duplicates become parallel edges)
        """
        adj = defaultdict(list)
        for kmer in kmers:
            adj[kmer[:-1]].append(kmer[1:])
        return adj

    def find_eulerian_cycle(self, adj: Dict[str, List[str]]) -> List[str]:
        """
        Trace an Eulerian cycle through the graph with Hierholzer's algorithm.

        Args:
            adj (Dict[str, List[str]]): Adjacency list; it is copied, not mutated

        Returns:
            List[str]: Node sequence of the cycle (first node repeated at the
            end), or an empty list when the graph has no edges

        Raises:
            ValueError: If some node's in-degree differs from its out-degree,
                or if some edges are unreachable from the start node
        """
        # Work on a private copy so the caller's adjacency list stays intact.
        remaining = defaultdict(list)
        for node, neighbors in adj.items():
            remaining[node] = list(neighbors)

        total_edges = sum(len(targets) for targets in remaining.values())

        # An Eulerian cycle requires in-degree == out-degree at every node.
        # Without this check the stack walk below can emit a node sequence
        # whose consecutive nodes are not actually joined by an edge.
        balance = defaultdict(int)
        for node, targets in remaining.items():
            balance[node] += len(targets)
            for target in targets:
                balance[target] -= 1
        if any(balance.values()):
            raise ValueError("Graph has no Eulerian cycle: in/out degrees differ")

        # Compare against None explicitly: the empty string is a legal node
        # label and must not be mistaken for "no start node".
        start = next((node for node, targets in remaining.items() if targets), None)
        if start is None:
            return []

        cycle = []
        stack = [start]
        while stack:
            current = stack[-1]
            if remaining[current]:
                stack.append(remaining[current].pop())
            else:
                # Dead end: this node is finished, emit it (in reverse order).
                cycle.append(stack.pop())
        cycle.reverse()

        # A node is only emitted once its edge list is empty, so any edge
        # still left now belongs to a component unreachable from `start`.
        if any(remaining.values()):
            raise ValueError("No node with remaining edges found in cycle")

        # Sanity check: a walk over E edges visits E + 1 nodes.
        if len(cycle) != total_edges + 1:
            raise ValueError("Cycle length incorrect")

        return cycle

    def reconstruct_cyclic_superstring(self, kmers: List[str]) -> str:
        """
        Reconstruct the cyclic superstring spelled by the k-mers.

        Args:
            kmers (List[str]): k-mers of equal length (k >= 2)

        Returns:
            str: Cyclic superstring of length len(kmers); "" for no k-mers

        Raises:
            ValueError: If k < 2, if no Eulerian cycle covering every k-mer
                exists, or if the spelled string has an unexpected length
        """
        if not kmers:
            return ""

        m = len(kmers)
        k = len(kmers[0])
        if k < 2:
            # With k == 1 every graph node is the empty string, so the cycle
            # carries no characters to spell the superstring from.
            raise ValueError("k-mers must have length at least 2")

        adj = self.build_de_bruijn_graph(kmers)
        cycle = self.find_eulerian_cycle(adj)

        if len(cycle) != m + 1:
            raise ValueError("Eulerian cycle does not cover all k-mers")

        # Spell the walk: the first node in full, then one new character
        # (the last one) from each following node.
        superstring = cycle[0] + ''.join(node[-1] for node in cycle[1:])

        # Length should be (k-1) + m
        if len(superstring) != k + m - 1:
            raise ValueError("Full string length incorrect")

        # The last k-1 characters repeat the beginning of the cycle; drop them.
        return superstring[:m]

    def verify_superstring(self, kmers: List[str], superstring: str) -> Tuple[bool, str]:
        """
        Verify that the cyclic k-mers of the superstring are exactly the input k-mers.

        Args:
            kmers (List[str]): Original k-mers
            superstring (str): Reconstructed superstring

        Returns:
            Tuple[bool, str]: (is_valid, message)
        """
        k = len(kmers[0]) if kmers else 0
        n = len(superstring)

        if n == 0:
            generated = set()
        else:
            # Unroll the cycle often enough that every window of length k
            # starting in the first copy fits, even when k is larger than n.
            repeats = (k + n - 1) // n + 1
            unrolled = superstring * repeats
            generated = {unrolled[i:i + k] for i in range(n)}

        if generated == set(kmers):
            return True, "Valid: Contains all k-mers cyclically"
        return False, f"Invalid: Missing some k-mers (generated {len(generated)}, expected {len(kmers)})"

def parse_input_file(file_path: str) -> List[str]:
    """
    Read k-mers from a text file, one per non-blank line.

    Args:
        file_path (str): Path to input file

    Returns:
        List[str]: The k-mers with surrounding whitespace removed

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file holds no k-mers or their lengths differ
    """
    try:
        with open(file_path, 'r') as handle:
            stripped = (raw.strip() for raw in handle)
            kmers = [entry for entry in stripped if entry]

        if len(kmers) == 0:
            raise ValueError("No k-mers found in input")

        # Every k-mer has to be as long as the first one.
        expected_len = len(kmers[0])
        for entry in kmers:
            if len(entry) != expected_len:
                raise ValueError("All k-mers must have the same length")

        return kmers
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")

def write_output_file(output_path: str, superstring: str) -> None:
    """
    Save the superstring, followed by a newline, to a text file.

    Args:
        output_path (str): Path to output file
        superstring (str): Cyclic superstring

    Raises:
        IOError: If anything goes wrong while writing
    """
    try:
        with open(output_path, 'w') as sink:
            # Joining with an empty tail appends exactly one newline.
            sink.write('\n'.join((superstring, '')))
    except Exception as exc:
        raise IOError(f"Error writing to output file: {exc}")

def solve_cyclic_superstring(input_file_path: str) -> str:
    """
    Solve the cyclic superstring reconstruction problem.

    Args:
        input_file_path (str): Input file path

    Returns:
        str: Minimal cyclic superstring

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, reconstruction or verification fails
    """
    try:
        kmers = parse_input_file(input_file_path)
        print(f"Parsed {len(kmers)} k-mers of length {len(kmers[0])}")

        reconstructor = CyclicSuperstringReconstructor()
        superstring = reconstructor.reconstruct_cyclic_superstring(kmers)

        is_valid, msg = reconstructor.verify_superstring(kmers, superstring)
        if not is_valid:
            raise ValueError(msg)
        print(f"Verification: {msg}")

        return superstring
    except FileNotFoundError:
        # Let "file not found" through unchanged so that callers with a
        # dedicated FileNotFoundError handler (such as main) can react to it.
        raise
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Run the cyclic superstring solver on the configured input file.

    Prints the result, writes it to the configured output file and reports
    any error on standard output instead of raising.
    """
    # Configuration: change the input name to match your dataset.
    input_file, output_file = "rosalind_pcov.txt", "output_pcov.txt"

    try:
        print("Solving Cyclic Superstring Problem...")
        result = solve_cyclic_superstring(input_file)
        print(f"\nCyclic Superstring: {result}")

        write_output_file(output_file, result)
        print(f"Result written to: {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except ValueError as err:
        print(f"Error: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")

def test_cyclic_superstring():
    """Run the reconstructor on the sample dataset and print the outcome."""
    print("=== Testing Cyclic Superstring Reconstructor ===")

    test_cases = [
        {
            "kmers": ["ATTAC", "TACAG", "GATTA", "ACAGA", "CAGAT", "TTACA", "AGATT"],
            "expected": "GATTACA",
        },
    ]

    reconstructor = CyclicSuperstringReconstructor()

    for number, case in enumerate(test_cases, start=1):
        kmers, expected = case["kmers"], case["expected"]
        print(f"\nTest {number}: {len(kmers)} k-mers")

        superstring = reconstructor.reconstruct_cyclic_superstring(kmers)
        print(f"Reconstructed: {superstring}")

        is_valid, msg = reconstructor.verify_superstring(kmers, superstring)

        # A cyclic string has no fixed start, so accept any rotation.
        rotations = {expected[shift:] + expected[:shift] for shift in range(len(expected))}
        matches_expected = superstring in rotations

        mark = '✓' if (is_valid and matches_expected) else '✗'
        print(f"Valid: {mark} - {msg} (Matches expected cyclically: {matches_expected})")

if __name__ == "__main__":
    # Self-test on the sample first, then solve the configured dataset.
    test_cyclic_superstring()

    divider = "=" * 60
    print("\n" + divider)

    print("Running main function...")
    main()


## Rosalind De Bruijn Graph with Reverse Complements Solution

In [None]:
"""
Rosalind De Bruijn Graph with Reverse Complements Solution

This module constructs the de Bruijn graph from (k+1)-mers including reverse
complements and outputs the adjacency list.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Set, Dict
from collections import defaultdict

class DeBruijnGraphBuilder:
    """
    Build the de Bruijn graph of a set of reads and their reverse complements.
    """

    def __init__(self):
        """Initialize the graph builder with the DNA base-pairing table."""
        self.complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    def reverse_complement(self, seq: str) -> str:
        """
        Compute reverse complement of a DNA sequence.

        Args:
            seq (str): DNA sequence over the alphabet A, C, G, T

        Returns:
            str: Reverse complement

        Raises:
            KeyError: If seq contains a character other than A, C, G, T
        """
        return ''.join(self.complement[base] for base in reversed(seq))

    def _edge_pairs(self, reads: List[str]) -> Set[Tuple[str, str]]:
        """Return the distinct (prefix, suffix) edges implied by reads and their reverse complements."""
        if not reads:
            return set()

        # All reads are expected to be as long as the first one; strands of
        # any other length are skipped rather than producing odd-sized nodes.
        read_len = len(reads[0])

        strands = set(reads)
        strands.update(self.reverse_complement(read) for read in set(reads))

        return {
            (strand[:-1], strand[1:])
            for strand in strands
            if len(strand) == read_len
        }

    def build_graph(self, reads: List[str]) -> Dict[str, Set[str]]:
        """
        Build de Bruijn graph adjacency list.

        Args:
            reads (List[str]): List of (k+1)-mers

        Returns:
            Dict[str, Set[str]]: Adjacency list (prefix -> set of suffixes);
            always a plain dict, empty when there are no reads
        """
        adj = defaultdict(set)
        for prefix, suffix in self._edge_pairs(reads):
            adj[prefix].add(suffix)
        # Hand back a plain dict so a lookup of a missing node by the caller
        # cannot silently add an empty entry, and so the return type is the
        # same for empty and non-empty input.
        return dict(adj)

    def get_adjacency_list(self, adj: Dict[str, Set[str]]) -> List[str]:
        """
        Get sorted adjacency list in format "(prefix, suffix)".

        Args:
            adj (Dict[str, Set[str]]): Adjacency list

        Returns:
            List[str]: Edge strings sorted by prefix, then by suffix
        """
        return [
            f"({prefix}, {suffix})"
            for prefix in sorted(adj)
            for suffix in sorted(adj[prefix])
        ]

    def verify_graph(self, reads: List[str], edges: List[str]) -> Tuple[bool, str]:
        """
        Verify the edge strings against the edges implied by the reads.

        The expected edge set is derived from `reads` themselves (each read
        and its reverse complement contributes one prefix -> suffix edge),
        so the check is meaningful for any dataset, not only the sample.

        Args:
            reads (List[str]): Input reads
            edges (List[str]): Computed edges in "(prefix, suffix)" format

        Returns:
            Tuple[bool, str]: (is_valid, message)
        """
        expected = {f"({prefix}, {suffix})" for prefix, suffix in self._edge_pairs(reads)}
        if set(edges) == expected:
            return True, "Valid: Matches expected edges"
        return False, "Invalid: Does not match expected edges"

def parse_input_file(file_path: str) -> List[str]:
    """
    Read the reads from a text file, one per non-blank line.

    Args:
        file_path (str): Path to input file

    Returns:
        List[str]: Reads with surrounding whitespace removed

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If there are no reads or their lengths differ
    """
    try:
        with open(file_path, 'r') as handle:
            reads = []
            for raw in handle:
                cleaned = raw.strip()
                if cleaned:
                    reads.append(cleaned)

        if not reads:
            raise ValueError("No reads found in input")

        # More than one distinct length means the reads are not uniform.
        if len({len(read) for read in reads}) > 1:
            raise ValueError("All reads must have equal length")

        return reads
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")

def write_output_file(output_path: str, edges: List[str]) -> None:
    """
    Save the edge strings to a text file, one per line.

    Args:
        output_path (str): Path to output file
        edges (List[str]): List of edge strings

    Raises:
        IOError: If anything goes wrong while writing
    """
    try:
        with open(output_path, 'w') as sink:
            body = '\n'.join(edges)
            # A trailing newline is always written, even for an empty list.
            sink.write(body + '\n')
    except Exception as exc:
        raise IOError(f"Error writing to output file: {exc}")

def solve_de_bruijn_graph(input_file_path: str) -> List[str]:
    """
    Solve the de Bruijn graph construction problem.

    Args:
        input_file_path (str): Input file path

    Returns:
        List[str]: Adjacency list strings

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing or graph construction fails
    """
    try:
        reads = parse_input_file(input_file_path)
        print(f"Parsed {len(reads)} reads of length {len(reads[0])}")

        builder = DeBruijnGraphBuilder()
        adj = builder.build_graph(reads)
        edges = builder.get_adjacency_list(adj)

        # The verification outcome is only reported, never enforced.
        _, msg = builder.verify_graph(reads, edges)
        print(f"Verification: {msg}")

        return edges
    except FileNotFoundError:
        # Let "file not found" through unchanged so that callers with a
        # dedicated FileNotFoundError handler (such as main) can react to it.
        raise
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Run the de Bruijn graph solver on the configured input file.

    Prints the first three edges, writes the full list to the configured
    output file and reports any error on standard output instead of raising.
    """
    # Configuration: change the input name to match your dataset.
    input_file, output_file = "rosalind_dbru.txt", "output_dbru.txt"

    try:
        print("Solving De Bruijn Graph Problem...")
        edges = solve_de_bruijn_graph(input_file)

        # Only a preview goes to the screen; the file receives everything.
        print("\nPartial Adjacency List (first 3):")
        for preview in edges[:3]:
            print(preview)

        write_output_file(output_file, edges)
        print(f"Full result written to: {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except ValueError as err:
        print(f"Error: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")

def test_de_bruijn_graph():
    """Run the graph builder on the sample reads and print the outcome."""
    print("=== Testing De Bruijn Graph Builder ===")

    sample_reads = ["TGAT", "CATG", "TCAT", "ATGC", "CATC", "CATC"]
    print("\nSample Test:")

    builder = DeBruijnGraphBuilder()
    edges = builder.get_adjacency_list(builder.build_graph(sample_reads))

    # Show just the first three edges as a preview.
    print("Computed Edges:")
    for preview in edges[:3]:
        print(preview)

    is_valid, msg = builder.verify_graph(sample_reads, edges)
    mark = '✓' if is_valid else '✗'
    print(f"Valid: {mark} - {msg}")

if __name__ == "__main__":
    # Self-test on the sample first, then solve the configured dataset.
    test_de_bruijn_graph()

    divider = "=" * 60
    print("\n" + divider)

    print("Running main function...")
    main()


## Rosalind Spectral Convolution Solution

In [None]:
"""
Rosalind Spectral Convolution Solution

This module computes the Minkowski difference (spectral convolution) of two multisets,
finds the difference with maximum multiplicity, and outputs the multiplicity and |x|.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Dict, Tuple
from collections import defaultdict

class SpectralConvolution:
    """
    Compute the spectral convolution of two multisets and its most frequent difference.
    """

    def __init__(self, precision: int = 5):
        """Store the number of decimal places differences are rounded to."""
        self.precision = precision

    def compute_convolution(self, s1: List[float], s2: List[float]) -> Dict[float, int]:
        """
        Count every pairwise difference a - b with a from S1 and b from S2.

        Args:
            s1 (List[float]): First multiset
            s2 (List[float]): Second multiset

        Returns:
            Dict[float, int]: Rounded difference -> number of pairs producing it
        """
        places = self.precision
        tally = defaultdict(int)
        # Rounding merges differences that only differ by float noise.
        for delta in (round(a - b, places) for a in s1 for b in s2):
            tally[delta] += 1
        return tally

    def find_max_multiplicity(self, diff_count: Dict[float, int]) -> Tuple[int, float]:
        """
        Pick the difference that occurs most often.

        Args:
            diff_count (Dict[float, int]): Difference counts

        Returns:
            Tuple[int, float]: (max_multiplicity, abs_x); (0, 0.0) when there
            are no differences. On ties the first difference in the dict's
            iteration order wins.
        """
        if len(diff_count) == 0:
            return 0, 0.0

        # max() keeps the first of several equally frequent keys.
        best = max(diff_count, key=lambda delta: diff_count[delta])
        return diff_count[best], abs(best)

    def verify_convolution(self, s1: List[float], s2: List[float], expected_mult: int, expected_x: float) -> Tuple[bool, str]:
        """
        Compare the computed answer with an expected one.

        Args:
            s1, s2: Multisets
            expected_mult: Expected max multiplicity
            expected_x: Expected |x|

        Returns:
            Tuple[bool, str]: (is_valid, message)
        """
        mult, abs_x = self.find_max_multiplicity(self.compute_convolution(s1, s2))

        same_mult = mult == expected_mult
        close_x = abs(abs_x - expected_x) < 1e-5
        if not (same_mult and close_x):
            return False, f"Invalid: Got {mult}, {abs_x}; expected {expected_mult}, {expected_x}"
        return True, "Valid: Matches expected"

def parse_input_file(file_path: str) -> Tuple[List[float], List[float]]:
    """
    Parse input file: first line S1, second line S2.

    Blank lines are ignored; each remaining line holds whitespace-separated
    numbers.

    Args:
        file_path (str): Path to input file

    Returns:
        Tuple[List[float], List[float]]: S1 and S2

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If the file does not hold exactly two non-blank lines
            ("Input must have exactly two lines") or if a token is not a
            number ("Invalid numbers in input")
    """
    try:
        with open(file_path, 'r') as file:
            lines = [line.strip() for line in file if line.strip()]
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")

    # Checked outside the number-parsing try block so that this message is
    # not swallowed and replaced by the generic "Invalid numbers" error.
    if len(lines) != 2:
        raise ValueError("Input must have exactly two lines")

    try:
        s1 = [float(x) for x in lines[0].split()]
        s2 = [float(x) for x in lines[1].split()]
    except ValueError as exc:
        raise ValueError("Invalid numbers in input") from exc

    return s1, s2

def write_output_file(output_path: str, mult: int, abs_x: float) -> None:
    """
    Save the multiplicity and |x| to a text file, one value per line.

    Args:
        output_path (str): Path to output file
        mult (int): Max multiplicity
        abs_x (float): Absolute x

    Raises:
        IOError: If anything goes wrong while writing
    """
    try:
        with open(output_path, 'w') as sink:
            report = "{}\n{}\n".format(mult, abs_x)
            sink.write(report)
    except Exception as exc:
        raise IOError(f"Error writing to output file: {exc}")

def solve_spectral_convolution(input_file_path: str) -> Tuple[int, float]:
    """
    Solve the spectral convolution problem.

    Args:
        input_file_path (str): Input file path

    Returns:
        Tuple[int, float]: (max_multiplicity, abs_x)

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing or the computation fails
    """
    try:
        s1, s2 = parse_input_file(input_file_path)
        print(f"Parsed S1 ({len(s1)} elements), S2 ({len(s2)} elements)")

        conv = SpectralConvolution()
        diff_count = conv.compute_convolution(s1, s2)
        mult, abs_x = conv.find_max_multiplicity(diff_count)

        return mult, abs_x
    except FileNotFoundError:
        # Let "file not found" through unchanged so that callers with a
        # dedicated FileNotFoundError handler (such as main) can react to it.
        raise
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Run the spectral convolution solver on the configured input file.

    Prints the result, writes it to the configured output file and reports
    any error on standard output instead of raising.
    """
    # Configuration: change the input name to match your dataset.
    input_file, output_file = "rosalind_conv.txt", "output_conv.txt"

    try:
        print("Solving Spectral Convolution Problem...")
        mult, abs_x = solve_spectral_convolution(input_file)

        print(f"\nMax Multiplicity: {mult}")
        print(f"Absolute x: {abs_x}")

        write_output_file(output_file, mult, abs_x)
        print(f"Result written to: {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except ValueError as err:
        print(f"Error: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")

def test_spectral_convolution():
    """Run the convolution on the sample and a tiny case and print the outcome."""
    print("=== Testing Spectral Convolution ===")

    sample_case = {
        "s1": [186.07931, 287.12699, 548.20532, 580.18077, 681.22845, 706.27446, 782.27613, 968.35544, 968.35544],
        "s2": [101.04768, 158.06914, 202.09536, 318.09979, 419.14747, 463.17369],
        "expected_mult": 3,
        "expected_x": 85.03163,
    }
    # Simple case
    tiny_case = {"s1": [1.0, 2.0], "s2": [1.0], "expected_mult": 1, "expected_x": 0.0}
    test_cases = [sample_case, tiny_case]

    conv = SpectralConvolution()

    for number, case in enumerate(test_cases, start=1):
        s1, s2 = case["s1"], case["s2"]
        print(f"\nTest {number}: S1={s1[:3]}..., S2={s2[:3]}...")

        mult, abs_x = conv.find_max_multiplicity(conv.compute_convolution(s1, s2))
        print(f"Max Multiplicity: {mult}, Abs x: {abs_x}")

        is_valid, msg = conv.verify_convolution(s1, s2, case["expected_mult"], case["expected_x"])
        mark = '✓' if is_valid else '✗'
        print(f"Valid: {mark} - {msg}")

if __name__ == "__main__":
    # Self-test on the samples first, then solve the configured dataset.
    test_spectral_convolution()

    divider = "=" * 60
    print("\n" + divider)

    print("Running main function...")
    main()


## Rosalind Protein Reconstruction from Prefix Spectrum Solution

In [None]:
"""
Rosalind Protein Reconstruction from Prefix Spectrum Solution (Corrected)

This module reconstructs a protein string from a list of prefix masses using
the monoisotopic mass table, computing consecutive differences.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Dict, Tuple

class ProteinReconstructor:
    """
    Reconstruct a protein string from its prefix spectrum.

    Consecutive differences of the sorted prefix masses are matched against
    the monoisotopic mass table.
    """

    # Largest allowed gap between a measured mass difference and a table
    # mass. Reconstruction and verification share it so they always agree.
    MASS_TOLERANCE = 0.001

    def __init__(self):
        """Initialize with monoisotopic mass table."""
        self.mass_table = {
            'A': 71.03711, 'C': 103.00919, 'D': 115.02694, 'E': 129.04259,
            'F': 147.06841, 'G': 57.02146, 'H': 137.05891, 'I': 113.08406,
            'K': 128.09496, 'L': 113.08406, 'M': 131.04049, 'N': 114.04293,
            'P': 97.05276, 'Q': 128.05858, 'R': 156.10111, 'S': 87.03203,
            'T': 101.04768, 'V': 99.06841, 'W': 186.07931, 'Y': 163.06333
        }

    def reconstruct_protein(self, masses: List[float]) -> str:
        """
        Reconstruct protein string from prefix masses by computing differences.

        Args:
            masses (List[float]): List of prefix masses (any order)

        Returns:
            str: Protein string with one residue per consecutive mass pair;
            "" when fewer than two masses are given. I and L share one mass,
            so such a difference is always reported as 'I' (first in the table).

        Raises:
            ValueError: If a difference is further than MASS_TOLERANCE from
                every table mass
        """
        if len(masses) < 2:
            return ""

        # Sort masses to ensure increasing prefix order
        ordered = sorted(masses)

        protein = []
        for lighter, heavier in zip(ordered, ordered[1:]):
            diff = round(heavier - lighter, 5)
            # Find closest matching mass
            closest = min(self.mass_table, key=lambda aa: abs(self.mass_table[aa] - diff))
            if abs(self.mass_table[closest] - diff) > self.MASS_TOLERANCE:
                raise ValueError(f"No amino acid matches mass diff {diff}")
            protein.append(closest)

        return ''.join(protein)

    def verify_reconstruction(self, masses: List[float], protein: str) -> Tuple[bool, str]:
        """
        Verify by checking if differences in masses match amino acid masses.

        Differences are compared with the same tolerance that
        reconstruct_protein uses: spectrum masses usually carry fewer decimals
        than the mass table, so an exact comparison would reject correct
        reconstructions.

        Args:
            masses (List[float]): Original masses
            protein (str): Reconstructed protein

        Returns:
            Tuple[bool, str]: (is_valid, message)
        """
        unknown = [aa for aa in protein if aa not in self.mass_table]
        if unknown:
            return False, f"Invalid: Unknown amino acid '{unknown[0]}' in protein"

        ordered = sorted(masses)
        computed_diffs = [round(heavier - lighter, 5) for lighter, heavier in zip(ordered, ordered[1:])]
        protein_diffs = [round(self.mass_table[aa], 5) for aa in protein]

        matches = len(computed_diffs) == len(protein_diffs) and all(
            abs(measured - expected) <= self.MASS_TOLERANCE
            for measured, expected in zip(computed_diffs, protein_diffs)
        )
        if matches:
            return True, "Valid: Differences match amino acid masses"
        return False, f"Invalid: Differences do not match (computed {computed_diffs}, protein {protein_diffs})"

def parse_input_file(file_path: str) -> List[float]:
    """
    Read the masses from a text file, one number per line.

    Blank lines and lines beginning with '>' are skipped.

    Args:
        file_path (str): Path to input file

    Returns:
        List[float]: Masses in file order

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If a line is not a valid number
    """
    try:
        kept = []
        with open(file_path, 'r') as handle:
            for raw in handle:
                # The '>' test looks at the raw line, before stripping.
                if raw.startswith('>') or not raw.strip():
                    continue
                kept.append(raw.strip())
        return [float(entry) for entry in kept]
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")
    except ValueError:
        raise ValueError("Invalid mass value in input")

def write_output_file(output_path: str, protein: str) -> None:
    """
    Save the protein string, followed by a newline, to a text file.

    Args:
        output_path (str): Path to output file
        protein (str): Protein string

    Raises:
        IOError: If anything goes wrong while writing
    """
    try:
        with open(output_path, 'w') as sink:
            line = protein + '\n'
            sink.write(line)
    except Exception as exc:
        raise IOError(f"Error writing to output file: {exc}")

def solve_protein_reconstruction(input_file_path: str) -> str:
    """
    Solve the protein reconstruction problem.

    Args:
        input_file_path (str): Input file path

    Returns:
        str: Reconstructed protein

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, reconstruction or verification fails
    """
    try:
        masses = parse_input_file(input_file_path)
        print(f"Parsed {len(masses)} masses")

        reconstructor = ProteinReconstructor()
        protein = reconstructor.reconstruct_protein(masses)

        is_valid, msg = reconstructor.verify_reconstruction(masses, protein)
        if not is_valid:
            raise ValueError(msg)
        print(f"Verification: {msg}")

        return protein
    except FileNotFoundError:
        # Let "file not found" through unchanged so that callers with a
        # dedicated FileNotFoundError handler (such as main) can react to it.
        raise
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Run the protein reconstruction solver on the configured input file.

    Prints the result, writes it to the configured output file and reports
    any error on standard output instead of raising.
    """
    # Configuration: "spec" is the Rosalind ID used for this problem's files.
    input_file, output_file = "rosalind_spec.txt", "output_spec.txt"

    try:
        print("Solving Protein Reconstruction Problem...")
        protein = solve_protein_reconstruction(input_file)
        print(f"\nReconstructed Protein: {protein}")

        write_output_file(output_file, protein)
        print(f"Result written to: {output_file}")
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
    except ValueError as err:
        print(f"Error: {err}")
    except Exception as err:
        print(f"Unexpected error: {err}")

def test_protein_reconstruction():
    """Run the reconstructor on the sample and a shifted variant and print the outcome."""
    print("=== Testing Protein Reconstructor ===")

    # n prefix masses yield n - 1 residues, so every expected string is one
    # character shorter than its mass list.
    test_cases = [
        {"masses": [3524.8542, 3710.9335, 3841.974, 3970.0326, 4057.0646], "expected": "WMQS"},
        # Hypothetical extension: the sample without its first mass plus one
        # extra W (186.0793) appended at the end.
        {"masses": [3710.9335, 3841.974, 3970.0326, 4057.0646, 4243.1439], "expected": "MQSW"},
    ]

    reconstructor = ProteinReconstructor()

    for i, case in enumerate(test_cases, 1):
        masses = case["masses"]
        expected = case["expected"]
        print(f"\nTest {i}: Masses={masses}")

        protein = reconstructor.reconstruct_protein(masses)
        print(f"Reconstructed: {protein}")

        is_valid, msg = reconstructor.verify_reconstruction(masses, protein)
        matches_expected = protein == expected
        print(f"Valid: {'✓' if is_valid and matches_expected else '✗'} - {msg} (Matches expected: {matches_expected})")

if __name__ == "__main__":
    # Self-test on the samples first, then solve the configured dataset.
    test_protein_reconstruction()

    divider = "=" * 60
    print("\n" + divider)

    print("Running main function...")
    main()


## Rosalind Failure Array (KMP Prefix Table) Solution

In [None]:
"""
Rosalind Failure Array (KMP Prefix Table) Solution

This module computes the failure array for a DNA string using the Knuth-Morris-Pratt algorithm.
It handles FASTA input and outputs the array as space-separated integers.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Tuple

class FailureArrayComputer:
    """
    Compute the Knuth-Morris-Pratt failure array of a string.
    """

    def __init__(self):
        """Initialize the failure array computer (it keeps no state)."""
        pass

    def compute_failure_array(self, s: str) -> List[int]:
        """
        Build the KMP failure array (prefix table) of s.

        Args:
            s (str): Input string

        Returns:
            List[int]: Array P where P[i] is the length of the longest proper
            prefix of s[0..i] that is also a suffix of it; [] for ""
        """
        table = [0] * len(s)
        matched = 0  # length of the border carried over from the previous position

        for pos in range(1, len(s)):
            symbol = s[pos]
            # Fall back through shorter borders until one can be extended.
            while matched and symbol != s[matched]:
                matched = table[matched - 1]
            if symbol == s[matched]:
                matched += 1
            table[pos] = matched

        return table

    def verify_failure_array(self, s: str, pi: List[int]) -> Tuple[bool, str]:
        """
        Check the failure array against a small table of known answers.

        Args:
            s (str): Input string
            pi (List[int]): Computed failure array

        Returns:
            Tuple[bool, str]: (is_valid, message); strings without a known
            answer are reported as valid with an explanatory message
        """
        known_cases = {
            "CAGCATGGTATCACAGCAGAG": [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 2, 1, 2, 3, 4, 5, 3, 0, 0],
            "AAA": [0, 1, 2],
            "ABC": [0, 0, 0],
            "": []
        }

        expected = known_cases.get(s)
        if expected is None:
            return True, "No verification available for this input"
        if pi != expected:
            return False, f"Invalid: Expected {expected}, got {pi}"
        return True, "Valid: Matches expected array"

def parse_fasta_file(file_path: str) -> str:
    """
    Read the DNA sequence from a FASTA file.

    Every line after the first '>' header that is not itself a header is
    appended to the sequence; lines before the first header are ignored.

    Args:
        file_path (str): Path to input file

    Returns:
        str: Concatenated DNA sequence

    Raises:
        FileNotFoundError: If the file does not exist
        ValueError: If no sequence characters were found
    """
    try:
        with open(file_path, 'r') as handle:
            rows = [raw.strip() for raw in handle]

        pieces = []
        seen_header = False
        for row in rows:
            if row.startswith('>'):
                seen_header = True
            elif seen_header:
                pieces.append(row)

        sequence = ''.join(pieces)
        if sequence == '':
            raise ValueError("No sequence found in FASTA file")
        return sequence
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")

def write_output_file(output_path: str, pi: List[int]) -> None:
    """
    Save the failure array as one line of space-separated integers.

    Args:
        output_path (str): Path to output file
        pi (List[int]): Failure array

    Raises:
        IOError: If anything goes wrong while writing
    """
    try:
        with open(output_path, 'w') as sink:
            rendered = ' '.join(str(value) for value in pi)
            sink.write(rendered + '\n')
    except Exception as exc:
        raise IOError(f"Error writing to output file: {exc}")

def solve_failure_array_problem(input_file_path: str) -> List[int]:
    """
    Solve the failure array problem for the given input file.

    Args:
        input_file_path (str): Path to FASTA input file

    Returns:
        List[int]: Computed failure array

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, computation or verification fails
    """
    try:
        s = parse_fasta_file(input_file_path)
        print(f"Parsed sequence: length={len(s)}")

        computer = FailureArrayComputer()
        pi = computer.compute_failure_array(s)

        is_valid, msg = computer.verify_failure_array(s, pi)
        if not is_valid:
            raise ValueError(msg)
        print(f"Verification: {msg}")

        return pi
    except FileNotFoundError:
        # Let "file not found" through unchanged so that callers with a
        # dedicated FileNotFoundError handler (such as main) can react to it.
        raise
    except Exception as e:
        # Keep the original exception attached as the cause for debugging.
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Entry point for the failure array solver.

    Solves the configured dataset, prints a short preview of the result
    and stores the full array in the output file. Errors are reported on
    stdout instead of being propagated.
    """
    # Configuration
    source_name = "rosalind_kmp.txt"  # Change this to your input file name
    target_name = "output_kmp.txt"

    try:
        print("Solving Failure Array Problem...")

        failure_array = solve_failure_array_problem(source_name)

        # Only the first ten entries are echoed; the file gets everything.
        preview = ' '.join(map(str, failure_array[:10]))
        print(f"\nResult (first 10 values): {preview} ...")

        write_output_file(target_name, failure_array)
        print(f"Result written to: {target_name}")

    except FileNotFoundError:
        print(f"Error: Input file '{source_name}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as problem:
        print(f"Error: {problem}")

    except Exception as problem:
        print(f"Unexpected error: {problem}")

def test_failure_array():
    """Run the failure array computer on a sample string and a few edge cases."""
    print("=== Testing Failure Array Computer ===")

    # A longer sample, a repetitive string, a string without borders,
    # and the empty string.
    samples = ("CAGCATGGTATCACAGCAGAG", "AAA", "ABC", "")

    computer = FailureArrayComputer()

    number = 0
    for s in samples:
        number += 1
        print(f"\nTest {number}: s={s}")

        pi = computer.compute_failure_array(s)
        print(f"Failure array: {pi}")

        is_valid, msg = computer.verify_failure_array(s, pi)
        mark = '✓' if is_valid else '✗'
        print(f"Valid: {mark} - {msg}")

if __name__ == "__main__":
    # Exercise the solver on the built-in sample strings first
    test_failure_array()

    # Visual separator between the self-test output and the real run
    print("\n" + "="*60)

    # Solve the dataset configured inside main()
    print("Running main function...")
    main()


## Rosalind 4-mer Composition Solution

In [None]:
"""
Rosalind 4-mer Composition Solution

This module computes the 4-mer composition of a DNA string, counting occurrences
of all possible 4-mers in lexicographic order.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from itertools import product
from typing import Dict, List, Tuple

class KmerComposer:
    """
    Compute the k-mer composition of a DNA string.

    The composition is the list of overlapping k-mer counts, ordered
    lexicographically by k-mer over the alphabet A, C, G, T.
    """

    def __init__(self, k: int = 4):
        """
        Initialize with k-mer length.

        Args:
            k (int): Length of the k-mers to count; must be at least 1.

        Raises:
            ValueError: If k is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self.k = k
        self.alphabet = ['A', 'C', 'G', 'T']

    def generate_all_kmers(self) -> List[str]:
        """
        Generate all possible k-mers in lexicographic order.

        Returns:
            List[str]: Sorted list of all k-mers.
        """
        kmers = [''.join(p) for p in product(self.alphabet, repeat=self.k)]
        # Sorting keeps the order lexicographic even if self.alphabet has
        # been replaced with an unsorted list.
        return sorted(kmers)

    def _count_windows(self, dna: str, all_kmers: List[str]) -> Dict[str, int]:
        """Count every length-k window of dna that is one of all_kmers."""
        counts = dict.fromkeys(all_kmers, 0)
        k = self.k
        # The range is empty when dna is shorter than k, leaving all zeros.
        for i in range(len(dna) - k + 1):
            window = dna[i:i + k]
            # Windows containing symbols outside the alphabet are ignored.
            if window in counts:
                counts[window] += 1
        return counts

    def count_kmers(self, dna: str) -> Dict[str, int]:
        """
        Count occurrences of each k-mer in the DNA string (overlapping).

        Args:
            dna (str): DNA sequence.

        Returns:
            Dict[str, int]: Counts for each k-mer, or an empty dict when
            the sequence is shorter than k.
        """
        if len(dna) < self.k:
            return {}
        return self._count_windows(dna, self.generate_all_kmers())

    def get_composition(self, dna: str) -> List[int]:
        """
        Get the composition as a list of counts in lex order.

        The k-mer list is generated once and shared between counting and
        ordering, rather than being rebuilt for each step.

        Args:
            dna (str): DNA sequence.

        Returns:
            List[int]: One count per possible k-mer, in lexicographic
            order (all zeros when the sequence is shorter than k).
        """
        all_kmers = self.generate_all_kmers()
        counts = self._count_windows(dna, all_kmers)
        return [counts[kmer] for kmer in all_kmers]

    def verify_composition(self, dna: str, composition: List[int]) -> Tuple[bool, str]:
        """
        Sanity-check a composition by comparing its total with the number
        of length-k windows in the sequence.

        Windows containing symbols outside the alphabet are never counted,
        so a sequence with such symbols is reported as invalid.

        Args:
            dna (str): DNA sequence
            composition (List[int]): Computed composition

        Returns:
            Tuple[bool, str]: (is_valid, message)
        """
        expected_count = max(0, len(dna) - self.k + 1)
        total = sum(composition)
        if total == expected_count:
            return True, f"Valid: Total counts match {expected_count}"
        return False, f"Invalid: Total {total}, expected {expected_count}"

def parse_fasta_file(file_path: str) -> str:
    """
    Load the DNA sequence stored in a FASTA file.

    Lines that begin with '>' are treated as headers and dropped; all
    other lines are stripped and concatenated in file order.

    Args:
        file_path (str): Path to input file

    Returns:
        str: Concatenated DNA sequence

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If the sequence contains anything other than A, C, G, T
    """
    try:
        with open(file_path, 'r') as handle:
            raw_lines = handle.readlines()

        dna = ''.join(
            raw.strip() for raw in raw_lines if not raw.startswith('>')
        )

        if any(symbol not in 'ACGT' for symbol in dna):
            raise ValueError("Invalid DNA sequence: only A, C, G, T allowed")
        return dna
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")

def write_output_file(output_path: str, composition: List[int]) -> None:
    """
    Store the composition as a single line of space-separated integers.

    Args:
        output_path (str): Path to output file
        composition (List[int]): List of counts

    Raises:
        IOError: If the file cannot be opened or written
    """
    try:
        with open(output_path, 'w') as sink:
            fields = [str(count) for count in composition]
            sink.write(' '.join(fields) + '\n')
    except Exception as err:
        raise IOError(f"Error writing to output file: {err}")

def solve_kmer_composition(input_file_path: str) -> List[int]:
    """
    Solve the k-mer composition problem.

    Parses the DNA string from the FASTA file, computes its 4-mer
    composition and checks that the counts add up to the number of
    4-mer windows in the sequence.

    Args:
        input_file_path (str): Input FASTA file path

    Returns:
        List[int]: 4-mer composition

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, computation or verification fails
    """
    try:
        dna = parse_fasta_file(input_file_path)
        print(f"Parsed DNA: length={len(dna)}")

        composer = KmerComposer(k=4)
        composition = composer.get_composition(dna)

        is_valid, msg = composer.verify_composition(dna, composition)
        if not is_valid:
            raise ValueError(msg)
        print(f"Verification: {msg}")

        return composition
    except FileNotFoundError:
        # Keep the original exception type: wrapping it in ValueError would
        # prevent callers from telling "file missing" apart from a
        # computation failure.
        raise
    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Entry point for the 4-mer composition solver.

    Solves the configured dataset, echoes the first ten counts and writes
    the full composition to the output file. Errors are reported on stdout
    instead of being propagated.
    """
    # Configuration
    source_name = "rosalind_kmer.txt"  # Change to your input file name
    target_name = "output_kmer.txt"

    try:
        print("Solving 4-mer Composition Problem...")

        counts = solve_kmer_composition(source_name)

        # Only a short preview is printed; the file receives all counts.
        preview = ' '.join(map(str, counts[:10]))
        print(f"\nPartial Result (first 10 counts): {preview}")

        write_output_file(target_name, counts)
        print(f"Full result written to: {target_name}")

    except FileNotFoundError:
        print(f"Error: Input file '{source_name}' not found.")
    except ValueError as problem:
        print(f"Error: {problem}")
    except Exception as problem:
        print(f"Unexpected error: {problem}")

def test_kmer_composition():
    """
    Test the k-mer composer with small cases.

    Besides the generic total-count verification, each case's declared
    expectation ("expected_total" or "expected_count_AAAA") is compared
    with the computed composition and the outcome is printed.
    """
    print("=== Testing K-mer Composer ===")

    test_cases = [
        {"dna": "CTGA", "k": 2, "expected_total": 3},  # CT, TG, GA
        {"dna": "AAAA", "k": 4, "expected_count_AAAA": 1},
    ]

    for i, case in enumerate(test_cases, 1):
        composer = KmerComposer(k=case["k"])
        composition = composer.get_composition(case["dna"])
        total = sum(composition)
        print(f"\nTest {i}: DNA={case['dna']}, k={case['k']}")
        print(f"Total k-mers counted: {total}")
        is_valid, msg = composer.verify_composition(case["dna"], composition)
        print(f"Valid: {'✓' if is_valid else '✗'} - {msg}")

        if "expected_total" in case:
            exp = case["expected_total"]
            print(f"Matches expected total {exp}: {'✓' if total == exp else '✗'}")

        if "expected_count_AAAA" in case:
            exp = case["expected_count_AAAA"]
            # Look the k-mer up by name so the check does not depend on
            # where it happens to sit in the composition.
            position = composer.generate_all_kmers().index("AAAA")
            actual = composition[position]
            print(f"Matches expected AAAA count {exp}: {'✓' if actual == exp else '✗'}")

if __name__ == "__main__":
    # Run the small built-in cases first
    test_kmer_composition()

    # Visual separator between the self-test output and the real run
    print("\n" + "="*60)

    # Solve the dataset configured inside main()
    print("Running main function...")
    main()


## Counting Valid RNA Secondary Structures with Wobble Pairs

In [None]:
"""
Rosalind Counting Valid RNA Secondary Structures with Wobble Pairs

This module uses iterative dynamic programming to count valid noncrossing matchings
in RNA bonding graphs, allowing wobble pairs and minimum distance constraints.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import List, Tuple

class RNAMatchingCounter:
    """
    Counts noncrossing matchings of an RNA string in which A-U, C-G and the
    wobble pair G-U may bond, provided the two bases are at least four
    positions apart.
    """

    def __init__(self):
        """Build the set of ordered base pairs that are allowed to bond."""
        self.base_pairs = set()
        for first, second in (('A', 'U'), ('C', 'G'), ('G', 'U')):
            # Store both orientations so lookups do not depend on order.
            self.base_pairs.add((first, second))
            self.base_pairs.add((second, first))

    def can_pair(self, base1: str, base2: str) -> bool:
        """Return True when base1 and base2 form an allowed pair."""
        return (base1, base2) in self.base_pairs

    def count_valid_matchings(self, rna: str) -> int:
        """
        Count valid matchings with an interval DP.

        table[lo][hi] holds the number of valid matchings of rna[lo..hi]
        (inclusive). The last base is either left unpaired or bonded to a
        partner at least four positions to its left, which splits the
        interval into an independent left part and an enclosed part.
        """
        size = len(rna)
        if size == 0:
            return 1

        table: List[List[int]] = [[0] * size for _ in range(size)]

        # A single base admits only the empty matching.
        for pos in range(size):
            table[pos][pos] = 1

        for span in range(1, size):
            for lo in range(size - span):
                hi = lo + span
                # Start with the matchings in which hi stays unpaired.
                ways = table[lo][hi - 1]
                for partner in range(lo, hi):
                    if not self.can_pair(rna[partner], rna[hi]) or hi - partner < 4:
                        continue
                    if partner > lo:
                        outside = table[lo][partner - 1]
                    else:
                        outside = 1
                    if partner + 1 <= hi - 1:
                        enclosed = table[partner + 1][hi - 1]
                    else:
                        enclosed = 1
                    ways += outside * enclosed
                table[lo][hi] = ways

        return table[0][size - 1]

    def verify_count(self, rna: str, count: int) -> Tuple[bool, str]:
        """Compare count with a stored answer when rna is a known case."""
        known = {
            "CGAUGCUAG": 12,
            "AU": 1,
            "AUGC": 1,
            "AUGCAU": 2,
            "AUGCUAGUACGGAGCGAGUCUAGCGAGCGAUGUCGUGAGUACUAUAUAUGCGCAUAAGCCACGU": 284850219977421
        }
        if rna not in known:
            return True, "No verification available"
        expected = known[rna]
        if count != expected:
            return False, f"Invalid: Expected {expected}, got {count}"
        return True, "Valid: Matches expected"

def parse_input_file(file_path: str) -> str:
    """
    Read an RNA string from a plain-text file.

    All lines are stripped and concatenated; the result must consist only
    of the characters A, U, G and C, otherwise ValueError is raised.
    """
    with open(file_path, 'r') as handle:
        pieces = [row.strip() for row in handle]
    rna = ''.join(pieces)

    for symbol in rna:
        if symbol not in 'AUGC':
            raise ValueError("Invalid RNA sequence")
    return rna

def write_output_file(output_path: str, result: int) -> None:
    """Store the result in output_path as a single line of text."""
    text = str(result)
    with open(output_path, 'w') as sink:
        sink.writelines([text, '\n'])

def solve_problem(input_file: str, output_file: str) -> None:
    """
    Count the valid matchings of the RNA string in input_file and write
    the count to output_file.

    Args:
        input_file (str): Path of the file holding the RNA string
        output_file (str): Path of the file that receives the count

    Raises:
        ValueError: If the RNA string is invalid or the count contradicts
            a known reference value
    """
    # Read the file the caller asked for rather than a fixed path.
    rna = parse_input_file(input_file)
    counter = RNAMatchingCounter()
    result = counter.count_valid_matchings(rna)
    is_valid, msg = counter.verify_count(rna, result)
    if not is_valid:
        raise ValueError(msg)
    write_output_file(output_file, result)
    print(f"Result: {result} (Verification: {msg})")

# Usage example (adjust file names as needed)
if __name__ == "__main__":
    # Dataset location after uploading it to the Colab workspace.
    input_file = "/content/rosalind_rnas.txt"
    output_file = "rosalind_output.txt"
    solve_problem(input_file, output_file)


## Rosalind Distances in Trees Problem

In [None]:
import re

class Node:
    """Minimal tree node holding a name, a parent link and a child list."""

    def __init__(self, name=None, parent=None):
        # Each instance gets its own list so children are never shared.
        self.children = []
        self.name, self.parent = name, parent

    def __repr__(self):
        return "Node({})".format(self.name)

def parse_newick(newick_string):
    """
    Build a tree of Node objects from a Newick string.

    Returns a dictionary that maps node names to Node objects. A node that
    has children receives a placeholder name "internal_<n>" when its
    opening parenthesis is read; if a label follows its closing
    parenthesis the node is renamed, and the placeholder key stays in the
    dictionary alongside the real name. Unnamed leaves are not registered.
    """
    # Tokenize: structural characters become their own tokens, everything
    # between them is a label. Blank fragments are discarded.
    fragments = (part.strip() for part in re.split(r'([,();])', newick_string.strip()))
    tokens = [part for part in fragments if part]

    root = Node()
    nodes_map = {}
    cursor = root

    # Used to give every parent node a unique provisional name
    placeholder_count = 0

    for token in tokens:
        if token == ';':
            # The tree definition is complete
            break

        if token == '(':
            # The node under the cursor is about to receive children, so
            # make sure it is named and registered before descending.
            if cursor.name is None:
                cursor.name = f"internal_{placeholder_count}"
                placeholder_count += 1
            nodes_map.setdefault(cursor.name, cursor)

            first_child = Node(parent=cursor)
            cursor.children.append(first_child)
            cursor = first_child

        elif token == ',':
            # Attach the next sibling to the cursor's parent and move to it
            parent = cursor.parent
            sibling = Node(parent=parent)
            parent.children.append(sibling)
            cursor = sibling

        elif token == ')':
            # All children have been read; climb back to their parent
            cursor = cursor.parent

        else:
            # Any other token is the label of the node under the cursor
            cursor.name = token
            nodes_map[token] = cursor

    # Make sure a named root is always registered
    if root.name and root.name not in nodes_map:
        nodes_map[root.name] = root

    return nodes_map

def _tree_distance(node1, node2):
    """Return the number of edges on the path between two nodes of one tree."""
    # Number of upward steps from node1 to each of its ancestors (itself included)
    steps_above_node1 = {}
    curr, steps = node1, 0
    while curr is not None:
        steps_above_node1[curr] = steps
        curr = curr.parent
        steps += 1

    # Climb from node2 until the first shared ancestor (the LCA) is reached;
    # the distance is the sum of the steps taken on both sides.
    curr, steps = node2, 0
    while curr is not None:
        if curr in steps_above_node1:
            return steps_above_node1[curr] + steps
        curr = curr.parent
        steps += 1

    raise ValueError("Nodes do not share a common ancestor")


def solve_paths_in_trees_no_bio(input_path='rosalind_nwck.txt',
                                output_path='output_nwck.txt'):
    """
    Reads trees, calculates distances without Biopython, and writes results.

    The input consists of data sets separated by blank lines. In each data
    set the last line names two nodes and the preceding line(s) hold the
    Newick tree (a tree wrapped over several lines is joined back
    together). The distances are written to the output file as one
    space-separated line.

    Args:
        input_path (str): File containing the data sets
        output_path (str): File that receives the distances

    Errors are reported on stdout instead of being raised.
    """
    try:
        with open(input_path, 'r') as f:
            raw_lines = f.read().splitlines()

        # Group consecutive non-blank lines; whitespace-only lines count as
        # separators too.
        data_sets = []
        current = []
        for raw in raw_lines:
            line = raw.strip()
            if line:
                current.append(line)
            elif current:
                data_sets.append(current)
                current = []
        if current:
            data_sets.append(current)

        distances = []

        for data_set in data_sets:
            if len(data_set) < 2:
                raise ValueError(f"Incomplete data set: {data_set[0]!r}")

            newick_string = ''.join(data_set[:-1])
            node1_name, node2_name = data_set[-1].split()

            # Step 1: Parse the Newick string into a tree
            nodes_map = parse_newick(newick_string)

            missing = [name for name in (node1_name, node2_name)
                       if name not in nodes_map]
            if missing:
                raise ValueError(f"Node(s) not found in tree: {', '.join(missing)}")

            # Step 2: Distance = edges from each node up to their lowest
            # common ancestor
            total_distance = _tree_distance(nodes_map[node1_name],
                                            nodes_map[node2_name])
            distances.append(str(total_distance))

        with open(output_path, 'w') as f:
            f.write(' '.join(distances))

        print(f"Processing complete. Results are in '{output_path}'.")

    except FileNotFoundError:
        print(f"Error: '{input_path}' not found.")
        print("Please create the file and add your input data.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the solver as soon as this cell/script is executed
solve_paths_in_trees_no_bio()



## Motzkin Numbers and RNA Secondary Structures

In [None]:
"""
Rosalind Motzkin Numbers and RNA Secondary Structures Solution

This module implements a dynamic programming approach based on Motzkin numbers
to count noncrossing matchings in RNA secondary structures. It handles large inputs
efficiently and includes verification for correctness.

Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""

from typing import Dict, Tuple

class MotzkinRNA:
    """
    Counts the noncrossing matchings (of any size) in the bonding graph of
    an RNA string with a Motzkin-style interval recurrence.
    """

    def __init__(self):
        """Set up the Watson-Crick pairing table and the result modulus."""
        # Yields (A,U), (U,A), (G,C), (C,G)
        self.base_pairs = set(zip("AUGC", "UACG"))
        self.mod = 1000000

    def can_pair(self, base1: str, base2: str) -> bool:
        """
        Tell whether two RNA bases may bond.

        Args:
            base1 (str): First RNA base
            base2 (str): Second RNA base

        Returns:
            bool: True if bases can pair, False otherwise
        """
        return (base1, base2) in self.base_pairs

    def count_noncrossing_matchings(self, rna: str) -> int:
        """
        Count the noncrossing matchings of the RNA bonding graph.

        table[lo][hi] is the number of matchings of rna[lo..hi]
        (inclusive). The last base is either unpaired or bonded to an
        earlier base, which splits the interval into the part before the
        partner and the part enclosed by the bond.

        Args:
            rna (str): RNA sequence

        Returns:
            int: Number of noncrossing matchings modulo 1,000,000
        """
        size = len(rna)
        if size == 0:
            return 1

        table = [[0] * size for _ in range(size)]

        # A single base admits only the empty matching.
        for pos in range(size):
            table[pos][pos] = 1

        for span in range(1, size):
            for lo in range(size - span):
                hi = lo + span
                # Matchings in which the last base stays unpaired
                ways = table[lo][hi - 1]

                for partner in range(lo, hi):
                    if not self.can_pair(rna[partner], rna[hi]):
                        continue
                    # An empty side contributes exactly one (empty) matching.
                    before = table[lo][partner - 1] if partner > lo else 1
                    enclosed = table[partner + 1][hi - 1] if partner + 1 <= hi - 1 else 1
                    ways = (ways + before * enclosed) % self.mod

                table[lo][hi] = ways

        return table[0][size - 1]

    def verify_count(self, rna: str, count: int) -> Tuple[bool, str]:
        """
        Check a computed count against the stored answers for a few tiny
        inputs.

        Args:
            rna (str): RNA sequence
            count (int): Computed number of matchings

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
        """
        known_cases = {
            "AUAU": 7,
            "AU": 2,
            "A": 1,
            "": 1
        }
        if rna not in known_cases:
            return True, "No verification for this input (unknown case)"

        expected = known_cases[rna]
        if count != expected:
            return False, f"Invalid: Expected {expected} for {rna}, got {count}"
        return True, f"Valid: Matches expected count {expected} for {rna}"

def parse_input_file(file_path: str) -> str:
    """
    Load an RNA string from a text or single-record FASTA file.

    Blank lines are dropped. If the first remaining line starts with '>'
    it is treated as a header and skipped; everything else is joined into
    one string.

    Args:
        file_path (str): Path to input file

    Returns:
        str: RNA sequence

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If the file is empty, contains symbols other than
            A, U, G, C, or cannot be read for any other reason
    """
    try:
        with open(file_path, 'r') as handle:
            stripped = (raw.strip() for raw in handle)
            rows = [row for row in stripped if row]

        if len(rows) == 0:
            raise ValueError("Input file is empty")

        # Drop the header line when there is one
        body = rows[1:] if rows[0].startswith('>') else rows
        rna = ''.join(body)

        if any(symbol not in 'AUGC' for symbol in rna):
            raise ValueError("Invalid RNA sequence: must contain only A, U, G, C")
        return rna

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")
    except Exception as err:
        raise ValueError(f"Input parsing error: {err}")

def write_output_file(output_path: str, result: int) -> None:
    """
    Store the matching count in the output file, followed by a newline.

    Args:
        output_path (str): Path to output file
        result (int): Number of noncrossing matchings

    Raises:
        IOError: If the file cannot be opened or written
    """
    try:
        with open(output_path, 'w') as sink:
            line = str(result) + '\n'
            sink.write(line)
    except Exception as err:
        raise IOError(f"Error writing to output file: {err}")

def solve_motzkin_rna_problem(input_file_path: str) -> int:
    """
    Solve the Motzkin RNA problem for a given input file.

    Args:
        input_file_path (str): Path to input file

    Returns:
        int: Number of noncrossing matchings modulo 1,000,000

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, computation or verification fails
    """
    try:
        # Parse input
        rna = parse_input_file(input_file_path)
        print(f"Parsed input: RNA sequence = {rna[:10]}... (length={len(rna)})")

        # Initialize Motzkin counter
        counter = MotzkinRNA()

        # Compute the number of noncrossing matchings
        result = counter.count_noncrossing_matchings(rna)

        # Verify for small cases
        is_valid, msg = counter.verify_count(rna, result)
        if not is_valid:
            raise ValueError(f"Invalid result: {msg}")
        print(f"Verification: {msg}")

        return result

    except FileNotFoundError:
        # Keep the original exception type: wrapping it in ValueError would
        # prevent callers from telling "file missing" apart from a
        # computation failure.
        raise
    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}") from e

def main():
    """
    Entry point for the Motzkin RNA solver.

    Solves the configured dataset, prints the count and stores it in the
    output file. Errors are reported on stdout instead of being propagated.
    """
    # Configuration
    source_name = "rosalind_motz.txt"  # Change this to your input file name
    target_name = "output_motz.txt"

    try:
        print("Solving Motzkin RNA Secondary Structures Problem...")

        matchings = solve_motzkin_rna_problem(source_name)

        print(f"\nResult:")
        print(f"Number of noncrossing matchings (mod 1,000,000): {matchings}")

        write_output_file(target_name, matchings)
        print(f"Result written to: {target_name}")

    except FileNotFoundError:
        print(f"Error: Input file '{source_name}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as problem:
        print(f"Error: {problem}")

    except Exception as problem:
        print(f"Unexpected error: {problem}")

def test_motzkin_rna():
    """
    Test the Motzkin RNA counter with sample and edge cases.

    For every case the computed count is printed, checked with the
    counter's own verifier and compared with the expected value.
    """
    print("=== Testing Motzkin RNA Counter ===")

    test_cases = [
        {"rna": "AUAU", "expected": 7},
        {"rna": "AU", "expected": 2},
        {"rna": "A", "expected": 1},
        {"rna": "GC", "expected": 2},
        {"rna": "AA", "expected": 1},  # Can't pair
        # 1 empty + 9 single-pair + 15 two-pair + 5 perfect matchings = 30
        {"rna": "AUAUAU", "expected": 30},
    ]

    counter = MotzkinRNA()

    for i, case in enumerate(test_cases, 1):
        rna = case["rna"]
        expected = case.get("expected", None)
        print(f"\nTest {i}: RNA={rna}")

        result = counter.count_noncrossing_matchings(rna)
        print(f"Result: {result}")

        is_valid, msg = counter.verify_count(rna, result)
        print(f"Valid: {'✓' if is_valid else '✗'} - {msg}")

        if expected is not None and result == expected:
            print(f"Matches expected {expected}: ✓")
        elif expected is not None:
            print(f"Matches expected {expected}: ✗")

if __name__ == "__main__":
    # Run the built-in sample and edge cases first
    test_motzkin_rna()

    # Visual separator between the self-test output and the real run
    print("\n" + "="*60)

    # Solve the dataset configured inside main()
    print("Running main function...")
    main()


## Majority Element Problem

In [None]:
"""
Rosalind Majority Element Problem Solution


This module implements a divide-and-conquer algorithm to find the majority element
in multiple arrays. It handles large inputs efficiently and includes verification for correctness.


Author: Bioinformatics Solution
Compatible with: Python 3.6+
Platform: Google Colab
"""


from typing import List, Tuple



class MajorityFinder:
    """
    A class to find the majority element in an array using divide-and-conquer.

    The value -1 is used as the "no majority" marker, so the result is
    ambiguous for arrays whose majority element is itself -1.
    """

    def __init__(self):
        """Initialize the majority finder."""
        pass

    def find_majority(self, arr: List[int]) -> int:
        """
        Find the majority element in the array using divide-and-conquer.

        A majority element appears more than n/2 times. Each half is
        solved recursively; only the two half-winners can be the majority
        of the combined range, so only they are counted.

        Args:
            arr (List[int]): Input array

        Returns:
            int: The majority element if it exists, -1 otherwise
        """
        if not arr:
            return -1

        def majority_rec(start: int, end: int) -> int:
            """Return the majority of arr[start..end] (inclusive) or -1."""
            if start == end:
                return arr[start]

            mid = (start + end) // 2
            left_major = majority_rec(start, mid)
            right_major = majority_rec(mid + 1, end)

            if left_major == right_major:
                return left_major

            # Check the left winner first, then the right one.
            half = (end - start + 1) // 2
            for candidate in (left_major, right_major):
                occurrences = sum(
                    1 for i in range(start, end + 1) if arr[i] == candidate
                )
                if occurrences > half:
                    return candidate

            return -1  # No majority in this range

        candidate = majority_rec(0, len(arr) - 1)
        if candidate == -1:
            return -1

        # Final check over the whole array
        count = sum(1 for x in arr if x == candidate)
        return candidate if count > len(arr) // 2 else -1

    def verify_majority(self, arr: List[int], result: int) -> Tuple[bool, str]:
        """
        Verify that the majority element is correct.

        Args:
            arr (List[int]): Input array
            result (int): Reported majority element or -1

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
        """
        if not arr:
            return result == -1, "Valid for empty array" if result == -1 else "Invalid for empty array"

        from collections import Counter
        counts = Counter(arr)
        threshold = len(arr) // 2

        if result == -1:
            # "No majority" is right only if nothing exceeds the threshold
            if all(count <= threshold for count in counts.values()):
                return True, "Valid: No majority element"
            max_elem = max(counts, key=counts.get)
            return False, f"Invalid: {max_elem} appears {counts[max_elem]} > {threshold} times"

        if result not in counts:
            return False, f"Result {result} not in array"
        if counts[result] <= threshold:
            return False, f"Result {result} appears {counts[result]} <= {threshold} times"

        # Every failing case has returned above, so the result is a majority.
        return True, f"Valid: {result} appears {counts[result]} > {threshold} times"


def parse_input_file(file_path: str) -> Tuple[int, int, List[List[int]]]:
    """
    Read k, n and the k arrays of length n from the input file.

    Blank lines are ignored. The first remaining line must hold the two
    integers k and n; it must be followed by exactly k lines, each with n
    integers.

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If the content does not match the expected layout
    """
    try:
        with open(file_path, 'r') as handle:
            rows = [text for text in (raw.strip() for raw in handle) if text]

        if not rows:
            raise ValueError("Input file is empty")

        # Header line: number of arrays and their common length
        header = [int(field) for field in rows[0].split()]
        if len(header) != 2:
            raise ValueError("First line must contain exactly two integers: k and n")
        k, n = header

        if len(rows) != k + 1:
            raise ValueError(f"Expected {k + 1} lines, found {len(rows)}")

        # One array per remaining line, numbered from 1 in error messages
        arrays = []
        for index, row in enumerate(rows[1:], 1):
            values = [int(field) for field in row.split()]
            if len(values) != n:
                raise ValueError(f"Array {index} length {len(values)} doesn't match n={n}")
            arrays.append(values)

        return k, n, arrays

    except FileNotFoundError:
        raise FileNotFoundError(f"Input file '{file_path}' not found")
    except ValueError as err:
        raise ValueError(f"Input parsing error: {err}")



def write_output_file(output_path: str, results: List[int]) -> None:
    """
    Store the majority elements as one line of space-separated integers.

    Raises:
        IOError: If the file cannot be opened or written
    """
    try:
        with open(output_path, 'w') as sink:
            fields = [str(item) for item in results]
            sink.write(' '.join(fields) + '\n')
    except Exception as err:
        raise IOError(f"Error writing to output file: {err}")



def solve_majority_problem(input_file_path: str) -> List[int]:
    """
    Solve the majority element problem for a given input file.

    Args:
        input_file_path (str): Path to the input file

    Returns:
        List[int]: Majority element of each array, or -1 where none exists

    Raises:
        FileNotFoundError: If the input file does not exist
        ValueError: If parsing, computation or verification fails
    """
    try:
        # Parse input
        k, n, arrays = parse_input_file(input_file_path)

        print(f"Parsed input: k={k}, n={n}, {k} arrays")

        # Initialize finder
        finder = MajorityFinder()

        # Find majority for each array
        results = []
        for i, arr in enumerate(arrays, 1):
            major = finder.find_majority(arr)
            results.append(major)

            # Verify
            is_valid, msg = finder.verify_majority(arr, major)
            if not is_valid:
                raise ValueError(f"Invalid result for array {i}: {msg}")

            print(f"Array {i}: majority {major} - {msg}")

        return results

    except FileNotFoundError:
        # Keep the original exception type: wrapping it in ValueError would
        # prevent callers from telling "file missing" apart from a
        # computation failure.
        raise
    except Exception as e:
        raise ValueError(f"Error solving problem: {str(e)}") from e



def main():
    """
    Entry point for the majority element solver.

    Solves the configured dataset, prints the majority elements and stores
    them in the output file. Errors are reported on stdout instead of
    being propagated.
    """
    # Configuration
    source_name = "rosalind_maj.txt"  # Change this to your input file name
    target_name = "output_maj.txt"

    try:
        print("Solving Majority Element Problem...")

        majorities = solve_majority_problem(source_name)

        print(f"\nResult:")
        rendered = ' '.join(map(str, majorities))
        print(f"Majority elements: {rendered}")

        write_output_file(target_name, majorities)
        print(f"Result written to: {target_name}")

    except FileNotFoundError:
        print(f"Error: Input file '{source_name}' not found.")
        print("Please make sure the file exists in the current directory.")

    except ValueError as problem:
        print(f"Error: {problem}")

    except Exception as problem:
        print(f"Unexpected error: {problem}")



def test_majority_finder():
    """Run the majority finder on the sample arrays and a few edge cases."""
    print("=== Testing Majority Finder ===")

    # Sample dataset arrays (n = 8, so a majority needs at least 5 hits)
    sample_arrays = [
        [5, 5, 5, 5, 5, 5, 5, 5],                # 5 fills the array
        [8, 7, 7, 7, 1, 7, 3, 7],                # 7 appears 5 times
        [7, 1, 6, 5, 10, 100, 1000, 1],          # no majority
        [5, 1, 6, 7, 1, 1, 10, 1],               # 1 appears only 4 times
    ]
    # Additional edge cases
    extra_arrays = [
        [],                                      # empty
        [1],                                     # single element
        [1, 2],                                  # no majority
        [1, 1, 2],                               # 1 appears 2 of 3 times
        [1, 2, 2, 3],                            # 2 appears exactly n/2 times
        [2, 2, 2, 2],                            # 2 fills the array
    ]
    test_cases = sample_arrays + extra_arrays

    expected = [5, 7, -1, -1, -1, 1, -1, 1, -1, 2]

    finder = MajorityFinder()

    for number, arr in enumerate(test_cases, start=1):
        print(f"\nTest {number}: {arr}")

        result = finder.find_majority(arr)
        print(f"Result: {result}")

        is_valid, msg = finder.verify_majority(arr, result)
        print(f"Valid: {'✓' if is_valid else '✗'} - {msg}")

        # Cases without a stored expectation are only verified above.
        if number > len(expected):
            continue
        exp = expected[number - 1]
        verdict = '✓' if result == exp else '✗'
        print(f"Matches expected {exp}: {verdict}")


if __name__ == "__main__":
    # Run the built-in sample and edge cases first
    test_majority_finder()


    # Visual separator between the self-test output and the real run
    print("\n" + "="*60)


    # Solve the dataset configured inside main()
    print("Running main function...")
    main()
