In [None]:
"""
    This function reads the DNA sequence from a text file.

    Parameters:
        text (str): Path to the text file containing the DNA sequence.

    Returns:
        str: DNA sequence in uppercase with whitespace and newlines removed.
             Returns an empty string if the file is not found.
"""
def read_dna_sequence(text):
    try:
        with open(text, "r") as file:
            sequence = file.read()
        dna_seq = sequence.replace("\n", "").replace(" ", "").upper()
        print(f"DNA Sequence: {dna_seq}")
        return dna_seq
    except FileNotFoundError:
        print("File not found. Please make sure that the file exists in the current directory.")
        return ""

"""
    This function validates that the sequence contains only valid DNA bases(A,T,G,C).

    Parameters:
        dna_seq (str): DNA sequence to validate that was read from the text file.

    Returns:
        boolean: False if the sequence contains any nucleotide except A, C, G, T. False otherwise.
"""

def is_dna_sequence(dna_seq):
    base = "ACGT"
    for nucleotide in dna_seq:
        if nucleotide not in base:
            return False
    return True

"""
    This function calculates the number of mismatched nucleotides between two motifs.

    Parameters:
        seq1 (str): common motif.
        seq2 (str): Second motif extracted from DNA sequence.
    Returns:
        int: Number of positions where nucleotides differ.
"""
def calculate_mismatches(seq1, seq2):
    #return error if the lengths of the two motifs are not equal
    if len(seq1) != len(seq2):
        print("Strings must be the same length.")

    #set total mismatches to 0 and loop through the sequences to count mismatches
    total = 0
    #zip function is used to pair up the nucleotides from both sequencces
    for a, b in zip(seq1, seq2):
        #if the nucleotides are different, increment the mismatch count
        if a != b:
            total += 1
    return total

"""
    This function finds all exact positions of the common promoter motif in a DNA sequence.

    Parameters:
        dna_seq (str): DNA sequence to search.
        motif (str): Motif sequence to locate.

    Returns:
        list: Positions of all exact motif matches.
"""
def exact_motif_positions(dna_seq, motif):
    positions = []
    start = 0
    while True:
        # start shows the index of the first character of the motif in the dna sequence
        start = dna_seq.find(motif, start)
        # if the position is not found then exit the loop
        if start == -1:
            break
        # if found then append the position to the list
        positions.append(start + 1)  # +1 for 1-based indexing
        start += 1  # Move to the next character for overlapping matches
    return positions

"""
    This function finds positions of motif matches with mismatches allowed.

    Parameters:
        dna_seq (str): DNA sequence to search.
        motif (str): Motif sequence to locate.
        max_mismatches (int): Maximum number of mismatches allowed.

    Returns:
        list: Tuples of (position, mismatch_count) for motif matches
              that differ by up to max_mismatches nucleotides.
"""
def mismatch_motif_positions(dna_seq, motif, max_mismatches=3):
    mismatch_positions = []
    #check for parts in the dna sequence that have the matching first 2 nucleotides with the motif
    for i in range(len(dna_seq) - len(motif) + 1):
        #get that part of the dna sequence
        segment = dna_seq[i:i+len(motif)]
        #if the first 2 nucleotides match, check for mismatches
        if segment[:2] == motif[:2]:
            #count mismatches
            mismatches = calculate_mismatches(segment, motif)
            #if there are mismatches within the allowed limit, store the position and mismatch count
            if 0 < mismatches <= max_mismatches:
                mismatch_positions.append((i + 1, mismatches))
    return mismatch_positions
    
"""
    This function highlights detected motifs in the DNA sequence with brackets.

    Parameters:
        dna_seq (str): Original DNA sequence.
        motif (str): Motif sequence to highlight.
        positions (list): Positions (1-based indexing) where motifs occur.

    Returns:
        list: Modified DNA sequences with motifs enclosed in brackets.
"""
def highlighted_sequence(dna_seq, motif, positions):
    GREEN = "\033[92m"
    RESET = "\033[0m"
    highlighted_seqs = []
    for pos in positions:
        highlighted_seq = (dna_seq[:pos-1] + GREEN + motif + RESET + dna_seq[pos-1+len(motif):])
        highlighted_seqs.append(highlighted_seq)
    return highlighted_seqs

"""
    Search for exact and mismatch occurrences of motifs in a DNA sequence.

    Parameters:
        dna_seq (str): DNA sequence to search.
        motifs (list): List of motifs to check.

    Output:
        Prints positions of exact matches, mismatch matches, and highlights
        sequences with detected motifs for each motif in the list.
"""
def motif_checker(dna_seq, motifs):
    GREEN = "\033[92m"
    RESET = "\033[0m"
    for motif in motifs:
        # Find exact positions of the motif
        exact_positions = exact_motif_positions(dna_seq, motif)
        if exact_positions:
            print(f"Motif '{motif}' found at positions: {exact_positions}")
            highlighted_seqs = highlighted_sequence(dna_seq, motif, exact_positions)
            for seq in highlighted_seqs:
                print(f"Sequence with motif highlighted: {seq}")
        else:
            print(f"Motif '{motif}' not found in the sequence.")
        
        # Find potential mismatches of the motif
        mismatch_positions = mismatch_motif_positions(dna_seq, motif)
        if mismatch_positions:
            match_scores = {}
            for pos, mismatch_count in mismatch_positions:
                matches = len(motif) - mismatch_count
                score_percent = matches / len(motif)
                match_scores[pos] = round(score_percent, 2)

            print(f"Match scores for motif '{motif}' (in %): {match_scores}")
            
            for pos, mismatch_count in mismatch_positions:
                segment = dna_seq[pos-1:pos-1+len(motif)]
                highlighted_seq = (dna_seq[:pos-1] + GREEN + segment + RESET + dna_seq[pos-1+len(motif):])
                print(f"Sequence with potential mismatch highlighted in brackets: {highlighted_seq}")
        else:
            print(f"No mismatches found")

    
"""
    Main program workflow.

    Steps:
        1. Read DNA sequence from file (dna_sequence.txt).
        2. Validate that the sequence is a proper DNA sequence.
        3. Run motif search for exact and mismatch matches.
        4. Print results to the console.
"""
def main():
    motifs = ["TTGACA", "TATAAT"]
    text = "dna_sequence.txt"
    dna_sequence = read_dna_sequence(text)
    if dna_sequence and is_dna_sequence(dna_sequence):
        print("The sequence is a valid DNA sequence.")
        motif_checker(dna_sequence, motifs)
    else:
        print("The sequence is not a valid DNA sequence.")

# Run the workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

DNA Sequence: TTGACAGGGCTCGATGCCGATGATATAATAGGCGACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
The sequence is a valid DNA sequence.
[92mMotif 'TTGACA' found at positions: [1][0m
[92mSequence with motif highlighted: [92mTTGACA[0mGGGCTCGATGCCGATGATATAATAGGCGACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC[0m
[92mMatch scores for motif 'TTGACA' (in %): {52: 66.67, 66: 83.33}[0m
[92mSequence with potential mismatch highlighted in brackets: TTGACAGGGCTCGATGCCGATGATATAATAGGCGACGATGCTAGCTGAACC[TTGATG]CCTATAGCTTGACGTATAAC[0m
[92mSequence with potential mismatch highlighted in brackets: TTGACAGGGCTCGATGCCGATGATATAATAGGCGACGATGCTAGCTGAACCTTGATGCCTATAGC[TTGACG]TATAAC[0m
[92mMotif 'TATAAT' found at positions: [24][0m
[92mSequence with motif highlighted: TTGACAGGGCTCGATGCCGATGA[92mTATAAT[0mAGGCGACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC[0m
[92mMatch scores for motif 'TATAAT' (in %): {26: 50.0, 60: 66.67, 62: 50.0, 72: 83.33}[0m
[92mSequence with potential mismatch highlighted in brac