In [53]:
# Read DNA sequence from a file
def read_dna_sequence(text):
    """Read a DNA sequence from the file at path ``text``.

    Every whitespace character (newlines, spaces, tabs, ...) is removed and
    the remaining text is upper-cased, so a sequence wrapped over several
    lines or padded with tabs is returned as one contiguous string. The
    cleaned sequence is printed before it is returned.

    Returns:
        The cleaned sequence, or ``""`` when the file does not exist (a
        message is printed in that case).
    """
    # Keep the try body limited to the only operation expected to raise.
    try:
        with open(text, "r") as file:
            sequence = file.read()
    except FileNotFoundError:
        print("File not found. Please ensure the file exists in the current directory.")
        return ""

    # split() with no separator breaks on any run of whitespace, so joining
    # the pieces drops tabs and similar characters as well as "\n" and " ".
    dna_seq = "".join(sequence.split()).upper()
    print(f"DNA Sequence: {dna_seq}")
    return dna_seq

# Check if the sequence is a valid DNA sequence
def is_dna_sequence(dna_seq):
    """Return True when every character of ``dna_seq`` is A, C, G or T.

    The comparison is case-sensitive, so lower-case letters are rejected.
    An empty sequence yields True because it contains no offending character.
    """
    valid_bases = "ACGT"
    return all(symbol in valid_bases for symbol in dna_seq)

# Count the positions at which two equal-length sequences differ
def calculate_mismatches(seq1, seq2):
    """Return how many aligned positions of ``seq1`` and ``seq2`` differ.

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    # Reject inputs that cannot be compared position by position.
    if len(seq1) != len(seq2):
        raise ValueError("Strings must be the same length.")

    # zip pairs the nucleotides of both sequences; each unequal pair adds one.
    return sum(1 for left, right in zip(seq1, seq2) if left != right)

# Find the exact positions of a motif in the DNA sequence
def exact_motif_positions(dna_seq, motif):
    """Return the 1-based start positions of every occurrence of ``motif``.

    Overlapping occurrences are all reported, in ascending order. An empty
    list is returned when the motif does not occur.
    """
    # Only offsets that leave room for the whole motif can be a match.
    last_start = len(dna_seq) - len(motif)
    return [
        offset + 1  # +1 converts the 0-based offset to a 1-based position
        for offset in range(last_start + 1)
        if dna_seq.startswith(motif, offset)
    ]

def mismatch_motif_positions(dna_seq, motif, max_mismatches=2):
    """Find near-matches of ``motif`` in ``dna_seq``.

    A window of the sequence is considered only when its first two
    nucleotides equal the first two nucleotides of the motif. Among those
    windows, the ones that differ from the motif in at least one and at most
    ``max_mismatches`` positions are reported; exact matches are excluded.

    Returns:
        A list of ``(position, mismatch_count)`` tuples, where ``position``
        is the 1-based start of the window.
    """
    width = len(motif)
    anchor = motif[:2]
    hits = []
    for offset in range(len(dna_seq) - width + 1):
        window = dna_seq[offset:offset + width]
        # Skip windows whose leading two nucleotides do not match the motif.
        if window[:2] != anchor:
            continue
        distance = calculate_mismatches(window, motif)
        # Drop exact matches and windows that exceed the allowed limit.
        if distance <= 0 or distance > max_mismatches:
            continue
        hits.append((offset + 1, distance))
    return hits
    
# Put the found motif in brackets in the DNA sequence
def highlighted_sequence(dna_seq, motif, positions):
    """Return one copy of ``dna_seq`` per position with the motif bracketed.

    ``positions`` holds 1-based start positions. For each of them the
    ``len(motif)`` characters starting there are replaced by ``[motif]``.
    """
    span = len(motif)
    marked = "[" + motif + "]"
    return [
        dna_seq[:pos - 1] + marked + dna_seq[pos - 1 + span:]
        for pos in positions
    ]

# Check for motifs and print results
def motif_checker(dna_seq, motifs):
    """Print exact and near-match locations of each motif in ``dna_seq``.

    For every motif, the exact match positions are printed along with one
    bracket-highlighted copy of the sequence per match. Then the near matches
    returned by ``mismatch_motif_positions`` are printed the same way, each
    labelled with its mismatch count. Nothing is returned.
    """
    for motif in motifs:
        # Find exact positions of the motif
        exact_positions = exact_motif_positions(dna_seq, motif)
        if exact_positions:
            print(f"Motif '{motif}' found at positions: {exact_positions}")
            for seq in highlighted_sequence(dna_seq, motif, exact_positions):
                print(f"Sequence with motif highlighted: {seq}")
        else:
            print(f"Motif '{motif}' not found in the sequence.")

        # Find potential mismatches of the motif
        mismatch_positions = mismatch_motif_positions(dna_seq, motif)
        if not mismatch_positions:
            print("No mismatches found")
            continue

        print(f"Mismatches for motif '{motif}' at positions with mismatch count: {mismatch_positions}")
        for pos, mismatch_count in mismatch_positions:
            # Bracket the text actually present in the sequence (not the
            # motif), so the differing nucleotides stay visible.
            segment = dna_seq[pos - 1:pos - 1 + len(motif)]
            highlighted_seq = highlighted_sequence(dna_seq, segment, [pos])[0]
            # The second tuple element is a count of mismatches, not a
            # position, so the label says so.
            print(f"Sequence with potential mismatch highlighted in brackets: {highlighted_seq} (Mismatch count: {mismatch_count})")
    
# Main function
def main():
    """Read the DNA sequence file, validate it and report motif matches.

    The sequence is read from ``dna_sequence.txt`` and searched for the
    motifs ``TTGACA`` and ``TATAAT``.
    """
    motifs = ["TTGACA", "TATAAT"]
    text = "dna_sequence.txt"
    dna_sequence = read_dna_sequence(text)

    # read_dna_sequence returns "" for a missing file (and an empty file
    # yields "" too). Report that separately so the user is not told that a
    # sequence which was never read is "not valid".
    if not dna_sequence:
        print("No DNA sequence was read; nothing to analyze.")
        return

    if not is_dna_sequence(dna_sequence):
        print("The sequence is not a valid DNA sequence.")
        return

    print("The sequence is a valid DNA sequence.")
    motif_checker(dna_sequence, motifs)

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()

DNA Sequence: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
The sequence is a valid DNA sequence.
Motif 'TTGACA' found at positions: [1]
Sequence with motif highlighted: [TTGACA]GGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
Mismatches for motif 'TTGACA' at positions with mismatch count: [(53, 2), (67, 1)]
Sequence with potential mismatch highlighted in brackets: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACC[TTGATG]CCTATAGCTTGACGTATAAC (Mismatches position: 2)
Sequence with potential mismatch highlighted in brackets: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGC[TTGACG]TATAAC (Mismatches position: 1)
Motif 'TATAAT' found at positions: [24]
Sequence with motif highlighted: TTGACAGGGCTCGATGCCGATGA[TATAAT]AGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
Mismatches for motif 'TATAAT' at positions with mismatch count: [(35, 2), (61, 2), (73, 1)]
Sequence with potential mismatch highlighted in brackets: T