In [48]:
# Read DNA sequence from a file
def read_dna_sequence(text):
    """Read a DNA sequence from a text file and normalise it.

    All whitespace (spaces, tabs, line breaks) is removed and the result is
    upper-cased, so a sequence wrapped over several lines or indented with
    tabs is returned as one contiguous string.

    Args:
        text: Path of the file to read.

    Returns:
        The normalised sequence, or "" when the file does not exist.
    """
    # Keep the try body limited to the only operation expected to raise.
    try:
        with open(text, "r") as file:
            raw = file.read()
    except FileNotFoundError:
        print("File not found. Please ensure the file exists in the current directory.")
        return ""

    # str.split() with no argument splits on any run of whitespace, so tabs
    # and stray carriage returns are dropped along with spaces and newlines.
    dna_seq = "".join(raw.split()).upper()
    print(f"DNA Sequence: {dna_seq}")
    return dna_seq

# Check if the sequence is a valid DNA sequence
def is_dna_sequence(dna_seq):
    """Return True when dna_seq is non-empty and contains only A, C, G and T.

    Every nucleotide is checked, not just the first one. An empty sequence
    is reported as invalid (False).

    Args:
        dna_seq: The sequence to validate; upper-case letters are expected.

    Returns:
        True if the sequence is a non-empty string over the alphabet ACGT,
        otherwise False.
    """
    valid_bases = set("ACGT")
    # The subset test covers the whole sequence; bool() rejects empty input,
    # for which the subset test alone would be vacuously true.
    return bool(dna_seq) and set(dna_seq) <= valid_bases

# Find motifs in the DNA sequence and report their exact positions; also report near-matches (mismatches) of the motifs
def _find_exact_positions(dna_seq, motif):
    """Return the 1-based start of every exact (possibly overlapping) match."""
    positions = []
    start = dna_seq.find(motif)
    while start != -1:
        positions.append(start + 1)  # +1 for 1-based indexing
        # Advance by one character, not len(motif), so overlapping
        # occurrences are still found.
        start = dna_seq.find(motif, start + 1)
    return positions


def _find_near_matches(dna_seq, motif):
    """Return (1-based position, mismatch count) for inexact candidate windows.

    A window is a candidate when its first two nucleotides equal the motif's
    first two; exact matches (zero mismatches) are excluded.
    """
    width = len(motif)
    seed = motif[:2]
    hits = []
    for i in range(len(dna_seq) - width + 1):
        window = dna_seq[i:i + width]
        if window[:2] != seed:
            continue
        mismatches = sum(a != b for a, b in zip(window, motif))
        if mismatches:
            hits.append((i + 1, mismatches))
    return hits


def _bracket(dna_seq, pos, width):
    """Return dna_seq with the window of `width` at 1-based `pos` in brackets."""
    begin = pos - 1
    end = begin + width
    return f"{dna_seq[:begin]}[{dna_seq[begin:end]}]{dna_seq[end:]}"


def motif_checker(dna_seq, motifs):
    """Print exact and near-match locations of each motif in a DNA sequence.

    First, for every motif, the 1-based positions of all exact occurrences
    are printed, followed by one copy of the sequence per occurrence with the
    motif wrapped in brackets. Then, for every motif, windows that share the
    motif's first two nucleotides but differ elsewhere are printed together
    with their mismatch counts, again with the window highlighted.

    Args:
        dna_seq: The sequence to search.
        motifs: Iterable of motif strings to look for.

    Returns:
        None. All results are written to standard output.
    """
    # Pass 1: exact occurrences of each motif.
    for motif in motifs:
        positions = _find_exact_positions(dna_seq, motif)
        if not positions:
            print(f"Motif '{motif}' not found in the sequence.")
            continue
        print(f"Motif '{motif}' found at positions: {positions}")
        for pos in positions:
            highlighted_seq = _bracket(dna_seq, pos, len(motif))
            print(f"Sequence with motif highlighted: {highlighted_seq}")

    # Pass 2: near matches, reported after all exact matches so the output
    # keeps the two sections separate.
    for motif in motifs:
        mismatch_positions = _find_near_matches(dna_seq, motif)
        if not mismatch_positions:
            continue
        print(f"Potential mismatches for motif '{motif}' at positions (1-based index) with mismatch count: {mismatch_positions}")
        for pos, mismatch_count in mismatch_positions:
            highlighted_seq = _bracket(dna_seq, pos, len(motif))
            # The value shown is the number of differing nucleotides, so it
            # is labelled as a count rather than a position.
            print(f"Sequence with potential mismatch highlighted: {highlighted_seq} (Mismatch count: {mismatch_count})")
                    
                
    
    
    
# Main function
def main():
    """Load the sequence file, validate it, and report motif locations.

    Reads "dna_sequence.txt", and if the result is a non-empty valid DNA
    sequence, searches it for the two hard-coded motifs. Otherwise a
    message saying the sequence is not valid is printed.
    """
    sequence_path = "dna_sequence.txt"
    target_motifs = ["TTGACA", "TATAAT"]

    dna_sequence = read_dna_sequence(sequence_path)

    # Guard clause: bail out on an empty or invalid sequence.
    if not (dna_sequence and is_dna_sequence(dna_sequence)):
        print("The sequence is not a valid DNA sequence.")
        return

    print("The sequence is a valid DNA sequence.")
    motif_checker(dna_sequence, target_motifs)

# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

DNA Sequence: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
The sequence is a valid DNA sequence.
Motif 'TTGACA' found at positions: [1]
Sequence with motif highlighted: [TTGACA]GGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
Motif 'TATAAT' found at positions: [24]
Sequence with motif highlighted: TTGACAGGGCTCGATGCCGATGA[TATAAT]AGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGCTTGACGTATAAC
Potential mismatches for motif 'TTGACA' at positions (1-based index) with mismatch count: [(53, 2), (67, 1)]
Sequence with potential mismatch highlighted: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACC[TTGATG]CCTATAGCTTGACGTATAAC (Mismatches position: 2)
Sequence with potential mismatch highlighted: TTGACAGGGCTCGATGCCGATGATATAATAGGCGTACGATGCTAGCTGAACCTTGATGCCTATAGC[TTGACG]TATAAC (Mismatches position: 1)
Potential mismatches for motif 'TATAAT' at positions (1-based index) with mismatch count: [(26, 3), (29, 4), (35, 2), (43, 4), (61, 2), (63, 3), (73,