### Implanted Motif Problem - find all (k,d)-motifs in a collection of strings

Exhaustive search (brute force) \
explore all possible solution candidates

In [5]:
def hamming_distance(s1, s2):
    """Count the positions at which ``s1`` and ``s2`` hold different symbols.

    The two sequences are walked in lockstep from the start. If their
    lengths differ, the extra tail of the longer one is ignored, because
    ``zip`` stops at the end of the shorter sequence.

    Returns:
        The number of mismatching positions as an ``int``.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

def generate_neighbors(pattern, d):
    """Return every string within Hamming distance ``d`` of ``pattern``.

    Each position of ``pattern`` may either keep its own character or be
    replaced by one of the nucleotides ``A``, ``C``, ``G``, ``T``; a
    replacement counts as one mismatch and at most ``d`` mismatches are
    allowed in total. The pattern itself is always part of the result
    when ``d >= 0``.

    Args:
        pattern: The string whose neighborhood is generated. An empty
            pattern has exactly one neighbor, the empty string.
        d: Maximum number of mismatches. ``d == 0`` yields ``[pattern]``;
            a negative ``d`` yields an empty list because no string can
            be at a negative distance.

    Returns:
        A list of distinct neighbor strings, each as long as ``pattern``.
    """
    if d < 0:
        return []

    # Grow neighbors from the right end of the pattern, remembering how many
    # mismatches each partial neighbor has already spent. Tracking the count
    # avoids re-measuring the distance at every step, and the loop neither
    # recurses without end on an empty pattern nor hits the recursion limit
    # on a long one.
    frontier = {"": 0}
    for base in reversed(pattern):
        extended = {}
        for suffix, mismatches in frontier.items():
            # Keeping the pattern's own character never costs a mismatch.
            extended[base + suffix] = mismatches
            # A substitution is only affordable while budget remains.
            if mismatches < d:
                for nucleotide in "ACGT":
                    if nucleotide != base:
                        extended[nucleotide + suffix] = mismatches + 1
        frontier = extended
    return list(frontier)

def motif_enumeration(dna, k, d):
    """Find all (k, d)-motifs shared by every string in ``dna``.

    A k-mer is a (k, d)-motif when each string in ``dna`` contains at least
    one length-``k`` window that differs from it in at most ``d`` positions.
    Candidates are drawn from the ``d``-neighborhoods (as produced by
    ``generate_neighbors``) of the k-mers that occur in the input strings.

    Args:
        dna: Collection of strings to search. An empty collection yields
            an empty result.
        k: Motif length; must be at least 1.
        d: Maximum number of mismatches allowed per string.

    Returns:
        A list of the distinct motifs, in no particular order.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Slice every sequence into its k-mer windows once, rather than
    # re-slicing for each candidate that is checked. A sequence shorter
    # than k contributes an empty window list, so nothing can match it.
    kmers_per_sequence = [
        [sequence[i:i + k] for i in range(len(sequence) - k + 1)]
        for sequence in dna
    ]

    def appears_everywhere(candidate):
        # True when every sequence has a window within d mismatches.
        return all(
            any(hamming_distance(candidate, kmer) <= d for kmer in kmers)
            for kmers in kmers_per_sequence
        )

    # Distinct seeds only: a repeated k-mer has the same neighborhood.
    seeds = set()
    for kmers in kmers_per_sequence:
        seeds.update(kmers)

    motifs = set()
    evaluated = set()
    for seed in seeds:
        for candidate in generate_neighbors(seed, d):
            # The verdict depends only on the candidate, so a candidate
            # reached from several seeds is checked a single time.
            if candidate in evaluated:
                continue
            evaluated.add(candidate)
            if appears_everywhere(candidate):
                motifs.add(candidate)
    return list(motifs)

In [6]:
# Small worked example: search four short strings for (3, 1)-motifs.
dna = [
    "ATTTGGC",
    "TGCCTTA",
    "CGGTATC",
    "GAAAATT",
]
k, d = 3, 1
result = motif_enumeration(dna, k, d)
# Motifs are printed space-separated; the order is not fixed because
# motif_enumeration collects them in a set before returning a list.
print(*result)

GTT TTT ATT ATA


In [8]:
# Sample dataset: search six strings for (5, 1)-motifs.
dna = [
    'CCAGATTGCGCCTTATTATATGAGT',
    'TGGTAAAGCAATCGTAACCATTATA',
    'CGGTGCTGCTTTATGGCGTGGGTCA',
    'TTTGATTATCTGAGGTAGCGGCAAC',
    'TTATGCTGTCGCTAGATGGTCTCTC',
    'GGTCGTTAATTTATGGCTAGTTCGC',
]
k, d = 5, 1
result = motif_enumeration(dna, k, d)
# Space-separated motifs; their order may differ between runs.
print(*result)

TTATT TTATC TTATG TTATA
