#### Solve the Frequent Words with Mismatches Problem

generate the d-neighborhood Neighbors(Pattern, d), the set of all k-mers whose Hamming distance from Pattern does not exceed d

In [1]:
# define hamming_distance
def hamming_distance(p, q):
    return sum(1 for x, y in zip(p, q) if x != y)

In [2]:
# step1: generate a neighborhood (d=1)
def immediate_neigbors(pattern):
    neighborhood = {pattern}
    nucleotides = ['A', 'C', 'G', 'T']
    for i in range(len(pattern)):
        symbol = pattern[i]
        for x in nucleotides:
            if x != symbol:
                neighbor = pattern[:i] + x + pattern[i+1:]
                neighborhood.add(neighbor)
    return list(neighborhood)

# test
pattern = "ACGT"
print("Immediate neighbors of", pattern, ":", immediate_neigbors(pattern))

Immediate neighbors of ACGT : ['ACGG', 'ACGC', 'AGGT', 'ACCT', 'ACTT', 'AAGT', 'CCGT', 'ACGT', 'ACAT', 'GCGT', 'ACGA', 'TCGT', 'ATGT']


In [3]:
# step2: generate a neigborhood for any d (number of mismatches)
# assumption: hamming distance <= d
def neighbors(pattern, d):
    nucleotides = ['A', 'C', 'G', 'T']

    if d == 0:
        return {pattern}
    if len(pattern) == 1:
        return set(nucleotides)

    neighborhood = set()
    suffix_neighbors = neighbors(pattern[1:], d) #recursive call

    for text in suffix_neighbors:
        if hamming_distance(pattern[1:], text) < d:
            for x in nucleotides:
                neighborhood.add(x + text)
        else:
            neighborhood.add(pattern[0] + text)

    return list(neighborhood)

# Test the function
pattern = "ACGT"
d = 2
print("Neighbors of", pattern, "with up to", d, "mismatches:", neighbors(pattern, d))

Neighbors of ACGT with up to 2 mismatches: ['ACGC', 'TCAT', 'ATGC', 'GCTT', 'GCGC', 'AGGG', 'ACCG', 'ACAG', 'CCAT', 'TGGT', 'CCTT', 'AAAT', 'ACAC', 'ACCA', 'ATGG', 'ATCT', 'GAGT', 'CCGA', 'CCCT', 'GCCT', 'GGGT', 'TAGT', 'AGGC', 'ACTT', 'TCGC', 'AGGA', 'TTGT', 'ACCC', 'AAGC', 'CAGT', 'AACT', 'CCGC', 'AAGA', 'TCTT', 'TCGT', 'CCGG', 'AGCT', 'ACCT', 'AATT', 'ATGA', 'ACTA', 'TCGA', 'ACGT', 'CCGT', 'AGAT', 'TCGG', 'CGGT', 'ACAA', 'ACTC', 'ATAT', 'ACGA', 'ATGT', 'ATTT', 'AGTT', 'ACGG', 'ACTG', 'GCGA', 'GCGG', 'AGGT', 'AAGT', 'TCCT', 'CTGT', 'GTGT', 'GCAT', 'ACAT', 'GCGT', 'AAGG']


In [4]:
# Define fucntion that returns most frequent words in a sequence of k-length with d-mismatches
def frequent_words_with_mismatches(seq, k, d):
    patterns = []
    freq_map = {}
    for i in range(0, len(seq)-k+1):
        pattern = seq[i:i+k]
        neighborhood = neighbors(pattern, d)
        for j in range(0,len(neighborhood)-1):
            neighbor = neighborhood[j]
            freq_map[neighbor] = freq_map.get(neighbor, 0) + 1
    m = max(freq_map.values())
    patterns = [kmer for kmer, freq in freq_map.items() if freq == m]
    return patterns

In [5]:
sequence = 'ACGTTGCATGTCGCATGATGCATGAGAGCT'
kmer_len = 4
mismatch_n = 1

# GATG ATGC ATGT
print(f"Most frequent {kmer_len}-kmers with up to {mismatch_n} mismatches:", *frequent_words_with_mismatches(sequence, kmer_len, mismatch_n))

Most frequent 4-kmers with up to 1 mismatches: GATG ATGC


In [6]:
test_seq = open('/Users/npajek/Documents/fri_biolab/coursera/bioinformatics-specialization/01_finding_hidden_messages_in_DNA/data/frequent_words_with_mismatches.txt').read()
test_k = 5
test_d = 3

print(f"Most frequent {test_k}-kmers with up to {test_d} mismatches:", *frequent_words_with_mismatches(test_seq, test_k, test_d))

Most frequent 5-kmers with up to 3 mismatches: GCCGC
