### Find a median string

**Input**: An integer k, followed by a space-separated collection of strings Dna. \
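**Output**: A k-mer Pattern that minimizes d(Pattern, Dna) among all possible choices of k-mers, where d(Pattern, Dna) is the sum, over every string in Dna, of the minimum Hamming distance between Pattern and any k-mer of that string. (If there are multiple such strings Pattern, then you may return any one.)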

In [7]:
from itertools import product

In [14]:
# Paths to the input files for the median-string problem: a test case and the main dataset.
# NOTE(review): presumably each file holds k on its first line and the
# space-separated Dna strings on its second — confirm against the data files.
filename_test = 'data/median_string_test.txt'
filename_data = 'data/median_string.txt'

In [15]:
def parse_data(file_path):
    """Read the problem input from a text file.

    The first line is parsed as the integer k; the second line is split on
    whitespace into the list of DNA strings. Any further lines are ignored.

    Args:
        file_path: Path of the input file.

    Returns:
        A ``(k, dna)`` tuple: the k-mer length and the list of strings.
    """
    with open(file_path) as handle:
        k_line = handle.readline()
        dna_line = handle.readline()
    return int(k_line), dna_line.split()

# Load k and the list of DNA strings from the file named by filename_data.
k, dna = parse_data(filename_data)

In [16]:
def hamming_distance(s1, s2):
    """Count the positions at which s1 and s2 differ.

    Characters are paired with zip, so when the lengths differ only the
    first ``min(len(s1), len(s2))`` positions are compared.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

def all_possible_kmers(k):
    """Return every length-k string over the alphabet A, C, G, T.

    The list has 4**k entries in the order produced by itertools.product,
    i.e. lexicographic with respect to the alphabet order A < C < G < T.
    """
    alphabet = ('A', 'C', 'G', 'T')
    # product yields tuples of characters; join each tuple into one string.
    return list(map(''.join, product(alphabet, repeat=k)))

def total_distance(pattern, dna_list):
    """Return d(pattern, dna_list), the summed best-match distance.

    For each string in ``dna_list`` every window of ``len(pattern)``
    characters is compared against ``pattern`` with ``hamming_distance``;
    the smallest distance found in that string is its contribution.

    Args:
        pattern: The candidate k-mer.
        dna_list: Iterable of strings to scan.

    Returns:
        The sum of the per-string minimum distances (0 for an empty iterable).

    Raises:
        ValueError: If a string is shorter than ``pattern`` and so has no
            window to compare against.
    """
    k = len(pattern)  # hoisted: constant across all strings and windows
    total = 0
    for dna in dna_list:
        window_count = len(dna) - k + 1
        if window_count < 1:
            # Without this check min() below would fail with an opaque
            # "empty sequence" ValueError that hides the real cause.
            raise ValueError(
                f"DNA string of length {len(dna)} is shorter than "
                f"the pattern of length {k}"
            )
        total += min(
            hamming_distance(pattern, dna[start:start + k])
            for start in range(window_count)
        )
    return total

def median_string(k, dna_list):
    """Brute-force search for a k-mer with the smallest total distance.

    Every candidate from all_possible_kmers(k) is scored with
    total_distance against ``dna_list``; the first candidate (in generation
    order) that achieves the lowest score is returned.

    Args:
        k: Length of the k-mers to try.
        dna_list: Collection of strings the pattern is scored against.

    Returns:
        The best-scoring k-mer, or None if there are no candidates.
    """
    def score(candidate):
        return total_distance(candidate, dna_list)

    # min() keeps the earliest minimal element, so ties resolve to the first
    # k-mer generated; default=None covers an empty candidate list.
    return min(all_possible_kmers(k), key=score, default=None)

In [17]:
# Run the exhaustive search on the loaded input and print the chosen k-mer.
result = median_string(k, dna)
print(result)

ACTTGT
