-
Notifications
You must be signed in to change notification settings - Fork 0
/
3b_median_string.py
70 lines (50 loc) · 1.87 KB
/
3b_median_string.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pattern_to_number
# Sample input: ten DNA strings searched for a shared motif by median_string().
dna = [
    'TTGGAGTACGCGTGATACCACTGTTCGTCACGCTTATCGTCA',
    'TAGGCATATGGCTTAACGAGGAGATTTGAGAATCTATGCTCC',
    'GTTATTTTTTCGCTGACGAAGACGTAGCAGTTCGAGGACACC',
    'GCTGACCCTTGTGGATGCGTACACTTAGAGACACAGGTTGTT',
    'ACGTGAGGTCGCTTTGAGTCAGGAATGCGAAGAAAACTCCAG',
    'CATATATGGCTCTTAGAGAGGCCTCGGATCAGGTAATTGCGC',
    'TCTACGCAATATTTCGAGGAGACGAGGCAACCTGCACTTGTG',
    'TTGGAGTCGAAGGCTCGCGTGTACCTCACGCAGTAGTGGAGC',
    'TATGATGTACGTACGCCTTTCGAGAGGTTCCCGAGCATTGGT',
    'GAACTAGCGGTGTTATCGAAAAAATTGGAGATACACAACATC',
]
def score(pattern1, pattern2):
    """
    Returns the number of positions at which pattern1 and pattern2 differ.

    Only the first len(pattern1) positions are compared, so extra trailing
    characters in pattern2 are ignored.  Raises IndexError if pattern2 is
    shorter than pattern1.

    score('GCA', 'ATA') --> 2
    """
    # pattern2 is indexed explicitly (rather than zipped) so that a too-short
    # pattern2 still fails loudly instead of being silently truncated.
    return sum(1 for i, base in enumerate(pattern1) if base != pattern2[i])
def d(pattern, dna):
    """
    Finds the best match of pattern within each string in dna and returns the
    total number of mismatches of the resulting best matches.

    For every string, each window of len(pattern) characters is compared to
    pattern with score() and the smallest mismatch count is kept; the return
    value is the sum of those per-string minima.  Each string is processed on
    its own, so the strings in dna do not have to share a length.  An empty
    dna collection yields 0.

    Raises ValueError if a string in dna is shorter than pattern, because it
    then contains no window to compare against.

    d('AAA', ['ttaccttAAC', 'gATAtctgtc', 'ACGgcgttcg']) --> 4
    """
    k = len(pattern)
    total = 0
    for oligo in dna:
        windows = len(oligo) - k + 1
        if windows < 1:
            raise ValueError(
                "DNA string of length %d is shorter than pattern of length %d"
                % (len(oligo), k)
            )
        # Best (lowest) mismatch count over all k-length windows of this string.
        total += min(score(pattern, oligo[i:i + k]) for i in range(windows))
    return total
def median_string(dna, k):
    """
    Iterates through every kmer of length k produced by
    pattern_to_number.list_kmer(k) and returns one that minimizes
    d(pattern, dna).  When several kmers tie for the minimum, the first one
    in list_kmer's order is returned.

    This is a faster version of the motif_finding problem.
    """
    kmers = pattern_to_number.list_kmer(k)
    best_pattern = kmers[0]
    # Cache the best distance so d() runs once per candidate instead of twice.
    best_distance = None
    for kmer in kmers:
        distance = d(kmer, dna)
        # Strict '<' keeps the earliest kmer on ties.
        if best_distance is None or distance < best_distance:
            best_pattern = kmer
            best_distance = distance
    return best_pattern
# Report a median 6-mer for the sample data.  The parenthesised form prints the
# same output under Python 2 and is also valid syntax under Python 3.
print(median_string(dna, 6))