### Implement RandomizedMotifSearch.

**Input**: Integers k and t, followed by a space-separated collection of strings Dna. \
**Output**: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1,000 times. Remember to use pseudocounts!

In [10]:
# Paths to the exercise input files (sample input and dataset, going by the
# file names); both are relative to the notebook's working directory.
filename_test = 'data/04.1_sample_input.txt'
filename = 'data/04.1_dataset.txt'

In [11]:
def parse_data(file_path):
    """Read a RandomizedMotifSearch input file.

    The first line holds the two integers ``k`` and ``t``; the second line
    holds the whitespace-separated DNA strings. Any further lines are ignored.

    Returns:
        A ``(k, t, dna)`` tuple where ``dna`` is a list of strings.

    Raises:
        IndexError: if the file has fewer than two lines.
        ValueError: if the first line is not exactly two integers.
    """
    with open(file_path) as handle:
        rows = handle.readlines()

    header, sequences = rows[0], rows[1]
    k, t = (int(token) for token in header.split())
    return k, t, sequences.split()

# Load k, t and the DNA strings from the sample input file (filename_test).
k, t, dna = parse_data(filename_test)

In [12]:
import random

def _motifs(profile, Dna):
    """Return the profile-most-probable k-mer of every string in Dna.

    ``profile`` maps each nucleotide to a list of per-column probabilities;
    the k-mer length is taken from the length of ``profile['A']``.
    """
    width = len(profile['A'])
    return [profile_most_probable_kmer(sequence, width, profile) for sequence in Dna]

def random_motifs(Dna, k, t):
    """Pick one uniformly random k-mer from each of the first t strings of Dna.

    Args:
        Dna: sequence of DNA strings.
        k: length of the k-mers to draw.
        t: number of strings (taken from the front of Dna) to draw from.

    Returns:
        A list of t strings, each exactly k characters long.

    Raises:
        ValueError: if one of the strings is shorter than k.
        IndexError: if t exceeds the number of strings in Dna.
    """
    randMotifs = []

    for i in range(t):
        # The start position must range over every k-mer of *this* string:
        # 0 .. len - k inclusive (randint includes both endpoints). Bounding
        # it by t, the number of strings, would skip most k-mers and could
        # run past the end of the string, yielding truncated slices.
        last_start = len(Dna[i]) - k
        if last_start < 0:
            raise ValueError(
                f"string {i} has length {len(Dna[i])}, shorter than k={k}"
            )
        start = random.randint(0, last_start)
        randMotifs.append(Dna[i][start:start + k])

    return randMotifs


def profile_most_probable_kmer(text, k, profile):
    """Return the k-mer of ``text`` with the highest probability under ``profile``.

    Every window of length k is scored with ``_pr``; the earliest window wins
    ties. An empty string is returned when ``text`` has no window of length k.
    """
    best_kmer, best_prob = '', -1

    for start in range(len(text) - k + 1):
        window = text[start:start + k]
        prob = _pr(window, profile)
        # Strict comparison keeps the first window among equally likely ones.
        if prob > best_prob:
            best_kmer, best_prob = window, prob

    return best_kmer


def _pr(text, profile):
    """Return the probability of ``text`` under ``profile``.

    ``profile[symbol][column]`` is the probability of ``symbol`` at
    ``column``; the result is the product over all positions of ``text``
    (1 for the empty string).
    """
    probability = 1

    for column, symbol in enumerate(text):
        probability *= profile[symbol][column]

    return probability

def profile_with_pseudocounts(motifs):
    """Build a profile matrix from ``motifs`` using pseudocounts.

    Each pseudocount-adjusted count from ``count_with_pseudocounts`` is
    divided by ``len(motifs) + 4`` (one extra observation per nucleotide).

    Returns:
        A dict keyed 'A', 'C', 'G', 'T' (in that order) whose values are
        lists of per-column probabilities.
    """
    denominator = float(len(motifs) + 4)
    tallies = count_with_pseudocounts(motifs)

    return {
        base: [tally / denominator for tally in tallies[base]]
        for base in "ACGT"
    }

def count_with_pseudocounts(motifs):
    """Count nucleotides per column of ``motifs``, adding 1 to every count.

    The column count is taken from the first motif.

    Returns:
        A dict keyed 'G', 'A', 'C', 'T' (in that order); each value is a list
        with one entry per column holding ``occurrences + 1``.

    Raises:
        KeyError: if a motif contains a symbol other than A, C, G or T.
        IndexError: if ``motifs`` is empty or a motif is shorter than the first.
    """
    width = len(motifs[0])

    # Starting every cell at 1 applies the pseudocount up front.
    tallies = {base: [1] * width for base in "GACT"}

    for motif in motifs:
        for column in range(width):
            tallies[motif[column]][column] += 1

    return tallies

def _score(motifs):
    """Return the number of positions where ``motifs`` differ from their consensus.

    The consensus comes from ``_consensus``; the column count is taken from
    the first motif. Lower scores mean more conserved motifs.
    """
    width = len(motifs[0])
    reference = _consensus(motifs)

    return sum(
        1
        for motif in motifs
        for column in range(width)
        if motif[column] != reference[column]
    )

def _consensus(motifs):
    """Return the consensus string of ``motifs``.

    For each column the nucleotide with the largest pseudocount-adjusted count
    is chosen; ties go to the earliest nucleotide in the order A, C, G, T.
    """
    width = len(motifs[0])
    tallies = count_with_pseudocounts(motifs)

    # max() returns the first maximal item, which matches the A, C, G, T
    # tie-break; counts are always >= 1, so a symbol is chosen for every column.
    return "".join(
        max("ACGT", key=lambda base: tallies[base][column])
        for column in range(width)
    )

# Number of independent searches the driver cell runs.
N = 1000


def randomized_motif_search(Dna, k, t):
    """Run one RandomizedMotifSearch from a random starting collection.

    Starting from ``random_motifs(Dna, k, t)``, the motifs are repeatedly
    replaced by the profile-most-probable k-mers of their own pseudocount
    profile. The search stops at the first iteration that fails to lower the
    score and returns the best collection seen so far.
    """
    best = random_motifs(Dna, k, t)
    best_score = _score(best)
    current = best

    while True:
        current = _motifs(profile_with_pseudocounts(current), Dna)
        current_score = _score(current)
        if current_score >= best_score:
            return best
        best, best_score = current, current_score
        


In [13]:
# Run RandomizedMotifSearch N times in total and keep the lowest-scoring
# collection of motifs across all runs.
M = randomized_motif_search(dna, k, t)
bMotifs = M
bScore = _score(bMotifs)

# One run has already happened above, so N - 1 further runs remain.
for i in range(N - 1):
    M = randomized_motif_search(dna, k, t)
    mScore = _score(M)
    if mScore < bScore:
        bMotifs, bScore = M, mScore

# Bind the result once, after the loop: assigning it only on non-improving
# iterations would drop an improvement found on the final run.
bestMotifs = bMotifs

print(' '.join(bestMotifs))

TTCGCGGGAGTCGTG CTCTTGACAGTCATG CTCAGCACTAACATG CTCAGCAACATCATG CTCAGCACAGTAGAG CTCAGATGAGTCATG AGAAGCACAGTCATG CTCACGTCAGTCATG CTCAGCACAGCGGTG CTCAGCTACGTCATG CTCATTTCAGTCATG GGCAGCACAGTCATA CCATGCACAGTCATG CTCAGCACATAGATG CTGTACACAGTCATG CTCAGCGAGGTCATG CTCAGCACAGTCCCA ATCAGCACAGTCACA CTCAGGGTAGTCATG CTCCTTACAGTCATG


Exercise Break: Compute the probability that ten randomly selected 15-mers from the ten 600-nucleotide long strings in the Subtle Motif Problem capture at least one implanted 15-mer. (Allowable error: 0.000001)

In [14]:
# Exercise parameters: string length, k-mer length, and the number of k-mers
# drawn (one per string).
total_nucleotides = 600
k = 15
num_selected_motifs = 10

# A string of length n has n - k + 1 k-mer start positions.
total_kmers = (total_nucleotides + 1) - k

# One random draw misses the implanted k-mer unless it lands on the single
# matching start position.
prob_not_capturing_one_kmer = (total_kmers - 1) / total_kmers

# The draws are treated as independent, so the chance that all of them miss
# is the single-draw miss probability raised to the number of draws.
prob_not_capturing_ten_kmers = pow(prob_not_capturing_one_kmer, num_selected_motifs)

# "At least one hit" is the complement of "every draw misses".
prob_capturing_at_least_one = 1 - prob_not_capturing_ten_kmers

print(f"Probability of capturing at least one motif: {prob_capturing_at_least_one:.6f}")

Probability of capturing at least one motif: 0.016934


Exercise Break: Compute the probability that ten randomly selected 15-mers from ten 600-nucleotide long strings (as in the Subtle Motif Problem) capture at least two implanted 15-mers. (Allowable error: 0.000001)

In [15]:
# Exercise parameters: string length, k-mer length, and the number of k-mers
# drawn (one per string).
total_nucleotides = 600
k = 15  # Length of each motif
num_selected_motifs = 10  # Number of motifs selected

# Number of possible k-mer start positions in one string.
total_kmers = total_nucleotides - k + 1

# Probability that a single random k-mer misses the implanted motif.
prob_not_capturing_one_kmer = (total_kmers - 1) / total_kmers

# Probability that all selected k-mers miss (draws treated as independent).
prob_not_capturing_ten_kmers = prob_not_capturing_one_kmer ** num_selected_motifs

# r = P(at least one hit) = 1 - P(no hit).
prob_capturing_at_least_one = 1 - prob_not_capturing_ten_kmers
r = prob_capturing_at_least_one

print(f"Probability of capturing at least one motif: {prob_capturing_at_least_one:.6f}")

# q = P(one particular draw hits and every other draw misses). It is derived
# from the variables above rather than repeating 586, 585 and 9 as literals,
# so it stays correct if the parameters change.
q = 1 / total_kmers * (prob_not_capturing_one_kmer ** (num_selected_motifs - 1))

print(f"{r:.6f}")

# P(at least two hits) = P(at least one) - P(exactly one), where
# P(exactly one) = num_selected_motifs * q.
p = r - q * num_selected_motifs

print(f'{p:.6f}')

Probability of capturing at least one motif: 0.016934
0.016934
0.000130


Test no. 5.

What are the 3-mers after one iteration of RandomizedMotifSearch?  In other words, what are the 3-mers Motifs(Profile(Motifs), Dna)?  Please enter your answer as four space-separated strings.

In [26]:
# Fixed inputs for the question: the DNA strings, the k-mer length and the
# number of strings.
dna = ['TGACGTTC', 'TAAGAGTT', 'GGACGAAA', 'CTGTTCGC']
k = 3
t = 4
# Starting motifs given by the question.
# NOTE(review): this rebinds the name `random_motifs`, which is defined as a
# function earlier in the file; once this cell has run, code that calls
# random_motifs(...) will fail until that definition is re-executed.
random_motifs = ['TGA', 'GTT', 'GAA', 'TGT']

N = 1


def randomized_motif_search(Dna, k, t):
    """Run the search loop starting from the module-level ``random_motifs`` list.

    ``k`` and ``t`` are accepted for signature compatibility but not used.
    NOTE(review): the question asks for a single iteration, while this loop
    continues until the score stops improving; the two agree only when the
    second iteration brings no improvement.
    """
    best = random_motifs
    candidate = best
    while True:
        candidate = _motifs(profile_with_pseudocounts(candidate), Dna)
        if not _score(candidate) < _score(best):
            return best
        best = candidate


' '.join(randomized_motif_search(dna, k, t))

'TGA TAA GGA TGT'