### Implement RandomizedMotifSearch.

Input: Integers k and t, followed by a space-separated collection of strings Dna.

Output: A collection BestMotifs resulting from running RandomizedMotifSearch(Dna, k, t) 1,000 times. Remember to use pseudocounts!

In [21]:
import random
import pandas as pd
import time
import numpy as np

In [67]:
def mostProbKmer(text, k, profile):
    #process profile into usable format
    profDict = {}
    profDict['A'] = profile[0]
    profDict['C'] = profile[1]
    profDict['G'] = profile[2]
    profDict['T'] = profile[3]
    #print(profDict)
    probsDict = {}
    
    for i in range(0, len(text)-k+1):
        pat = text[i:k+i]
        prob = 1
        for b in range(0, len(pat)): #len(pat)=k
            prob*=float(profDict[pat[b]][b])
        probsDict[pat] = prob
    #print(probsDict)
    for key, val in probsDict.items():
        if val == max(probsDict.values()):
            return key
            

In [92]:
def score(dnas):
    score = 0
    dnasDF = pd.DataFrame(dnas)[0].str.split('',expand=True).iloc[:,1:-1]

    for i in range(0,dnasDF.shape[1]):
        valCts = dnasDF.iloc[:,i].value_counts()
        score+=(len(dnas)-max(valCts))    
    return score

In [70]:
# profile
def profile(motifs):
    k = len(motifs[0])
    dicNuc = {'A':0, 'C':1, 'G':2, 'T':3}
    profCts = np.array([[1]*k,
                            [1]*k,
                            [1]*k,
                            [1]*k])
    
    profile = np.array([[float(0)]*k,
                        [float(0)]*k,
                        [float(0)]*k,
                        [float(0)]*k])
    
    for n in range (0, k): 
        for motf in motifs:
            #get profile counts
            profIndx = dicNuc[motf[n]]
            profCts[profIndx,n]+=1 
        totCol = profCts[:,n].sum()     
        for r in range (0,4):
            profile[r, n] = profCts[r, n]/totCol
        
    return profile                

In [93]:
def randomizedMotifSearch(dna, k, t):
    motifs = []
    for stri in dna:
        # random motifs
        r = random.randrange(0,len(stri)-k+1)
        #print(r)
        motifs.append(stri[r:k+r])
    bestMotifs = motifs
    
    while 1==1:
        prof = profile(motifs)
        motifs = []
        for stri in dna:    
            motifs.append(mostProbKmer(stri, k , prof))
        
        scor = score(motifs)
        bestScore = scor
        if scor < score(bestMotifs):
            bestMotifs = motifs
            bestScore = scor
        else:
            return bestScore, bestMotifs

In [94]:
file = pd.read_csv("dataset_30307_5 (1).txt", header=None)
print(file.iloc[1,0].split()[1])

CCATGCTAAGTCCAATCTAGCGGTCTTGAGTACCCGGGAAAAATGACAGTATACTTACTCGGCCCGGAAACGAGAGGGTCAAGCAGTTGTTTGCTGTCAGTTGGGGTTAACAGTCCGCAAGGATCCCAATCCCTTGCGCGCTCTAGAAACGTTCATCGGATATTTCGAGAATACTCTTGATTTCAGGAGCGCCATTTAAGAACC


In [95]:
pd.Series(file.iloc[1,0].split())

0     CGCCATTTAAGAACCCCATGCTAAGTCCAATCTAGCGGTCTTGAGT...
1     CCATGCTAAGTCCAATCTAGCGGTCTTGAGTACCCGGGAAAAATGA...
2     TGTCGCGGGCTGTGCAGGAACGACCTCAGGAACTTCTCTGGGTACT...
3     AGCGAGGGCACGACTTAAAAGTAACATATTGTTAGCGATATCATCC...
4     ATACACCCTTCGCGAAAGTGTCCACGTACTAGACATGCCGTTTGCG...
5     AACTCGACCGGATAGTTATTCGGGACAACGTGACGATCCGATGGGG...
6     AATCTGATAATCGTTGAAGAACAGCTTCCTCACGCAGGAAGCGCTT...
7     ACCCCACCGGGGTCTATTTGGAGCAGCCACGTACGAGAGGGAATGG...
8     GTTACGTTCCGGCACACAACAGCGCTATCTCAACCTTGCACCTAGC...
9     GCTCCCGCGGACTATGATGCCAAATGCAAACGTGATATAGAGGGAC...
10    GATGGAACTTTCATCGTTCAGATAAAGAACGTGGCAAAAGATATGT...
11    ATAAGCTGACCTTACACCGATTTAAAGAGGGTGATCAAGACTCAAA...
12    ATTAAGGTGTCTTATTCACAGGAAGACAAACAGCTATCCGATTACC...
13    AAGGTAGCGTGCGCTGCGTGCCTCTCTTAGAGCATCGTACGCCGGC...
14    GGATAGAGTTCGCTACAAGTGCGTATCGAAGACTACTTGTAGCATG...
15    CGCTACTCACTATTGTCCATTACTCACTTACCGATTACGAGCTTGG...
16    GCAAGAGGTGGAAGCGGTACAAGCCTAGTTAGCCGATTACGATGTG...
17    TATTTTGCGTGGCTCTGAAACCTGAACACCAACGTCCGTCGC

In [96]:
# RUNNING CELL
startt = time.time()
motDi = {}
for i in range (0,1000):
    i+=1
    key, value = randomizedMotifSearch(pd.Series(file.iloc[1,0].split()), int(file.iloc[0,0].split()[0]),int(file.iloc[0,0].split()[1]))
    motDi[key] = value
        
for key, val in motDi.items():
    if key == min(motDi.keys()):
        print("--- Running time: %s seconds ---" % (time.time() - startt))
        print(*val)

--- Running time: 156.08359217643738 seconds ---
TTGATTTCAGGAGCG CCGGAAACGAGAGGG CCGATTAATGGAGGG CCGATTACGCCGGGG GCGATTACGAGAGTA CCGATGGGGAGAGGG CAATTTACGAGAGGG CCACGTACGAGAGGG TGAATTACGAGAGGG TTGATTACGAGAGGC CCGATGGTGAGAGGG CCGATTTAAAGAGGG CCGATTACCTCAGGG CCGGACACGAGAGGG CCGAAGCCGAGAGGG CCGATTACGAGCTTG CCGATTACGATGTGG CCGATTTGAAGAGGG CCGACGCCGAGAGGG CCGATTACGAGACAC
