In [118]:
import sys
from random import seed
from random import randint

def get_profile(motifs, t):
    """Build a per-position base-frequency profile from a collection of motifs.

    Args:
        motifs: sequences over the alphabet A/C/G/T.
        t: nominal number of motifs. Kept so existing callers keep working, but
            the frequencies are normalised by ``len(motifs)`` so that they are
            real fractions even when the caller's ``t`` disagrees with the data.

    Returns:
        dict mapping position -> {"A", "C", "G", "T"} -> fraction of all motifs
        that carry that base at that position. Position 0 is always present;
        for an empty ``motifs`` it holds zeros.

    Raises:
        KeyError: if a motif contains a character other than A, C, G or T.
    """
    bases = ("A", "C", "G", "T")
    motif_count = len(motifs)

    # Column 0 is created up front so the profile is never an empty dict.
    profile = {0: dict.fromkeys(bases, 0)}

    # Count occurrences of every base at every index.
    for motif in motifs:
        for position, base in enumerate(motif):
            if position not in profile:
                profile[position] = dict.fromkeys(bases, 0)
            profile[position][base] += 1

    # Convert counts to fractions; with no motifs there is nothing to divide.
    if motif_count:
        for column in profile.values():
            for base in bases:
                column[base] = column[base] / motif_count

    return profile

def get_probability(string, profile):
    """Return the product of the profile entries selected by each character of `string`.

    `profile[i][c]` is looked up for the character `c` found at index `i`;
    an empty `string` yields 1.
    """
    likelihood = 1
    for position, base in enumerate(string):
        likelihood *= profile[position][base]
    return likelihood

def get_score(motifs):
    """Score a motif set by how much its columns disagree.

    Each position contributes ``len(motifs)`` minus the count of its most
    frequent base, so a position where every motif agrees contributes 0.
    """
    # Position 0 is always tallied, even if no motif supplies a character for it.
    tallies = {0: {"A": 0, "C": 0, "G": 0, "T": 0}}

    # Tally every base at every position.
    for motif in motifs:
        for position, base in enumerate(motif):
            if position >= len(tallies):
                tallies[position] = {"A": 0, "C": 0, "G": 0, "T": 0}
            tallies[position][base] += 1

    motif_count = len(motifs)
    return sum(motif_count - max(column.values()) for column in tallies.values())

def get_motifs(dna, k, profile):
    """Pick, for every DNA string, the k-mer that `profile` rates most probable.

    Args:
        dna: iterable of DNA strings.
        k: length of the k-mers to consider.
        profile: position -> base -> probability mapping, as consumed by
            `get_probability`.

    Returns:
        list with one k-mer per DNA string, in input order. When no window of a
        string scores above 0, the string's first k characters are used; ties
        keep the earliest window.
    """
    kmers = []

    for string in dna:
        # Fallback: the leading window wins unless something scores above 0.
        best_kmer = string[:k]
        best_percent = 0

        # Slide over every start index and keep the strictly best window.
        for start in range(len(string) - k + 1):
            candidate = string[start:start + k]
            score = get_probability(candidate, profile)
            if score > best_percent:
                best_percent = score
                best_kmer = candidate
        kmers.append(best_kmer)

    return kmers

def randomized_motif_search(
    dna: list[str], k: int, t: int, iterations: int = 10000
) -> tuple[list[str], int]:
    """Run randomized motif search with repeated random restarts.

    Each restart draws one random k-mer per DNA string, then alternates between
    building a profile from the current motifs and re-selecting the most
    probable k-mer per string, for as long as the score strictly improves.

    Args:
        dna: DNA strings to search.
        k: motif length.
        t: value forwarded to `get_profile` as its motif-count argument.
        iterations: number of random restarts (defaults to the previously
            hard-coded 10000).

    Returns:
        ``(motifs, score)`` for the lowest-scoring motif set seen over all
        restarts. With ``iterations == 0`` this is ``([], sys.maxsize)``.

    Note:
        `randint` is called with ``len(sample) - k`` as the upper bound, so a
        string shorter than ``k`` makes it raise.
    """
    final_motif = []
    best_score = sys.maxsize

    for _ in range(iterations):
        # Random starting point: one k-mer per DNA string.
        motifs = []
        for sample in dna:
            start = randint(0, len(sample) - k)
            motifs.append(sample[start:start + k])

        best_motifs = motifs
        best_motifs_score = get_score(best_motifs)

        # Refine until a pass fails to strictly improve the score.
        while True:
            profile = get_profile(motifs, t)
            motifs = get_motifs(dna, k, profile)
            current_score = get_score(motifs)
            if current_score < best_motifs_score:
                best_motifs = motifs
                best_motifs_score = current_score
            else:
                break

        # Keep the best result across restarts.
        if best_motifs_score < best_score:
            final_motif = best_motifs
            best_score = best_motifs_score

    return final_motif, best_score
                


In [120]:
# Sample input: five DNA strings searched for a shared 8-mer.
dna = ["CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA", "GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG", "TAGTACCGAGACCGAAAGAAGTATACAGGCGT", "TAGATCAAGTTTCAGGTGCACGTCGGTGAACCAA","TCCACCAGCTCCACGTGCAATGTTGGCCTA"]
# t is the divisor used to turn per-column counts into fractions, so it must be
# the number of DNA strings; derive it from the data instead of hard-coding it.
print(randomized_motif_search(dna, 8, len(dna)))

(['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG'], 9)


In [None]:
import sys
from random import seed
from random import randint

def get_profile(motifs, t):
    """Build a per-position base-frequency profile from a collection of motifs.

    Args:
        motifs: sequences over the alphabet A/C/G/T.
        t: nominal number of motifs. Kept so existing callers keep working, but
            the frequencies are normalised by ``len(motifs)`` so that they are
            real fractions even when the caller's ``t`` disagrees with the data.

    Returns:
        dict mapping position -> {"A", "C", "G", "T"} -> fraction of all motifs
        that carry that base at that position. Position 0 is always present;
        for an empty ``motifs`` it holds zeros.

    Raises:
        KeyError: if a motif contains a character other than A, C, G or T.
    """
    bases = ("A", "C", "G", "T")
    motif_count = len(motifs)

    # Column 0 is created up front so the profile is never an empty dict.
    profile = {0: dict.fromkeys(bases, 0)}

    # Count occurrences of every base at every index.
    for motif in motifs:
        for position, base in enumerate(motif):
            if position not in profile:
                profile[position] = dict.fromkeys(bases, 0)
            profile[position][base] += 1

    # Convert counts to fractions; with no motifs there is nothing to divide.
    if motif_count:
        for column in profile.values():
            for base in bases:
                column[base] = column[base] / motif_count

    return profile

def get_probability(string, profile):
    """Multiply together the profile values for each character of `string`.

    The character at index `i` selects `profile[i][character]`; the running
    product starts at 1, which is also the result for an empty `string`.
    """
    product = 1
    for index, letter in enumerate(string):
        column = profile[index]
        product *= column[letter]
    return product

def get_score(motifs):
    """Sum, over all positions, the number of motifs minus the top base count.

    A position on which all motifs agree adds nothing; every disagreement
    with the most frequent base raises the total.
    """
    # Position 0 exists from the start, whether or not any motif reaches it.
    column_counts = {0: {"A": 0, "C": 0, "G": 0, "T": 0}}

    # Tally every base at every position.
    for motif in motifs:
        for index, letter in enumerate(motif):
            if index >= len(column_counts):
                column_counts[index] = {"A": 0, "C": 0, "G": 0, "T": 0}
            column_counts[index][letter] += 1

    total = 0
    rows = len(motifs)
    for counts in column_counts.values():
        total += rows - max(counts.values())
    return total

def get_motifs(dna, k, profile):
    """Return, per DNA string, the k-mer scored highest by `get_probability`.

    The first k characters of a string are the default and stay selected when
    no window scores above 0; among equal scores the earliest window is kept.
    """
    selected = []

    for sequence in dna:
        top_kmer = sequence[:k]
        top_probability = 0

        # Examine each window of length k from left to right.
        for offset in range(len(sequence) - k + 1):
            window = sequence[offset:offset + k]
            probability = get_probability(window, profile)
            if probability > top_probability:
                top_probability, top_kmer = probability, window

        selected.append(top_kmer)

    return selected

def randomized_motif_search(
    dna: list[str], k: int, t: int, iterations: int = 5000
) -> list[str]:
    """Run randomized motif search with repeated random restarts.

    Each restart draws one random k-mer per DNA string, then alternates between
    building a profile from the current motifs and re-selecting the most
    probable k-mer per string, for as long as the score strictly improves.

    Args:
        dna: DNA strings to search.
        k: motif length.
        t: value forwarded to `get_profile` as its motif-count argument.
        iterations: number of random restarts (defaults to the previously
            hard-coded 5000).

    Returns:
        The lowest-scoring motif set seen over all restarts (an empty list when
        ``iterations == 0``).

    Note:
        `randint` is called with ``len(sample) - k`` as the upper bound, so a
        string shorter than ``k`` makes it raise.
    """
    # NOTE(review): an earlier debugging comment here suspected a fault in the
    # refinement steps; nothing in this function shows one — confirm against
    # the helpers if results still look wrong.
    final_motif = []
    best_score = sys.maxsize

    for _ in range(iterations):
        # Random starting point: one k-mer per DNA string.
        motifs = []
        for sample in dna:
            start = randint(0, len(sample) - k)
            motifs.append(sample[start:start + k])

        best_motifs = motifs
        best_motifs_score = get_score(best_motifs)

        # Refine until a pass fails to strictly improve the score.
        while True:
            profile = get_profile(motifs, t)
            motifs = get_motifs(dna, k, profile)
            current_score = get_score(motifs)
            if current_score < best_motifs_score:
                best_motifs = motifs
                best_motifs_score = current_score
            else:
                break

        # Keep the best result across restarts.
        if best_motifs_score < best_score:
            final_motif = best_motifs
            best_score = best_motifs_score

    return final_motif
                
