Example of a dict application in biology: counting k-mers in DNA sequences

A k-mer is a substring of length k built from the DNA bases A, C, G, T.

1-mers: A, C, G, T

2-mers: AA, AC, AG, AT, ...

3-mers: AAA, AAC, ...



In [1]:
def generate_dna_kmers(k):
    '''
    Return a list of all possible strings of length k using only the
    chars A, C, T, and G.

    The result has 4**k entries. For k == 0 it is [''] (the single
    empty string). The first character varies fastest in the output
    order, e.g. k == 2 gives ['AA', 'CA', 'TA', 'GA', 'AC', ...].

    Raises ValueError if k is negative.
    '''
    if k < 0:
        raise ValueError("k must be non-negative")

    bases = ['A', 'C', 'T', 'G']

    # Seed with the only 0-mer (the empty string) and extend k times, so
    # that k == 0 correctly yields [''] instead of the list of 1-mers.
    kmers = ['']
    for _ in range(k):
        # Outer loop over the appended base, inner loop over the shorter
        # strings: every existing string is extended by every base.
        kmers = [prefix + base for base in bases for prefix in kmers]
    return kmers

In [4]:
def counter_mer(mer, seq):
    '''
    Return how many windows of seq are equal to mer.

    Every start position is examined, so overlapping occurrences are
    each counted. If mer is longer than seq the result is 0.

    sample use: counter_mer("GGG", "AGGGCGGG") => 2
    '''
    width = len(mer)
    # Last valid window start is len(seq) - width, hence the +1 on the bound.
    n_windows = len(seq) - width + 1
    return sum(
        1
        for start in range(n_windows)
        if seq[start:start + width] == mer
    )

In [5]:
# "GGG" starts at indices 1 and 5 of "AGGGCGGG", so the expected count is 2.
counter_mer("GGG", "AGGGCGGG")

2

In [6]:
def kmer_count(k, seq):
    '''
    Count every substring of length k that occurs in seq.

    Returns a dict mapping each length-k substring to the number of
    (possibly overlapping) times it appears. If seq is shorter than k
    the dict is empty.

    sample use: kmer_count(2, "AAAC") => {"AA": 2, "AC": 1}
    '''
    rv = {}
    for i in range(len(seq) - k + 1):
        subseq = seq[i:i+k]
        if subseq in rv:
            rv[subseq] += 1
        else:
            # First sighting counts as exactly one occurrence.
            rv[subseq] = 1
    # Hand the tally back to the caller (it was previously discarded).
    return rv

In [8]:
def kmer_count_leet(k, seq):
    '''
    Build a dict mapping each length-k window of seq to the number of
    times that window occurs (overlapping windows included).

    Keys appear in order of first occurrence in seq.
    '''
    # Lazily produce each window, left to right.
    windows = (seq[start:start + k] for start in range(len(seq) - k + 1))

    tally = {}
    for window in windows:
        if window in tally:
            tally[window] += 1
        else:
            tally[window] = 1
    return tally

In [22]:
import numpy as np
# Sample data: 1000 random integers from 0 to 9 (randint's upper bound, 10,
# is exclusive), converted from a NumPy array to a plain Python list.
nums = list(np.random.randint(0,10,1000))
# Cell output: confirms the list holds 1000 elements.
len(nums)

1000

In [23]:
def count_k(k,nums):
    '''
    Return a dict mapping each distinct element of nums to the number
    of times it occurs.

    Note: the parameter k is accepted but is not used by this function.
    '''
    tally = {}
    for value in nums:
        try:
            tally[value] += 1
        except KeyError:
            # First time this value is seen.
            tally[value] = 1
    return tally

In [24]:
# Tally how often each value occurs in nums (count_k does not use its first argument, 10).
d = count_k(10,nums)

In [28]:
# Five most frequent values: sort the (value, count) pairs by count, descending, and keep the first five.
{k:v for k,v in sorted(d.items(), key=lambda item: item[1],reverse=True)[:5]}

{1: 112, 5: 108, 7: 104, 9: 103, 8: 102}