In [1]:
import sys, os, time
import numpy as np
from scipy import sparse
from matplotlib import pyplot as plt

In [2]:
def revcomp(x):
    d = {'A':'T','T':'A','C':'G','G':'C','N':'N'};
    out = [d[xi] for xi in x];
    out = out[::-1];
    out = ''.join(out);
    return out
    
def list_kmers(K):
    # Make a list of all K-mers
    acgt='ACGT';
    kmers = ['']
    for k in range(K):
        kmers_base=kmers.copy();
        kmers = []
        for kmer in kmers_base:
            for n in acgt:
                kmers.append(kmer+n)
                
    return kmers

def prune_kmers(kmers):
    # For each kmer, return the index
    # Map the seq and its reverse complement to the same index
    kmers_revcomp = [revcomp(x) for x in kmers];
    kd1 = {kmer: i for i,kmer in enumerate(kmers)}
    kd2 = {revcomp(kmer): i for i,kmer in enumerate(kmers)}
    kmer_index = [np.min((kd1[kmer],kd2[kmer])) for kmer in kmers]
    _, kmer_orig_index, kmer_index = np.unique(kmer_index, 
                                               return_index=True, return_inverse=True)
    return kmer_index, kmer_orig_index

def kmer_dict(K):
    kmers = list_kmers(K)
    kmer_index, kmer_orig_index  = prune_kmers(kmers)
    mydict = {kmer: kmer_index[i] for i,kmer in enumerate(kmers)}
    return mydict

def seq2kmers(seq):
    n = len(seq)
    mykmers = np.empty((n,K),dtype=str)
    for k in range(K): 
        mykmers[:len(seq)-k,k] = list(seq[k:])
    mykmers = mykmers[:-K,:]
    
    # Remove kmers that contain N or other unwanted letters
    good_kmers = [
        np.all([a in ['A','C','G','T'] for a in mykmer]) 
                 for mykmer in mykmers
    ]
    mykmers = mykmers[good_kmers,:]

    # Map k-mers to index
    kmer_indices = [mydict[''.join(x)] for x in mykmers]
    
    return kmer_indices


In [3]:
K=6
kdict = kmer_dict(K)

In [33]:
# Load K-mers that were counted by kmer-counter: https://www.biostars.org/p/268007/
# This seems to be ~5x faster than counting the k-mers directly in Python
fn='enhancer_data/kmers_6mers/count.bed'
nenh=302106
NK=np.max([i for i in kdict.values()])+1
# kmer_counts = sparse.lil_matrix((nenh,NK), dtype=np.int16)
kmer_counts = np.zeros((nenh,NK), dtype=np.int16)

rowvec = np.zeros((NK,1))
i=0;
tstart=time.time()
with open(fn,'r') as f:
    line='asdf'
    while (line):
        line=f.readline()
        (chrom,start,end,counts) = line.strip().split('\t')
    #     kmers = dict((kdict[k],int(v)) for (k,v) in [d.split(':') for d in counts.split(' ')])
        kmers = np.array([[kdict[k],int(v)] for (k,v) in [d.split(':') for d in counts.split(' ')]])

#         kmer_counts[i,:] = sparse.csr_matrix((kmers[:,1], (np.zeros(kmers.shape[0]),kmers[:,0])), shape=(1,NK))        
        kmer_counts[i,kmers[:,0]] = kmers[:,1]
    
        i+=1
        if (i % 10000 == 0):
            print('%d, t=%3.3f' % (i, time.time()-tstart))

10000, t=13.588
20000, t=27.070
30000, t=40.849
40000, t=54.395
50000, t=68.089
60000, t=81.756
70000, t=95.296
80000, t=109.169
90000, t=122.901
100000, t=136.573
110000, t=150.247
120000, t=163.679
130000, t=177.362
140000, t=191.203
150000, t=204.804
160000, t=218.629
170000, t=232.534
180000, t=246.120
190000, t=259.889
200000, t=273.602
210000, t=287.057
220000, t=300.950
230000, t=314.761
240000, t=328.466
250000, t=342.299
260000, t=355.868
270000, t=369.675
280000, t=383.300
290000, t=397.102
300000, t=410.778


ValueError: not enough values to unpack (expected 4, got 1)

In [40]:
np.save('enhancer_data/kmers_6mers/count.npy',kmer_counts)

In [42]:
np.save('enhancer_data/kmers_6mers/kmer_dict.npy',kdict)