In [8]:
import pandas as pd
from pathlib import Path
import numpy as np 

In [9]:
def compute_kmer_counts(seq, k, alphabet = None, do_sliding = True):
    
    if(not alphabet):
        alphabet = list(set(list(seq)))
        alphabet.sort()

    kmers_list = []
    kmers_list.append(alphabet)
    
    for k_idx in range(k-1):
        kmers_list.append([a+b for a in kmers_list[k_idx] for b in alphabet ])

    kmers = kmers_list[-1] # find the frquencies of the kmers with the largest k

    seq_lst = list(seq)
    subs = ["".join(seq_lst[idx:idx+k]) for idx in range(len(seq)-k+1)]
    kmers_counts = [subs.count(kmer) for kmer in kmers]
    
    #print(kmers,kmers_counts)
    
    return kmers_counts

In [10]:
# print(len(compute_kmer_counts("ACTGNNN", k, list("ACTG"))))

# cat lnc_pdiv.txt lnc_pint.txt > lnc_p.txt
# cat lnc_pdiv_noN.txt lnc_pint_noN.txt > lnc_p_noN.txt

In [11]:
inputseq_dir = Path('/disks/data/paper_projects/Fantom/data/seq')

seqfiles = [inputseq_dir/'lnc_e_noN.txt', inputseq_dir/'lnc_p_noN.txt', inputseq_dir/'lnc_pdiv_noN.txt', inputseq_dir/'lnc_pint_noN.txt']
alphabet = list("ACGT")

seqfiles = [inputseq_dir/'lnc_e.txt', inputseq_dir/'lnc_p.txt', inputseq_dir/'lnc_pdiv.txt', inputseq_dir/'lnc_pint.txt']
alphabet = list("ACGNT")


seqfile = seqfiles[3]
kmers = 3

for seqfile in seqfiles:
    
    for kmer in range(1, kmers + 1):

        outfile = Path(str(seqfile).replace('.txt', f'_{kmer}mer_features.csv'))
        print(outfile)

        df = pd.read_csv(seqfile, header = None)

        print(df.head())

        print(df.shape, len(df.iloc[1,:].values.item()))

        sequences = [df.iloc[i,:].values.item() for i in range(df.shape[0]) ]

        split_ranges = [(0,200), (200,400), (400,600)]

        kmer_counts_seq_splits = []
        for i, sequence in enumerate(sequences):
            #kmer_counts_splits = []
            #for k in range(1,kmer+1):
            split_kmer_counts = []
            for j, split_range in enumerate(split_ranges):
                seq = sequence[slice(*split_range) ]
                #kmer_counts = np.array(compute_kmer_counts("ACTGNNN", k, list("ACTG"))).reshape(1,-1)
                kmer_counts = np.array(compute_kmer_counts(seq, kmer, alphabet)).reshape(1,-1)
                split_kmer_counts.append(kmer_counts)
            split_kmer_counts = np.concatenate(split_kmer_counts, axis = 1)
            #kmer_counts_splits.append(split_kmer_counts)
            kmer_counts_seq_splits.append(split_kmer_counts)
            
            #kmer_counts_splits = np.concatenate(kmer_counts_splits, axis = 1)
            #kmer_counts_seq_splits.append(kmer_counts_splits)
        #kmer_counts_seq_splits = np.concatenate(kmer_counts_seq_splits, axis = 0)
        #kmer_counts_seq_splits = np.concatenate(kmer_counts_splits, axis = 0)
        kmer_counts_seq_splits = np.concatenate(kmer_counts_seq_splits, axis = 0)
        print(kmer_counts_seq_splits.shape)

        np.savetxt(outfile, kmer_counts_seq_splits, fmt = '%d', delimiter = '\t')

/disks/data/paper_projects/Fantom/data/seq/lnc_e_1mer_features.csv
                                                   0
0  GGAGTGACTCACTGAACTACCAGCAGGCGCCCCCGACACAGGCGAG...
1  TGAGATGCAGTTTTGCTCTTGTTGCCCAGGCTGGAGTGCAACGGCT...
2  TTACCAAGTGCCTCTCGCATGGGAGTTTCTATACAAAGCCCTGGGC...
3  GGTAACAAAACTAAGGAGTGAAAAACTCTTCAGGCTAAGTTATTGA...
4  TCTCAATAATAATAATAATAATAATAATAATAATAATAATAACACT...
(3021, 1) 600
(3021, 15)
/disks/data/paper_projects/Fantom/data/seq/lnc_e_2mer_features.csv
                                                   0
0  GGAGTGACTCACTGAACTACCAGCAGGCGCCCCCGACACAGGCGAG...
1  TGAGATGCAGTTTTGCTCTTGTTGCCCAGGCTGGAGTGCAACGGCT...
2  TTACCAAGTGCCTCTCGCATGGGAGTTTCTATACAAAGCCCTGGGC...
3  GGTAACAAAACTAAGGAGTGAAAAACTCTTCAGGCTAAGTTATTGA...
4  TCTCAATAATAATAATAATAATAATAATAATAATAATAATAACACT...
(3021, 1) 600
(3021, 75)
/disks/data/paper_projects/Fantom/data/seq/lnc_e_3mer_features.csv
                                                   0
0  GGAGTGACTCACTGAACTACCAGCAGGCGCCCCCGACACAGGCGAG...
1  TGAG

In [24]:
kmer_counts_seq_splits.shape

(1, 192)

In [21]:
kmer_counts_seq_splits

array([[ 3,  2,  0,  7,  5,  9,  4,  1,  0,  4,  0,  1,  8,  2,  2,  6,
         5, 10,  3,  6, 13, 18,  6,  1,  0,  5,  2,  4,  3,  0,  0,  0,
         1,  0,  0,  0,  4,  9,  0,  1,  1,  2,  3,  1,  3,  1,  0,  2,
         3,  7,  2,  5,  2,  2,  1,  0,  0,  3,  2,  0,  3,  2,  3,  5,
         0,  4,  1,  1,  1, 13,  7,  2,  0,  4,  0,  1,  1,  1,  1,  1,
         3, 14,  2,  3, 15, 29,  6,  6,  1,  9,  2,  3,  0,  4,  1,  5,
         1,  2,  0,  0,  6,  8,  1,  1,  1,  2,  0,  0,  1,  2,  2,  0,
         2,  3,  2,  0,  1,  6,  1,  0,  1,  1,  1,  1,  5,  1,  0,  5,
         2,  1,  2,  4,  2,  3,  2,  1,  3,  1,  0,  0,  9,  5,  0, 13,
         2,  1,  0,  7,  5,  6,  3,  0,  1,  5,  2, 11,  1,  2,  1,  2,
         2,  1,  0,  1,  1,  0,  6,  2,  0,  3,  0,  1,  8,  3,  1,  2,
         3,  5,  2, 15,  2,  5,  9,  3,  0,  0,  2,  2,  6,  9,  2,  5]])

In [22]:
df_tmp = pd.read_csv(outfile)

In [23]:
df_tmp.shape

(0, 1)

In [119]:
# [s[:5] for s in sequences[:10]]

In [120]:

# kmer_count = compute_kmer_counts("ACTGNNN", 2, list("ACTG"))

# kmer_count

(1862, 252)