### 2-bit encoding of k-mers

##### 📌 In this notebook:
- Read DNA k-mers and their counts from a file
- Convert each k-mer into a compact 2-bit-per-base encoded integer



Imports

In [14]:
import numpy as np

file loading

In [15]:
# Path (relative to the working directory) of the text file holding one
# whitespace-separated "<k-mer> <count>" pair per line.
file_path = "kmc_1M.txt"


Brute-force approach

In [16]:
def encode_dna(dna_sequence):
    """Return the 2-bit code of a DNA sequence as a string of '0'/'1' characters.

    Every base contributes two characters (A -> '00', C -> '01', G -> '10',
    T -> '11'); the pieces are concatenated in sequence order, so the result
    is twice as long as the input.

    Raises:
        KeyError: if the sequence contains anything other than upper-case
            A, C, G or T.
    """
    two_bit_codes = {'A': '00', 'C': '01', 'G': '10', 'T': '11'}

    pieces = []
    for nucleotide in dna_sequence:
        pieces.append(two_bit_codes[nucleotide])
    return ''.join(pieces)


In [17]:
# Quick check of encode_dna on a short sequence; prints the bit string.
dna = "TTATACGT"
encoded = encode_dna(dna)
print(encoded)  

1111001100011011


##### Method 1: Lookup Method for bit encoding using shift operators

In [18]:
# 2-bit integer code per base, in order: A = 0b00, C = 0b01, G = 0b10, T = 0b11.
base_bits = dict(zip("ACGT", range(4)))


In [19]:
def encode_kmer(seq):
    """Pack a k-mer into a single integer, two bits per base.

    Each base is looked up in the module-level ``base_bits`` table and
    appended from left to right, so the first base ends up in the most
    significant bits. An empty sequence yields 0.

    Raises:
        KeyError: if a character of ``seq`` is not a key of ``base_bits``.
    """
    packed = 0
    for nucleotide in seq:
        # Make room for the next base, then drop its 2-bit code into the gap.
        packed <<= 2
        packed |= base_bits[nucleotide]
    return packed


In [20]:

# Read the k-mer count file and encode every k-mer with the lookup method.
# Each non-empty line is expected to hold "<k-mer> <count>" separated by
# whitespace; a line with a different number of fields raises ValueError.
encoded_kmers = []
counts = []
max_kmer_length = 0  # longest k-mer seen; decides how many bits are needed

with open(file_path, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        seq, count = line.split()
        max_kmer_length = max(max_kmer_length, len(seq))
        encoded_kmers.append(encode_kmer(seq))
        counts.append(int(count))

# Two bits per base: uint32 holds k <= 16, uint64 holds k <= 32. uint32 stays
# the default so results are unchanged for the k <= 16 case.
if max_kmer_length > 32:
    raise ValueError(
        f"k-mer length {max_kmer_length} needs {2 * max_kmer_length} bits, "
        "which does not fit in a 64-bit integer"
    )
kmer_dtype = np.uint32 if max_kmer_length <= 16 else np.uint64

encoded_kmers = np.array(encoded_kmers, dtype=kmer_dtype)
counts = np.array(counts, dtype=np.uint32)

# Preview the first ten encodings as zero-padded binary (at least 16 digits).
for encoded in encoded_kmers[:10]:
    print(f"{encoded:016b}")

print(encoded_kmers.shape)

0001001000000011
0001001111110111
0001110101001101
0010010010001000
0010010011110111
0010011110100010
0010111011111010
0011000100010001
0011000101001111
0011101001101110
(1000000,)


Disadvantage: every base requires a dictionary lookup, which is slow.

##### Method 2: Bit encoding of k-mers using ASCII values and bit masking

In [23]:
def bit_encoding_mask(seq):
    """Pack a k-mer into one integer using bits 1-2 of each character's code point.

    ``(ord(char) >> 1) & 0b11`` yields A -> 00, C -> 01, T -> 10, G -> 11 for
    both upper- and lower-case letters, so no lookup table is needed. Note
    that G and T are swapped compared with the alphabetical
    A/C/G/T = 00/01/10/11 assignment.

    Characters other than A/C/G/T are not rejected: they are masked the same
    way and silently contribute some 2-bit value.

    The first character ends up in the most significant bits; an empty
    sequence yields 0.
    """
    packed = 0
    for nucleotide in seq:
        packed <<= 2
        packed |= (ord(nucleotide) >> 1) & 0b11
    return packed

In [38]:
# Read the k-mer count file and encode every k-mer with the ASCII-mask method.
# Each non-empty line is expected to hold "<k-mer> <count>" separated by
# whitespace; a line with a different number of fields raises ValueError.
encoded_data = []
# Longest k-mer seen. Initialised up front so it is defined even when the file
# has no data lines, and tracked as a maximum so it does not depend on which
# line happens to come last.
kmer_length = 0

with open(file_path, "r") as file:
    for line in file:
        fields = line.split()  # split() already ignores surrounding whitespace
        if not fields:
            continue
        kmer, count = fields
        kmer_length = max(kmer_length, len(kmer))

        encoded_kmer = bit_encoding_mask(kmer)
        encoded_data.append([encoded_kmer, int(count)])


In [39]:
# Choose dtype
# Choose the narrowest unsigned dtype that can hold every value in
# encoded_data. The encoded k-mers need 2 bits per base (k <= 8 -> uint16,
# k <= 16 -> uint32, k <= 32 -> uint64). The counts are stored in the same
# array, so the dtype must also be wide enough for the largest count.
max_count = max((row[1] for row in encoded_data), default=0)

for candidate in (np.uint16, np.uint32, np.uint64):
    limits = np.iinfo(candidate)
    if 2 * kmer_length <= limits.bits and max_count <= limits.max:
        dtype = candidate
        break
else:
    # Fail here with a clear message instead of overflowing when the array is built.
    raise ValueError(
        f"k-mer length {kmer_length} (needs {2 * kmer_length} bits) or "
        f"max count {max_count} does not fit in a 64-bit unsigned integer"
    )
print(dtype)


<class 'numpy.uint32'>


In [41]:
# Pack the [encoded k-mer, count] pairs into one 2-D array of the chosen dtype.
np_encoded_array = np.array(encoded_data, dtype=dtype)

print("NumPy array shape:", np_encoded_array.shape)
print("First 5 rows:")
# Show the first rows with the encoding as zero-padded binary (at least 32 digits).
for row in np_encoded_array[:5]:
    encoded, count = row
    print(f"Encoded: {encoded:032b} → Count: {count}")

NumPy array shape: (1000000, 2)
First 5 rows:
Encoded: 00000000000000000001001100000010 → Count: 193
Encoded: 00000000000000000001001010100110 → Count: 40
Encoded: 00000000000000000001100101001001 → Count: 42
Encoded: 00000000000000000011010011001100 → Count: 96
Encoded: 00000000000000000011010010100110 → Count: 75
