In [1]:
import itertools
import json
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Pre-processing parameters
# `k` is the k-mer length, read from config.json. The `with` block closes the
# file handle as soon as the JSON has been parsed instead of leaving it open.
with open('config.json') as config_file:
    k = int(json.load(config_file)['k'])
dna_sequence_character_set = ['A', 'C', 'G', 'T']
print(k)

5


In [3]:
# The following function returns all the possible DNA subsequences of length k
def find_all_dna_subsequences(set, k):
    """Enumerate every string of length `k` over the given characters.

    Parameters
    ----------
    set : iterable of str
        The characters to combine (the parameter name is kept for
        compatibility with existing callers, although it shadows the builtin).
    k : int
        Length of each generated subsequence; must be non-negative.

    Returns
    -------
    numpy.ndarray
        One entry per combination, `len(set) ** k` in total. The last position
        varies fastest, so the order follows the order of `set`.

    Raises
    ------
    ValueError
        If `k` is negative (raised by `itertools.product`).
    """
    # itertools.product with repeat=k yields the combinations in the same order
    # as a depth-first expansion over `set`, without any recursion depth or
    # shared accumulator list.
    return np.asarray(["".join(characters) for characters in itertools.product(set, repeat=k)])
 
# Recursive helper: appends to `arr` every string formed by `prefix` plus k more characters
def find_all_dna_subsequences_recursive(set, prefix, n, k, arr):
    """Expand `prefix` depth-first until `k` more characters have been added.

    Each call extends `prefix` with each of the first `n` entries of `set`
    in turn and recurses with `k - 1`. Once `k` reaches 0 the completed
    string is appended to `arr`. Results are collected in `arr`; nothing is
    returned.
    """
    if k != 0:
        for position in range(n):
            find_all_dna_subsequences_recursive(set, prefix + set[position], n, k - 1, arr)
    else:
        arr.append(prefix)

In [4]:
# Load the human dataset and keep its 'sequence' column as a NumPy array
human_dataset = pd.read_table('../Datasets/human_dataset.txt')
dna_sequences = np.asarray(human_dataset['sequence'].to_list())

In [5]:
# Extract the k-mers (all overlapping windows of length k) from a single DNA sequence
def k_mer(sequence, k):
    """Return every length-`k` window of `sequence`, in order, as a NumPy array.

    When `sequence` is shorter than `k` there are no windows and the result
    is an empty array.
    """
    window_count = len(sequence) - k + 1
    windows = [sequence[start: start + k] for start in range(window_count)]
    return np.asarray(windows)

# Extract the k-mers of every DNA sequence (one k-mer array per sequence)
k_mers = np.asarray(
    [k_mer(dna_sequence, k) for dna_sequence in dna_sequences],
    dtype=object,
)

In [6]:
# Keep only the k-mers made up entirely of A, C, G, T (drops k-mers containing e.g. N)
def remove_undefined_k_mers(k_mers):
    """Return, as a NumPy array, the k-mers whose characters all belong to
    `dna_sequence_character_set`."""
    defined_k_mers = []
    for candidate in k_mers:
        if all(character in dna_sequence_character_set for character in candidate):
            defined_k_mers.append(candidate)
    return np.asarray(defined_k_mers)

# Sanity checks for the filter
assert len(remove_undefined_k_mers(['ACGT', 'NACG', 'AAAAN'])) == 1
assert len(remove_undefined_k_mers(['ACGTN', 'NACG', 'AAAAN'])) == 0
assert len(remove_undefined_k_mers(['ACGT', 'ACG', 'AAAA'])) == 3

In [7]:
# Filter every sequence's k-mers, dropping those that contain undefined characters
k_mers = np.asarray(
    [remove_undefined_k_mers(sequence_k_mers) for sequence_k_mers in k_mers],
    dtype=object,
)

In [8]:
# Count the number of times each k-mer appears in a single DNA sequence
def spectral_representation(k_mers):
    """Build the k-mer frequency table of one DNA sequence.

    Parameters
    ----------
    k_mers : iterable of hashable
        The k-mers extracted from a single sequence.

    Returns
    -------
    pandas.DataFrame
        Indexed by distinct k-mer, with the occurrence count in column ``0``.
        The column is named explicitly so that an empty input still produces
        a frame that has column ``0`` (with no rows). Without it, an empty
        input gives a frame with no columns at all and ``frame[0]`` raises
        ``KeyError``.
    """
    return pd.DataFrame.from_dict(Counter(k_mers), orient='index', columns=[0])

# Store each sequence's frequency table in a pre-allocated 1-D object array.
# Filling it slot by slot keeps every entry a DataFrame. Passing a list of
# equally-shaped DataFrames to np.asarray would instead unpack them into a
# multi-dimensional array of their values (always the case with one sequence),
# and the `.index` lookups on the entries would then fail.
dna_spectral_representations = np.empty(len(k_mers), dtype=object)
for sequence_index, sequence_k_mers in enumerate(k_mers):
    dna_spectral_representations[sequence_index] = spectral_representation(sequence_k_mers)

In [9]:
# Every possible k-mer over the character set; its order defines the columns of `dna_vector`
reference_dna_vector = find_all_dna_subsequences(dna_sequence_character_set, k)

# One row per DNA sequence, one column per possible k-mer, initialised to zero
sequence_count = dna_sequences.shape[0]
possible_k_mer_count = np.power(len(dna_sequence_character_set), k)
dna_vector = np.zeros((sequence_count, possible_k_mer_count), dtype=int)

In [10]:
def get_dna_reperesentation_vector(dna_spectral_representation_subsequences, dna_spectral_representation_frequency):
    """Lay the k-mer counts of one sequence out along `reference_dna_vector`.

    For each entry of `dna_spectral_representation_subsequences`, the
    position(s) where it occurs in `reference_dna_vector` receive the value at
    the same index of `dna_spectral_representation_frequency`. Entries that do
    not occur in the reference leave the vector unchanged.

    Returns an integer array of shape
    ``(1, len(dna_sequence_character_set) ** k)``.
    """
    vector_length = np.power(len(dna_sequence_character_set), k)
    dna_sequence_vector = np.zeros((1, vector_length), dtype=int)
    for position, subsequence in enumerate(dna_spectral_representation_subsequences):
        # Positions of this k-mer in the reference ordering (empty when it is not listed)
        matches = np.where(reference_dna_vector == subsequence)[0]
        dna_sequence_vector[0, matches] = dna_spectral_representation_frequency[position]
    return dna_sequence_vector

In [11]:
# Fill one row of `dna_vector` per DNA sequence from that sequence's k-mer frequency table
for i in range(dna_sequences.shape[0]):
    sequence_spectrum = dna_spectral_representations[i]
    dna_vector[i] = get_dna_reperesentation_vector(
        sequence_spectrum.index.values,
        sequence_spectrum[0].values,
    )

In [12]:
# Check the pre-processing: each row of `dna_vector` must sum to the total k-mer count of its sequence
for i in range(len(dna_sequences)):
    vector_total = np.sum(dna_vector[i])
    spectrum_total = np.sum(dna_spectral_representations[i][0].values)
    assert vector_total == spectrum_total, f'{i} {np.sum(dna_vector[i])}, {np.sum(dna_spectral_representations[i][0].values)}'

In [13]:
# Write the k-mer count matrix to disk as a .npy file
np.save(file='../Saved Data/dna_spectral_representation.npy', arr=dna_vector)