In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

In [2]:
# Pre-processing parameters
# The k-mer length comes from the shared configuration file. The context
# manager closes the file handle as soon as the JSON has been parsed.
with open('config.json') as config_file:
    k = int(json.load(config_file)['k'])
# Characters accepted in a k-mer; k-mers containing anything else are dropped later on.
dna_sequence_character_set = ['A', 'C', 'G', 'T']

In [3]:
# Enumerate every possible DNA subsequence of length k over the given character set
def find_all_dna_subsequences(set, k):
    """Return all strings of length ``k`` that can be built from ``set``.

    Parameters:
        set: indexable collection of characters to combine.
        k: length of every generated subsequence.

    Returns:
        A NumPy array holding the subsequences in the order the recursive
        helper appends them (the first character varies slowest).
    """
    subsequences = []
    # The helper fills `subsequences` in place, starting from an empty prefix.
    find_all_dna_subsequences_recursive(set, "", len(set), k, subsequences)
    return np.asarray(subsequences)
 
# This recursive function is a helper function for the above function
def find_all_dna_subsequences_recursive(set, prefix, n, k, arr):
    """Append to ``arr`` every string formed by ``prefix`` plus ``k`` more characters.

    Parameters:
        set: indexable collection of characters; only the first ``n`` are used.
        prefix: string built so far by the enclosing recursion levels.
        n: number of characters of ``set`` to branch over.
        k: number of characters still to append; must be a non-negative integer.
        arr: list that receives the completed strings (mutated in place).

    Raises:
        ValueError: if ``k`` is negative (or becomes negative because it is not
            a whole number). Without this guard the ``k == 0`` base case is
            never reached and the recursion only stops with a RecursionError.
    """
    if k < 0:
        raise ValueError(f"k must be a non-negative integer, got {k!r}")

    # Base case: the prefix has reached the requested length.
    if k == 0:
        arr.append(prefix)
        return

    # Branch on each of the first n characters, one recursion level per position.
    for i in range(n):
        find_all_dna_subsequences_recursive(set, prefix + set[i], n, k - 1, arr)

        
# Reference ordering of all length-k subsequences; the position of a k-mer in
# this array is the position of its count in each representation vector.
reference_dna_vector = find_all_dna_subsequences(
    dna_sequence_character_set,
    k,
)

In [4]:
# Read the human dataset
# The number of classes selects which dataset file to load; the context
# manager closes the configuration file once it has been parsed.
with open('config.json') as config_file:
    num_classes = str(json.load(config_file)['num_classes'])
dataset_file_name = 'human_dataset_' + num_classes + '.txt'
# Parse the dataset file once and take both columns from the same frame.
human_dataset = pd.read_table('../Datasets/' + dataset_file_name)
dna_sequences = np.asarray(human_dataset['sequence'].to_list())
labels = human_dataset['class'].to_numpy()

In [5]:
# Hold out 20% of the sequences for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dna_sequences,
    labels,
    random_state=20,
    test_size=0.2,
)

In [6]:
# Extract the k-mers from a single DNA sequence
def k_mer(sequence, k):
    """Return every contiguous window of length ``k`` in ``sequence``.

    Parameters:
        sequence: the DNA sequence (any sliceable object with a length).
        k: window length.

    Returns:
        A NumPy array with ``len(sequence) - k + 1`` windows in left-to-right
        order; the array is empty when the sequence is shorter than ``k``.
    """
    window_count = len(sequence) - k + 1
    return np.asarray([sequence[start: start + k] for start in range(window_count)])

In [7]:
# Remove the k_mers that contain any character other than A, C, G, T like N
def remove_undefined_k_mers(k_mers, alphabet=None):
    """Keep only the k-mers made up entirely of allowed characters.

    Parameters:
        k_mers: iterable of k-mer strings.
        alphabet: iterable of allowed characters. When omitted, the
            module-level ``dna_sequence_character_set`` is used.

    Returns:
        A NumPy array with the k-mers that passed the filter, in their
        original order.
    """
    if alphabet is None:
        alphabet = dna_sequence_character_set
    allowed_characters = frozenset(alphabet)
    # issuperset walks the characters of the candidate and stops at the first
    # one that is not allowed, without building an intermediate list.
    return np.asarray([
        candidate for candidate in k_mers
        if allowed_characters.issuperset(candidate)
    ])

# Sanity checks for the filter: each case pairs an input list with the number
# of k-mers expected to survive.
for candidate_k_mers, expected_survivors in (
    (['ACGT', 'NACG', 'AAAAN'], 1),
    (['ACGTN', 'NACG', 'AAAAN'], 0),
    (['ACGT', 'ACG', 'AAAA'], 3),
):
    assert len(remove_undefined_k_mers(candidate_k_mers)) == expected_survivors

In [8]:
# Count the number of times each k-mer appears in a single DNA sequence
def spectral_representation(k_mers):
    """Build the k-mer count table of one DNA sequence.

    Parameters:
        k_mers: iterable of (hashable) k-mer strings from one sequence.

    Returns:
        A DataFrame indexed by distinct k-mer, in order of first appearance,
        whose single column ``0`` holds the occurrence counts. Column ``0``
        is present even when ``k_mers`` is empty, so ``result[0]`` can be
        read for sequences that contain no valid k-mer.
    """
    counts = Counter(k_mers)
    # Naming the column explicitly keeps it in the frame for an empty counter.
    return pd.DataFrame({0: list(counts.values())}, index=list(counts.keys()))

In [9]:
# Apply the function on all the elements of the DNA sequences array
def _as_object_array(items):
    """Pack ``items`` into a 1-D object array with exactly one slot per item.

    Filling the slots one by one stops NumPy from merging equally shaped
    arrays or DataFrames into a single multi-dimensional array, which is
    what ``np.asarray(items, dtype=object)`` does when every item happens to
    have the same shape.
    """
    packed = np.empty(len(items), dtype=object)
    for position, item in enumerate(items):
        packed[position] = item
    return packed


def _spectral_pipeline(sequences):
    """Return (valid k-mers, k-mer count tables) for every sequence in ``sequences``."""
    valid_k_mers = _as_object_array(
        [remove_undefined_k_mers(k_mer(sequence, k)) for sequence in sequences]
    )
    count_tables = _as_object_array(
        [spectral_representation(sequence_k_mers) for sequence_k_mers in valid_k_mers]
    )
    return valid_k_mers, count_tables


k_mers_train, dna_spectral_representations_train = _spectral_pipeline(X_train)

#====================================================================================================

k_mers_test, dna_spectral_representations_test = _spectral_pipeline(X_test)

In [10]:
# Pre-allocate one integer row per sequence and one column per possible k-mer
# (alphabet size raised to the power k) for both splits.
dna_vector_train, dna_vector_test = (
    np.zeros(
        (split.shape[0], np.power(len(dna_sequence_character_set), k)),
        dtype=int,
    )
    for split in (X_train, X_test)
)

In [11]:
def get_dna_reperesentation_vector(dna_spectral_representation_subsequences, dna_spectral_representation_frequency, reference_vector=None):
    """Scatter the k-mer counts of one sequence into a fixed-order count vector.

    Parameters:
        dna_spectral_representation_subsequences: indexable collection of the
            distinct k-mers found in the sequence.
        dna_spectral_representation_frequency: indexable collection of counts,
            aligned position by position with the subsequences.
        reference_vector: ordered collection of all possible k-mers that
            defines the output layout. When omitted, the module-level
            ``reference_dna_vector`` is used.

    Returns:
        An integer array of shape ``(1, len(reference_vector))`` where the
        slot of each known k-mer holds its count and all other slots are 0.
        K-mers that do not occur in the reference are ignored.
    """
    if reference_vector is None:
        reference_vector = reference_dna_vector

    # Build the k-mer -> position table once per call, so each k-mer costs a
    # single hash lookup instead of a full scan of the reference vector.
    # Assumes the reference entries are distinct; with duplicates only the
    # last position of a repeated k-mer would be filled.
    position_of = {
        subsequence: position
        for position, subsequence in enumerate(reference_vector)
    }

    dna_sequence_vector = np.zeros((1, len(reference_vector)), dtype=int)
    for i in range(len(dna_spectral_representation_subsequences)):
        position = position_of.get(dna_spectral_representation_subsequences[i])
        if position is not None:
            dna_sequence_vector[0][position] = dna_spectral_representation_frequency[i]
    return dna_sequence_vector

In [12]:
# Fill one row per sequence: the index of a sequence's count table holds its
# k-mers and column 0 holds their frequencies.
for i in range(X_train.shape[0]):
    train_table = dna_spectral_representations_train[i]
    dna_vector_train[i] = get_dna_reperesentation_vector(
        train_table.index.values,
        train_table[0].values,
    )

for i in range(X_test.shape[0]):
    test_table = dna_spectral_representations_test[i]
    dna_vector_test[i] = get_dna_reperesentation_vector(
        test_table.index.values,
        test_table[0].values,
    )

In [13]:
# Testing the pre-processing module
# The counts written into each row must add up to the total of that sequence's
# count table. The test split is checked as well as the training split, since
# both matrices are saved and used downstream.
for split_vectors, split_tables in (
    (dna_vector_train, dna_spectral_representations_train),
    (dna_vector_test, dna_spectral_representations_test),
):
    for i in range(len(split_vectors)):
        vector_total = np.sum(split_vectors[i])
        table_total = np.sum(split_tables[i][0].values)
        assert vector_total == table_total, f'{i} {vector_total}, {table_total}'

In [14]:
# Persist the raw test sequences, the labels of both splits and the
# pre-processed count matrices, in the same order as before.
for destination, payload in (
    ('../Saved Data/Test Set/X_test.npy', X_test),
    ('../Saved Data/Test Set/y_test.npy', y_test),
    ('../Saved Data/Test Set/X_test_pre-processed.npy', dna_vector_test),
    ('../Saved Data/Training Set/dna_spectral_representation.npy', dna_vector_train),
    ('../Saved Data/Training Set/labels.npy', y_train),
):
    np.save(destination, payload)