In [2]:
# import packages
import numpy as np
import pandas as pd
import timeit
import time
from sklearn import preprocessing
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pickle as pkl
from umap.umap_ import UMAP
import umap.plot

## Parsing Input Data

First, some dataset statistics. We load our training labels (ground truth) and count how many reads each category (species) has.

In [3]:
# Load the training labels; the cell's output is the number of reads per species.
label_df = pd.read_csv('./training_data/train_labels.csv')
label_df['species_name'].value_counts()

decoy                              413476
burkholderia_pseudomallei            3533
pseudomonas_aeruginosa               3126
klebsiella_michiganensis             2989
mycobacterium_ulcerans               2910
klebsiella_pneumoniae                2806
serratia_liquefaciens                2629
vibrio_parahaemolyticus              2579
salmonella_enterica_typhimurium      2507
yersinia_enterocolitica              2276
stenotrophomonas_maltophilia         2217
mycobacterium_tuberculosis           2175
clostridioides_difficile             2007
acinetobacter_baumannii              1964
legionella_pneumophila               1753
listeria_monocytogenes               1479
staphylococcus_aureus                1384
staphylococcus_pseudintermedius      1328
corynebacterium_ulcerans             1266
corynebacterium_diphtheriae          1194
streptococcus_suis                   1092
neisseria_gonorrhoeae                1087
streptococcus_agalactiae             1060
streptococcus_pneumoniae          

In [4]:
# Encode the ground-truth species names as integer class labels.
# Your trained model will predict in this label space (pathogens and decoy).

# Fit the encoder on the distinct species names, then map every read's species to its class index.
le = preprocessing.LabelEncoder().fit(label_df['species_name'].unique())
y_index = le.transform(label_df['species_name'].values)

# Keep the integer labels next to the species names; CS4220Dataset reads the "labels" column.
label_df['labels'] = y_index

In [5]:
# Load the dictionary that maps each k-mer to its column index in the k-mer profile.
# A k-mer and its reverse complement are mapped to the same index.

import json

with open("./training_data/6-mers.json", 'r') as dict_file:
    canonical_kmer_dict = json.loads(dict_file.read())

In [21]:
# We define a utility function here that turns sequences to their 6-mer profiles.

def sequence_to_kmer_profile(sequence : str, k : int = 6):
    """
    Return the normalised k-mer profile of the input sequence (string).

    Every window of length `k` in `sequence` is looked up in the module-level
    `canonical_kmer_dict`; the count at the mapped index is incremented. Windows
    that are not keys of the dictionary are counted in the last entry of the profile.

    Args:
        - sequence (`str`): The read to profile.
        - k (`int`): Window length. Windows are matched against the keys of
                     `canonical_kmer_dict`, so `k` should equal the key length.

    Returns:
        `np.ndarray` with one entry per distinct index in `canonical_kmer_dict`.
        Entries are relative frequencies that sum to 1. If the sequence has no
        window of length `k` (empty or shorter than `k`), an all-zero profile is
        returned instead of dividing by zero and producing NaNs.
    """
    res = np.zeros(len(set(canonical_kmer_dict.values())))

    for start in range(len(sequence) - k + 1):
        # Unknown k-mers fall back to index -1, i.e. the last bucket.
        res[canonical_kmer_dict.get(sequence[start:start + k], -1)] += 1

    total = res.sum()
    if total > 0:
        # Only normalise when at least one window was counted; 0/0 would yield NaN.
        res /= total
    return res

In [23]:
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader

class CS4220Dataset(Dataset):
    """
    PyTorch `Dataset` over CS4220 reads, stored either as raw reads (fasta) or as
    pre-computed k-mer profiles (.npy).

    After construction, `self.X` holds the samples (a numpy matrix of k-mer profiles, or a
    list of read strings when profiling on the fly) and `self.Y` holds the labels
    (a list, or `None` when no `label_df` was given).
    """

    # File extensions treated as raw-read fasta input (matched case-insensitively).
    _FASTA_EXTENSIONS = (".fasta", ".fa", ".fna")

    def __init__(self, data_file, label_df=None, k=6, samples_index=None, kmer_profile_on_the_fly=False, dtype=np.float32):
        """
        Dataset class to load large CS4220 sequence database.

        Args:
            - data_file (`str`): Can either be a *.fasta / *.fa / *.fna file if the input is raw reads,
                                 or a *.npy file if the input is k-mer profiles. The type is decided
                                 from the end of the file name.
            - label_df (`pd.DataFrame` or `None`): A dataframe with "labels" column indicating the label
                                                   of the data (must match with data_file), or `None` if there is
                                                   no label (in the case of test sets).
            - k (`int`): The length of k-mer. We use 6 in this project.
            - samples_index (`List` or `None`): list of (positional) indices of data we sample from the data file.
                                                Set this to `None` if you want to use all the data.
                                                Note that the data file itself is still read in full before
                                                the sampled rows are selected.
            - kmer_profile_on_the_fly (`bool`): If input data_file is raw reads and this set to `True`,
                                                we will build k-mer profile on the fly. This is helpful if you want to
                                                alter the input sequences during training, or the k-mer profile can't fit in memory.
                                                Otherwise, we build k-mer profile in advance, which will speed up the
                                                training process. Ignored for *.npy input, which already
                                                contains profiles.
            - dtype: numpy type to store the k-mer profile. You may use, for example, `np.float32` for better precision,
                     or `np.float16` for smaller memory usage. Only applies to raw-read input; a *.npy file
                     keeps whatever dtype it was saved with.

        Raises:
            TypeError: if `data_file` has neither a fasta nor a .npy extension.
        """
        self.data_file = data_file

        # Decide the input type from the file extension. Matching the *end* of the name
        # (rather than any substring) avoids misclassifying paths such as "x.fa/y.npy".
        file_name = str(data_file).lower()
        if file_name.endswith(self._FASTA_EXTENSIONS):
            self.is_raw_reads = True
        elif file_name.endswith(".npy"):
            self.is_raw_reads = False
        else:
            raise TypeError("The input file must be either a fasta file containing raw reads (.fasta, .fa, .fna) or a numpy file containing k-mer profiles (.npy).")

        self.label_df = label_df

        # On-the-fly profiling only makes sense for raw reads; a .npy file already holds
        # profiles, so __getitem__ must not try to profile its rows again.
        self.kmer_profile_otf = bool(kmer_profile_on_the_fly) and self.is_raw_reads

        # k-mer length, set to be 6.
        self.k = k

        # the samples we take from the read dataset
        self.samples_index = samples_index

        self.dtype = dtype

        # Load the data and store in self.X and self.Y
        self.X = []
        self.Y = []
        self._read_labels()
        self._read_data()

    def _read_labels(self):
        """
        Read the labels and record them in self.Y (`None` when there is no label_df).
        """
        if self.label_df is None:
            self.Y = None
        elif self.samples_index is None:
            # Load the whole dataset
            self.Y = list(self.label_df["labels"])
        else:
            # Load only the data corresponding to the sampled (positional) index
            self.Y = list(self.label_df.iloc[self.samples_index]["labels"])

    def _read_data(self):
        """
        Load the samples into self.X.

        - *.npy input: the stored matrix, restricted to `samples_index` rows if given.
        - fasta input, on the fly: a list of read strings.
        - fasta input, in advance: a (num_reads, num_kmer_indices) matrix of k-mer profiles.

        The fasta parser assumes exactly two lines per record (header line, then the
        whole sequence on one line); read `j` is taken from line `2 * j + 1`.
        """
        if not self.is_raw_reads:
            # Read the .npy file, and load the numpy matrix
            # Each row corresponds to a read, and each column corresponds to a k-mer (see training_data/6-mers.txt).
            self.X = np.load(self.data_file)
            if self.samples_index is not None:
                self.X = self.X[self.samples_index, :]
            return

        # Read the fasta file
        with open(self.data_file, 'r') as fasta_file:
            lines = fasta_file.readlines()

        read_range = self.samples_index if self.samples_index is not None else range(len(lines) // 2)
        if not self.kmer_profile_otf:
            # Pre-allocate one profile row per selected read.
            self.X = np.zeros(
                (len(read_range), len(set(canonical_kmer_dict.values()))),
                dtype=self.dtype
            )

        for i, j in enumerate(tqdm(read_range, desc=f"Parsing fasta file {self.data_file}")):
            read = lines[j * 2 + 1].strip()
            if self.kmer_profile_otf:
                # Profiling on the fly: simply store the read; __getitem__ profiles it later.
                self.X.append(read)
            else:
                # Profiling in advance: costs time now, but none during training/testing.
                self.X[i, :] = sequence_to_kmer_profile(read, self.k)

    def __len__(self):
        """Number of samples held in self.X."""
        return len(self.X)

    def __getitem__(self, idx):
        """
        If you are using pytorch, this function helps taking data points during each epoch
        of your training.

        Returns:
            `(read_tensor, label)` where `read_tensor` is the k-mer profile of sample `idx`
            and `label` is its entry in self.Y, or `None` when the dataset has no labels.
            NOTE: with `label=None`, PyTorch's default batch collation may reject the
            sample; unlabeled datasets likely need a custom `collate_fn`.
        """
        x = self.X[idx]
        if self.kmer_profile_otf:
            # self.dtype is a numpy dtype, which torch.tensor(..., dtype=...) does not accept;
            # cast on the numpy side and let torch infer the matching tensor dtype.
            profile = np.asarray(sequence_to_kmer_profile(x, self.k), dtype=self.dtype)
            read_tensor = torch.as_tensor(profile)
        else:
            read_tensor = torch.tensor(x)

        label = self.Y[idx] if self.Y is not None else None
        return read_tensor, label


# Example usage: build the dataset from the pre-computed 6-mer profiles.
# To parse the raw reads instead, use the fasta path below.
#input_file_path = './training_data/train_raw_reads.fasta'
input_file_path = './training_data/train_6mers.npy'

# samples_index=None loads every row of the file.
sampled_dataset = CS4220Dataset(
    data_file=input_file_path,
    label_df=label_df,
    samples_index=None,
)

In [19]:
def create_decoy_sample(dataset, label_df, num_sample=10000, patient_num=10, random_state=None):
    """
    Build a synthetic "patient" made only of decoy reads and save it under `test_data/`.

    Randomly picks `num_sample` rows whose `species_name` is 'decoy' (without replacement),
    takes the matching rows of `dataset.X`, and writes:
        - `test_data/patient{patient_num}_6mers.npy`: the sampled rows of `dataset.X`
        - `test_data/patient{patient_num}_labels.csv`: a "labels" header followed by "decoy"

    Args:
        - dataset: object whose `X` attribute supports indexing with a list of row positions
                   (e.g. a `CS4220Dataset` holding a numpy matrix). Row `i` of `dataset.X` is
                   assumed to correspond to row `i` of `label_df`.
        - label_df (`pd.DataFrame`): dataframe with a "species_name" column.
        - num_sample (`int`): number of decoy reads to draw.
        - patient_num (`int`): number used in the output file names.
        - random_state: forwarded to pandas' `sample` for reproducible draws; `None` keeps
                        the previous unseeded behaviour.

    Raises:
        ValueError: (from pandas) if `num_sample` exceeds the number of decoy reads.
    """
    import os

    # Work with row *positions*, not index labels: dataset.X is indexed positionally, so
    # index labels would select the wrong rows whenever label_df has a non-default index.
    decoy_positions = np.flatnonzero((label_df['species_name'] == 'decoy').to_numpy())
    sampled_decoy_idx = (
        pd.Series(decoy_positions)
        .sample(num_sample, random_state=random_state)
        .tolist()
    )
    sampled_data = dataset.X[sampled_decoy_idx]

    # Make sure the output directory exists before writing into it.
    os.makedirs('test_data', exist_ok=True)
    np.save(f'test_data/patient{patient_num}_6mers.npy', sampled_data)
    with open(f'test_data/patient{patient_num}_labels.csv', 'w') as f:
        f.write("labels\n")
        f.write("decoy\n")

In [20]:
# Write the decoy-only patient files using the default sample size and patient number.
create_decoy_sample(dataset=sampled_dataset, label_df=label_df)

In [26]:
# Reload the saved patient profile matrix and display its shape.
patient_id = 10
patient_profile_path = 'test_data/patient{}_6mers.npy'.format(patient_id)
with open(patient_profile_path, 'rb') as read_file:
    df_test = np.load(read_file)
df_test.shape

(10000, 2081)