# In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# In [7]:
class PeptideDataset:
    '''
    Peptide dataset for one human leukocyte antigen (HLA or MHC) allele.

    On construction, every file found in ``<root>/<hla_allele>`` except the held-out
    test file is read, each peptide is encoded as a padded binary vector, and the
    result is stored in ``self.data`` (features) and ``self.targets`` (labels).
    '''

    ROOT = 'dataset/MHC_I_el_allele_specific'  # root directory containing peptide binding data
    ALL_AA = ['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    NUM_AA = len(ALL_AA)  # number of amino acids (20)

    def __init__(self, hla_allele, root=None, encoding='default', max_aa_len=14, padding='end', test_set='c004', input_format='linear'):
        '''
        Initialize dataset class for each human leukocyte antigen (HLA or MHC) allele.

        Parameters
        ----------
        hla_allele: str
            Folder name of HLA allele of interest

        root: str, optional
            Location of dataset. Defaults to ``PeptideDataset.ROOT`` when None.

        encoding: str, optional
            Amino acid encoding style. Options: 'default', TBD

        max_aa_len: int, optional
            Number of amino acids every sequence is padded to. Longer sequences
            are rejected with ValueError.

        padding: str, optional
            Padding for amino acid sequence. Options: 'begin', 'end', 'after2', TBD

        test_set: str, optional
            Specify test set which should not be touched during model development.
            Options: 'c000', 'c001', 'c002', 'c003', 'c004'

        input_format: str
            Specify datum shape. Options: 'linear', '2d'
        '''

        self.hla_allele = hla_allele
        # Test the argument, not self.root: the attribute does not exist yet at this point.
        self.root = self.ROOT if root is None else root
        self.encoding = encoding
        self.max_aa_len = max_aa_len
        self.padding = padding
        self.test_set = test_set
        self.input_format = input_format

        self.data, self.targets = self.parse_csv()

    def parse_csv(self):
        '''
        Load all non-test files of the allele folder and encode their peptides.

        Each file is read with ``np.loadtxt`` as whitespace-separated text; column 0
        is taken as the peptide sequence and column 1 as its label.

        Returns
        ----------
        data: ndarray
            Encoded peptides, one entry per row of the input files
            (N x M for 'linear', N x max_aa_len x BITS for '2d')

        targets: ndarray
            Labels (N), kept as strings exactly as read from the files

        Raises
        ----------
        ValueError
            If the test set file is not present in the allele folder, or if no
            other file is left to load.
        '''

        allele_dir = os.path.join(self.root, self.hla_allele)

        # os.listdir yields bare names, so match the test set by name and build the
        # full path only when loading. Sorting keeps the row order reproducible.
        files = sorted(
            name for name in os.listdir(allele_dir)
            if os.path.isfile(os.path.join(allele_dir, name))
        )
        if self.test_set not in files:
            # Fail loudly instead of silently training on the held-out partition.
            raise ValueError(
                "Test set '{}' not found in '{}'".format(self.test_set, allele_dir)
            )
        files.remove(self.test_set)  # remove test set
        if not files:
            raise ValueError("No data files besides the test set in '{}'".format(allele_dir))

        # ndmin=2 keeps single-row files two-dimensional so they stack as rows.
        content = np.vstack([
            np.loadtxt(os.path.join(allele_dir, name), dtype=str, ndmin=2)
            for name in files
        ])

        data = np.array([self.format_seq(str(aa_seq)) for aa_seq in content[:, 0]])
        targets = content[:, 1]

        return data, targets

    def format_seq(self, seq):
        '''
        Converts an amino acid string sequence into a binary padded format.

        Parameters
        ----------
        seq: str
            Sequence of amino acids, small or big letters

        Returns
        ----------
        datum: ndarray
            Flat float vector (N) for 'linear', or 2D array ((N / BITS) x BITS)
            for '2d', encoding the amino acid sequence
        '''

        converted_seq, bits = self.encode_seq(seq)
        padded_converted_seq = self.pad(converted_seq, bits)

        # Map the ASCII characters '0'/'1' to the numbers 0/1 in one vectorised step.
        raw_bits = np.frombuffer(padded_converted_seq.encode('ascii'), dtype=np.uint8) - ord('0')
        feat_vect = raw_bits.astype(float)
        if self.input_format == '2d':
            # reshape returns a new array; it must be rebound to take effect.
            feat_vect = feat_vect.reshape((len(padded_converted_seq) // bits, bits))

        return feat_vect

    def pad(self, seq, bits=None):
        '''
        Pad binary string sequence to unify the sequence length.

        The result always holds ``max_aa_len * bits`` characters; the filler is '0'.

        Parameters
        ----------
        seq: string
            Binary sequence of amino acids

        bits: int, optional
            Number of characters encoding one amino acid. Defaults to NUM_AA.

        Returns
        ----------
        padded_seq: string
            String of padded amino acid sequence (N x 1)

        Raises
        ----------
        ValueError
            If the sequence is longer than ``max_aa_len`` amino acids or the
            padding option is unknown.

        Notes
        ----------
        'after2' is interpreted as "insert the filler after the second amino acid";
        NOTE(review): confirm this matches the intended meaning of the option.
        '''

        if bits is None:
            bits = self.NUM_AA

        missing = self.max_aa_len * bits - len(seq)
        if missing < 0:
            raise ValueError(
                'Sequence of {} bits exceeds max_aa_len={} ({} bits per amino acid)'.format(
                    len(seq), self.max_aa_len, bits)
            )
        filler = '0' * missing

        if self.padding == 'end':
            return seq + filler
        if self.padding == 'begin':
            return filler + seq
        if self.padding == 'after2':
            split = 2 * bits
            return seq[:split] + filler + seq[split:]
        raise ValueError("Unknown padding '{}'".format(self.padding))

    def encode_seq(self, _seq):
        '''
        Converts a string into an linear binary string of features.

        Parameters
        ----------
        _seq: str
            Sequence of amino acids, small or big letters

        Returns
        ----------
        bin_seq: str
            String of binaries encoding each amino acid (N x 1)
        bits: int
            Number of bits used to encode each amino acid (default: NUM_AA=20)

        Raises
        ----------
        ValueError
            If ``self.encoding`` is not a supported encoding.
        '''

        if self.encoding != 'default':
            raise ValueError("Unknown encoding '{}'".format(self.encoding))

        seq = _seq.upper()  # make amino acid sequence all CAPS
        encoded = [self._encode_default(aa) for aa in seq]
        # An empty sequence has no code to measure, so fall back to the default width.
        bits = len(encoded[0]) if encoded else self.NUM_AA

        return ''.join(encoded), bits

    def _encode_default(self, aa):
        '''
        Converts a string character into an encoded linear binary string with the default encoding

        The default encoding is one-hot: NUM_AA characters, all '0' except a '1' at
        the position of the amino acid in ALL_AA.

        Parameters
        ----------
        aa: str
            String of length 1 representing one aminoacid

        Returns
        ----------
        bin_aa: str
            Binary string encoding an amino acid

        Raises
        ----------
        ValueError
            If ``aa`` is not listed in ALL_AA.
        '''

        if aa not in self.ALL_AA:
            raise ValueError("Unknown amino acid '{}'".format(aa))

        # Strings are immutable, so build the code as a list and join it.
        bin_aa = ['0'] * self.NUM_AA
        bin_aa[self.ALL_AA.index(aa)] = '1'

        return ''.join(bin_aa)