In [1]:
import unittest
import pandas as pd
import os
import numpy as np

In [2]:
codes = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L',
         'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

In [3]:
class data_transformation():
    def __init__(self, path_data, path_mhc, allele_name = None, quant_data = True, encoding = "one-hot"):
        self.path_data = path_data
        self.path_mhc = path_mhc
        self.quant_data = quant_data
        self.encoding = encoding
        self.allele_name = allele_name
        
    def read_data(self):
        '''Reads dataset, mhc sequence data and joins them - returns the data as pandas dataframe'''
        cwd = os.getcwd()
        os.chdir('..')
        os.chdir('..')
        os.chdir('..')
        current_path = os.getcwd()
        data = pd.read_csv(current_path + self.path_data)
        mhc_data = pd.read_csv(current_path + self.path_mhc).loc[:, ["allele", "mhc_sequence"]]
        joined_data = pd.merge(data, mhc_data, left_on = "original_allele", right_on = "allele", how='inner').drop(["allele_y", "original_allele"], axis = 1)
        os.chdir(cwd) 
        return joined_data
    
    def filter_data(self):
        if not self.allele_name:
            data = self.read_data()
        else:
            data = self.read_data()
            data = data[data.allele_x == self.allele_name]
        if self.quant_data:
            data = data[data.measurement_type == "quantitative"].reset_index(drop = True)
        else:
            data = data.reset_index(drop = True)
        data["measurement_value"] = (1 - np.log(data["measurement_value"])) / np.log(50000)
        return data
    
    def encode_sequence(self, data, enc):
        # TODO better onehot encoding
        if self.encoding == "one-hot":
            if enc == "peptide":
                peptide_data = data.loc[:, ["peptide"]]
                tmp_data = data.append({"peptide": ''.join(codes)}, ignore_index = True)
                encode_data = tmp_data['peptide'].str.get_dummies(sep='').values[:-1]
            elif enc == "mhc":
                mhc_data = data.loc[:, ["mhc_sequence"]]
                tmp_data = data.append({"mhc_sequence": ''.join(codes)}, ignore_index = True)
                encode_data = tmp_data['mhc_sequence'].str.get_dummies(sep='').values[:-1]
        else:
            raise NotImplementedError
        return encode_data        
    
    def __getitem__(self):
        data = self.filter_data()
        peptide_seq = self.encode_sequence(data, enc = "peptide")
        mhc_seq = self.encode_sequence(data, enc = "mhc")
        target = np.array([data.measurement_value])
        return peptide_seq, mhc_seq, target

In [4]:
class data_class_test(unittest.TestCase):
    def setUp(self):
        self.class_data = data_transformation(path_data = "/data/data_curated_20180219/curated_training_data_no_mass_spec.csv",
                                              path_mhc = "/data/mhc_sequences_complete.csv",
                                              allele_name = "HLA-A*02:01",
                                              quant_data = True,
                                              encoding = "one-hot")
    
    def test_encode_sequence(self):
        self.assertEqual(self.class_data.encode_sequence(self.class_data.filter_data().loc[:0, :], enc = "peptide").shape,
                         len(self.class_data.filter_data().loc[:0, ["peptide"]].peptide[0]))
    
    def test_getitem(self):
        peptide_seq, mhc_seq, target = self.class_data.__getitem__()
        self.assertEqual(peptide_seq.shape, (31291, 20))
        
if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)

test_encode_sequence (__main__.data_class_test) ... ERROR
test_getitem (__main__.data_class_test) ... FAIL

ERROR: test_encode_sequence (__main__.data_class_test)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-4-86e69ebe9a15>", line 10, in test_encode_sequence
    self.assertEqual(t.encode_sequence(t.filter_data().loc[:0, :], enc = "peptide").shape,
NameError: name 't' is not defined

FAIL: test_getitem (__main__.data_class_test)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-4-86e69ebe9a15>", line 15, in test_getitem
    self.assertEqual(peptide_seq.shape, (31291, 20))
AssertionError: Tuples differ: (31290, 20) != (31291, 20)

First differing element 0:
31290
31291

- (31290, 20)
?      ^

+ (31291, 20)
?      ^


----------------------------------------------------------------------
Ran 2 tests in 7.615s

FAILED (failures=