In [1]:
import numpy as np
import pandas as pd

# List of valid amino acids-> 'X' represents ambiguous data).
AMINO_ACIDS = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 
               'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y', 'X']

# AMINO_ACID_MAP: A dictionary that maps each amino acid to an index.
AMINO_ACID_MAP = {char: idx for idx, char in enumerate(AMINO_ACIDS)} 

# ONE_HOT_X: A one-hot encoding for the letter 'X' 
ONE_HOT_X = np.eye(len(AMINO_ACIDS))[-1]  

In [2]:
class SequenceProcessor:
    """Class to handle sequence processing tasks."""

    def __init__(self, file_path: str):
        """Initializes with the path to the CSV file."""
        self.file_path = file_path
        self.data = None
        self.processed_data = None

    def load_data(self):
        """Loads CSV file and rename columns."""
        self.data = pd.read_csv(self.file_path, header=None)
        self.data.columns = ['identifier', 'sequence']

    @staticmethod
    def one_hot_encode_vectorized(sequence, max_length):
        """ 
            1- Converts the sequence into one-hot encoded vectors for each amino acid.
            2- Pads sequences that are shorter than the maximum sequence length with the one-hot encoding of 'X'.
        """
        indices = [AMINO_ACID_MAP.get(char, len(AMINO_ACIDS) - 1) for char in sequence]
        one_hot = np.eye(len(AMINO_ACIDS), dtype=int)[indices]
        padded = np.vstack((one_hot, np.tile(ONE_HOT_X, (max_length - len(one_hot), 1))))[:max_length]
        return padded.flatten()

    @staticmethod
    def letter_composition_vectorized(sequence):
        """
            1- Computes the frequency distribution of amino acids in the sequence.
            2- Normalizes the counts to obtain a vector that represents the composition of the sequence.
        """
        counts = np.zeros(len(AMINO_ACIDS), dtype=float)
        indices = [AMINO_ACID_MAP.get(char, len(AMINO_ACIDS) - 1) for char in sequence]
        np.add.at(counts, indices, 1)
        total = len(sequence)
        return counts / total if total > 0 else counts
    
    def process_sequences(self):
        """ 
            1- This function processes the entire DataFrame of sequences.
            2- For each sequence, it applies the one_hot_encode_vectorized and letter_composition_vectorized functions.
            3- The result is a new DataFrame containing the identifier, one-hot encoded vector, and letter composition vector for each sequence.
        """
        max_length = max(self.data['sequence'].apply(len))
        one_hot_vectors = self.data['sequence'].apply(lambda seq: self.one_hot_encode_vectorized(seq, max_length))
        composition_vectors = self.data['sequence'].apply(self.letter_composition_vectorized)
        self.processed_data = pd.DataFrame({
            'identifier': self.data['identifier'],
            'one_hot_vector': one_hot_vectors,
            'composition_vector': composition_vectors
        })
        return self.processed_data
    
    def process_csv(self):
        """
        Load a CSV file, rename columns, and process sequences using vectorized functions.
        """
        if self.data is None:
            self.load_data()
        
        return self.process_sequences()

In [3]:
# File path for the input CSV
csv_file_path = "uniprot_sequences.csv"

# Creating an instance of SequenceProcessor
processor = SequenceProcessor(csv_file_path)

# Loading and processing the data
processor.load_data()
processed_data = processor.process_sequences()

# Displaying the processed data
print(processed_data.head())


  identifier                                     one_hot_vector  \
0         ID  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1          0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2          1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3          2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4          3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                                  composition_vector  
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...  
1  [0.10465116279069768, 0.023255813953488372, 0....  
2  [0.11952191235059761, 0.0398406374501992, 0.05...  
3  [0.06258596973865199, 0.0171939477303989, 0.04...  
4  [0.0695742471443406, 0.017653167185877467, 0.0...  


In [4]:
import unittest
import pandas as pd

class TestSequenceProcessing(unittest.TestCase):
    
    def setUp(self):
        """Prepares sample data for testing."""
        self.data = pd.DataFrame({
            'identifier': ['protein_1', 'protein_2'],
            'sequence': ['ACDEFGHIKLM', 'ACDX']
        })
        self.pr=SequenceProcessor('uniprot_sequences.csv')
        self.processed_data = self.pr.process_csv() 
        
    def test_one_hot_encoding(self):
        """Verifies that the one-hot encoded vectors have the correct length (i.e., max_length * 21)."""
        encoded_1 = SequenceProcessor.one_hot_encode_vectorized('ACDEFGHIKLM', max_length=12)
        encoded_2 = SequenceProcessor.one_hot_encode_vectorized('ACDX', max_length=12)
        
        # Checks if length of one-hot encoded vector is correct
        self.assertEqual(len(encoded_1), 12 * 21)
        self.assertEqual(len(encoded_2), 12 * 21)
        
    def test_letter_composition(self):
        """Verifies that the sum of the composition vector is approximately 1.0 for each sequence, 
           which confirms the vector has been normalized. """
        composition_1 = SequenceProcessor.letter_composition_vectorized('ACDEFGHIKLM')
        composition_2 = SequenceProcessor.letter_composition_vectorized('ACDX')
        
        # Checks that the sum of composition vector is approximately 1.0
        self.assertAlmostEqual(np.sum(composition_1), 1.0, places=1)
        self.assertAlmostEqual(np.sum(composition_2), 1.0, places=1)
    
    def test_process_sequences(self):
        """Ensures that the processed DataFrame contains the expected columns: 
            'identifier', 'one_hot_vector', and 'composition_vector'."""
        processed_data=self.pr.process_sequences()
        
        # Checks if the processed data contains the correct columns
        self.assertIn('identifier', processed_data.columns)
        self.assertIn('one_hot_vector', processed_data.columns)
        self.assertIn('composition_vector', processed_data.columns)
        
    def test_process_csv(self):
        # Ensure the data contains the expected columns
        self.assertIn('identifier', self.processed_data.columns)
        self.assertIn('one_hot_vector', self.processed_data.columns)
        self.assertIn('composition_vector', self.processed_data.columns)
        
if __name__ == '__main__':
    # Run the test suite
    test_loader = unittest.TestLoader()
    test_suite = test_loader.loadTestsFromTestCase(TestSequenceProcessing)
    test_runner = unittest.TextTestRunner(verbosity=2)
    test_runner.run(test_suite)

test_letter_composition (__main__.TestSequenceProcessing.test_letter_composition)
Verifies that the sum of the composition vector is approximately 1.0 for each sequence, ... ok
test_one_hot_encoding (__main__.TestSequenceProcessing.test_one_hot_encoding)
Verifies that the one-hot encoded vectors have the correct length (i.e., max_length * 21). ... ok
test_process_csv (__main__.TestSequenceProcessing.test_process_csv)
Assumes the presence of a CSV file and ensures the processed result contains the expected columns. ... ok
test_process_sequences (__main__.TestSequenceProcessing.test_process_sequences)
Ensures that the processed DataFrame contains the expected columns: ... ok

----------------------------------------------------------------------
Ran 4 tests in 6.953s

OK
