In [1]:
#import modules
import pandas as pd
import numpy as np
import Bio
from Bio import Seq
from Bio import SeqIO
import torch
import matplotlib.pyplot as plt
import sys
from torch.utils.data import TensorDataset, DataLoader
import unittest
import python_files.functions as functions

In [2]:
#FASTA fixtures (relative to the working directory) used when running the tests locally,
#given in thermo, psychro, meso order
thermo_path_test, psychro_path_test, meso_path_test = (
    'data/thermo_test.fasta',
    'data/psychro_test.fasta',
    'data/meso_test.fasta',
)

In [26]:
# Define a class in which the tests will run
class TestDataLoader(unittest.TestCase):
    #define data file paths for test fasta files which contain 10 examples each
    thermo_path_test = 'data/thermo_test.fasta'
    psychro_path_test = 'data/psychro_test.fasta'
    meso_path_test = 'data/meso_test.fasta'
    #construct dataframes for each test fasta file
    df_thermo = functions.fasta_to_classified_df(thermo_path_test,'Thermophillic')
    df_psychro = functions.fasta_to_classified_df(psychro_path_test,'Psychrophillic')
    df_meso = functions.fasta_to_classified_df(meso_path_test,'Mesophillic')
    #construct combined dataframe
    df_combine = functions.combine_dfs([df_thermo,df_psychro,df_meso])
    #filter the combined dataframe
    df_filter = functions.filter_seqs(df_combine)
    #1hotencode the seqs and classes
    X_data = functions.seq1hot(df_filter['sequence'].tolist())
    y_data = functions.class1hot(df_filter['class'].tolist())
    #save and load tensors
    functions.save_tensor(X_data,'tensors/test_X_data.pt')
    functions.save_tensor(y_data,'tensors/test_y_data.pt')
    X_tensor = torch.load('tensors/test_X_data.pt')
    y_tensor = torch.load('tensors/test_y_data.pt')
    
    #define a test for the fasta_to_classified_df function
    def test_fasta_to_classified_df(self):
        #run the fasta_to_classified_df function on each test fasta file
        self.df_thermo = functions.fasta_to_classified_df(thermo_path_test,'Thermophillic')
        self.df_psychro = functions.fasta_to_classified_df(psychro_path_test,'Psychrophillic')
        self.df_meso = functions.fasta_to_classified_df(meso_path_test,'Mesophillic')
        #count the number of sequences in the thermo fasta file manually
        self.file = open(thermo_path_test,'r')
        self.count = 0
        for line in self.file:
            if line.startswith('>'):
                self.count = self.count+1
        #assert that the len of seqs in the fasta file (count) is equal to the length of the df (rows)
        self.assertEqual(self.df_thermo.shape[0], self.count)

    def test_combine_dfs(self):
        #construct combined dataframe
        self.df_list = [self.df_thermo, self.df_psychro, self.df_meso]
        #assert that the length of the combined df is equal to the sum of the lengths of each df
        self.assertEqual(len(functions.combine_dfs(self.df_list)), sum([len(l) for l in self.df_list]))

    def test_filter_seqs(self):
        self.assertEqual(geomean([1,1]), 1)
    '''   
    def test_seq1hot(self):
        self.assertEqual(geomean([1,1]), 1)
               
    def test_class1hot(self):
        self.assertEqual(geomean([3, 3, 3]), 3)
        
    def test_save_tensor(self):
        self.assertEqual(geomean([3, 3, 3]), 3)
    '''

10
10
10
Just made new tensor X_data
torch.Size([30, 500, 25])
<class 'torch.Tensor'>
Just made new tensor y_data
<class 'torch.Tensor'>
torch.Size([30, 3])


In [27]:
#collect every test method defined on TestDataLoader
suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestDataLoader)
#run them with one line of output per test; the returned TextTestResult is
#the last expression of the cell, so the notebook displays it
runner = unittest.TextTestRunner(verbosity=2)
runner.run(suite)

test_combine_dfs (__main__.TestDataLoader) ... ok
test_fasta_to_classified_df (__main__.TestDataLoader) ... 

30
10
10
10
10
10


ok

----------------------------------------------------------------------
Ran 2 tests in 0.035s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>

In [None]:
# NOTE(review): `y_array` is not defined in any cell visible above this one - confirm it exists before this cell runs
#the class array is expected to hold 100 examples (rows)
assert y_array.shape[0] == 100

#the class array is expected to have one column per class (thermo, meso, psychro)
assert y_array.shape[1] == 3

In [None]:
# NOTE(review): `X_array` is not defined in any cell visible above this one - confirm it exists before this cell runs
# every sequence is expected to span 500 amino-acid positions
assert X_array.shape[1] == 500

# each position is expected to be encoded over 25 amino-acid symbols
assert X_array.shape[2] == 25

In [None]:

#define the class name
class correlation_tests(unittest.TestCase):
    import numpy as np
    #define a test dataframe
    df = pd.DataFrame([[-1, 1.5, 1], [1, 2, -1], [.5, 0.25, .5]])
    
    #define tests for the pairwise_correlation function
    def test_pairwise_correlation(self):
        #assert that the correlation between identical rows is around 1
        self.assertTrue(math.isclose(df_utils.pairwise_correlation(self.df)[1,1], 1.))
        
        #assert that the correlation matrix has the same number of rows as the dataframe
        self.assertTrue(df_utils.pairwise_correlation(self.df).shape[0] == self.df.shape[0])
        
#     #define tests for the corr_rowi_rowj function
#     def test_corr_rowi_rowj(self):
#         #assert that row indices can't be strings
#         try:
#             self.assertFalse(df_utils.corr_rowi_rowj(self.df, 2, '1'))
#             self.assertTrue(False)
#         except: